Merge branch 'master' into py37

author: Ben <bstriner@gmail.com> 2018-08-26 15:44:13 -0400
committer: Ben <bstriner@gmail.com> 2018-08-26 15:44:13 -0400
commit: 88ec342544096d895908dac6b0bf6b44dadaaca1 (patch)
tree: cd570c40e6a40e37f14747d6fd387596ff324d01 /tensorflow
parent: 32d4ffeb95a344fde6a1b956a4a8d6792432bf15 (diff)
parent: 09792df012c22622324f085f46edde33006c7355 (diff)
2646 files changed, 70704 insertions, 41164 deletions
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index b807c8c2c6..9cc4c4567b 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -23,6 +23,10 @@ load(
     "//tensorflow/python/tools/api/generator:api_gen.bzl",
     "gen_api_init_files",  # @unused
 )
+load(
+    "//third_party/ngraph:build_defs.bzl",
+    "if_ngraph",
+)
 
 # Config setting used when building for products
 # which requires restricted licenses to be avoided.
@@ -411,6 +415,14 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+# This flag is set from the configure step when the user selects with nGraph option.
+# By default it should be false
+config_setting(
+    name = "with_ngraph_support",
+    values = {"define": "with_ngraph_support=true"},
+    visibility = ["//visibility:public"],
+)
+
 package_group(
     name = "internal",
     packages = [
@@ -424,12 +436,12 @@ package_group(
 
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
+    "if_mkl_ml",
 )
 
 filegroup(
     name = "intel_binary_blob",
-    data = if_mkl(
+    data = if_mkl_ml(
         [
             "//third_party/mkl:intel_binary_blob",
         ],
@@ -563,7 +575,7 @@ tf_cc_shared_object(
         "//tensorflow/cc:scope",
         "//tensorflow/cc/profiler",
         "//tensorflow/core:tensorflow",
-    ],
+    ] + if_ngraph(["@ngraph_tf//:ngraph_tf"]),
 )
 
 exports_files(
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 440e9f8dbd..21677512b6 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -28,7 +28,8 @@ contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
 
 from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-app.flags = flags  # pylint: disable=undefined-variable
+from tensorflow.python.platform import app  # pylint: disable=g-import-not-at-top
+app.flags = flags
 
 del absolute_import
 del division
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 19ccb6e71d..b8adf6c127 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -202,7 +202,8 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
   buf->len_ = len;
   if (dtype != TF_STRING && dtype != TF_RESOURCE &&
       tensorflow::DataTypeCanUseMemcpy(static_cast<DataType>(dtype)) &&
-      reinterpret_cast<intptr_t>(data) % EIGEN_MAX_ALIGN_BYTES != 0) {
+      reinterpret_cast<intptr_t>(data) % std::max(1, EIGEN_MAX_ALIGN_BYTES) !=
+          0) {
     // TF_STRING and TF_RESOURCE tensors have a different representation in
     // TF_Tensor than they do in tensorflow::Tensor. So a copy here is a waste
     // (any alignment requirements will be taken care of by TF_TensorToTensor
diff --git a/tensorflow/c/checkpoint_reader.h b/tensorflow/c/checkpoint_reader.h
index 4de1300a7f..91654c8d4f 100644
--- a/tensorflow/c/checkpoint_reader.h
+++ b/tensorflow/c/checkpoint_reader.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_C_CHECKPOINT_READER_H
-#define TENSORFLOW_C_CHECKPOINT_READER_H
+#ifndef TENSORFLOW_C_CHECKPOINT_READER_H_
+#define TENSORFLOW_C_CHECKPOINT_READER_H_
 
 #include <memory>
 #include <string>
@@ -79,4 +79,4 @@ class CheckpointReader {
 }  // namespace checkpoint
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_C_CHECKPOINT_READER_H
+#endif  // TENSORFLOW_C_CHECKPOINT_READER_H_
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 71d5f3613c..7126227cf5 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -1471,4 +1471,61 @@ void BM_ReadVariable(int iters) {
 }
 BENCHMARK(BM_ReadVariable);
 
+TEST(CAPI, StringAttributes) {
+  // Test that TFE_OpSetAttrString doesn't hold on to the value after it
+  // returns.
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  std::vector<int64_t> dims(4, 1);
+  TFE_Op* op = TFE_NewOp(ctx, "AvgPool", status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* tensor =
+      TF_AllocateTensor(TF_FLOAT, dims.data(), dims.size(), sizeof(float));
+  float tensor_data[] = {1};
+  memcpy(TF_TensorData(tensor), tensor_data, TF_TensorByteSize(tensor));
+  TFE_TensorHandle* tensor_handle = TFE_NewTensorHandle(tensor, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(op, tensor_handle, status);
+  TF_DeleteTensor(tensor);
+  TFE_DeleteTensorHandle(tensor_handle);
+
+  std::vector<int64_t> values(4, 1);
+  TFE_OpSetAttrIntList(op, "ksize", values.data(), values.size());
+  TFE_OpSetAttrIntList(op, "strides", values.data(), values.size());
+
+  const int BUFFER_SIZE = 10;
+  char buffer[BUFFER_SIZE];
+  std::strncpy(buffer, "VALID", BUFFER_SIZE);
+  TFE_OpSetAttrString(op, "padding", buffer, std::strlen(buffer));
+  // Overwriting value in "buffer", should be fine since TFE_Op
+  // shouldn't be holding on to it.
+  std::strncpy(buffer, "NHWC", BUFFER_SIZE);
+  TFE_OpSetAttrString(op, "data_format", buffer, std::strlen(buffer));
+
+  TFE_OpSetAttrType(op, "T", TF_FLOAT);
+
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_TensorHandle* retvals[1];
+  int num_retvals = 1;
+  TFE_Execute(op, &retvals[0], &num_retvals, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+
+  tensor = TFE_TensorHandleResolve(retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  EXPECT_EQ(4, TF_TensorByteSize(tensor));
+  TF_DeleteTensor(tensor);
+  TFE_DeleteTensorHandle(retvals[0]);
+
+  TFE_DeleteOp(op);
+
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
 }  // namespace
diff --git a/tensorflow/c/tf_status_helper.h b/tensorflow/c/tf_status_helper.h
index 86e687df20..7661a01de4 100644
--- a/tensorflow/c/tf_status_helper.h
+++ b/tensorflow/c/tf_status_helper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_C_TF_STATUS_HELPER_H
-#define TENSORFLOW_C_TF_STATUS_HELPER_H
+#ifndef TENSORFLOW_C_TF_STATUS_HELPER_H_
+#define TENSORFLOW_C_TF_STATUS_HELPER_H_
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -29,4 +29,4 @@ Status StatusFromTF_Status(const TF_Status* tf_status);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_C_TF_STATUS_HELPER_H
+#endif  // TENSORFLOW_C_TF_STATUS_HELPER_H_
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index dfdef88945..c20ea95a15 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -508,15 +508,6 @@ bool HasOptionalAttrs(
   return false;
 }
 
-const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
-  for (int i = 0; i < api_def.in_arg_size(); ++i) {
-    if (api_def.in_arg(i).name() == name) {
-      return &api_def.in_arg(i);
-    }
-  }
-  return nullptr;
-}
-
 struct OpInfo {
   // graph_op_def: The OpDef used by the runtime, has the names that
   //   must be used when calling NodeBuilder.
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index 5dcf00857d..1329b568ab 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -441,21 +441,20 @@ Status RealDivGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("RealDiv", RealDivGrad);
 
-Status UnsafeDivGrad(const Scope& scope, const Operation& op,
-                     const std::vector<Output>& grad_inputs,
-                     std::vector<Output>* grad_outputs) {
+Status DivNoNanGrad(const Scope& scope, const Operation& op,
+                    const std::vector<Output>& grad_inputs,
+                    std::vector<Output>* grad_outputs) {
   auto x_1 = ConjugateHelper(scope, op.input(0));
   auto x_2 = ConjugateHelper(scope, op.input(1));
   // y = x_1 / x_2
   // dy/dx_1 = 1/x_2
   // dy/dx_2 = -x_1/x_2^2
-  auto gx_1 = UnsafeDiv(scope, grad_inputs[0], x_2);
-  auto gx_2 =
-      Mul(scope, grad_inputs[0],
-          UnsafeDiv(scope, UnsafeDiv(scope, Neg(scope, x_1), x_2), x_2));
+  auto gx_1 = DivNoNan(scope, grad_inputs[0], x_2);
+  auto gx_2 = Mul(scope, grad_inputs[0],
+                  DivNoNan(scope, DivNoNan(scope, Neg(scope, x_1), x_2), x_2));
   return BinaryGradCommon(scope, op, grad_outputs, gx_1, gx_2);
 }
-REGISTER_GRADIENT_OP("UnsafeDiv", UnsafeDivGrad);
+REGISTER_GRADIENT_OP("DivNoNan", DivNoNanGrad);
 
 Status SquaredDifferenceGrad(const Scope& scope, const Operation& op,
                              const std::vector<Output>& grad_inputs,
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index 88aef1fab4..c16938322c 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -33,6 +33,7 @@ using ops::AddN;
 using ops::BatchMatMul;
 using ops::Const;
 using ops::Div;
+using ops::DivNoNan;
 using ops::MatMul;
 using ops::Max;
 using ops::Maximum;
@@ -48,7 +49,6 @@ using ops::SegmentSum;
 using ops::SquaredDifference;
 using ops::Sub;
 using ops::Sum;
-using ops::UnsafeDiv;
 
 // TODO(andydavis) Test gradient function against numeric gradients output.
 // TODO(andydavis) As more gradients are added move common test functions
@@ -854,13 +854,13 @@ TEST_F(NaryGradTest, RealDiv) {
   RunTest({x}, {x_shape}, {y}, {x_shape});
 }
 
-TEST_F(NaryGradTest, UnsafeDiv) {
+TEST_F(NaryGradTest, DivNoNan) {
   {
     TensorShape x_shape({3, 2, 5});
     const auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
     // Test x / (1 + |x|) rather than x_1 / x_2 to avoid triggering large
     // division errors in the numeric estimator used by the gradient checker.
-    const auto y = UnsafeDiv(
+    const auto y = DivNoNan(
         scope_, x, Add(scope_, Const<float>(scope_, 1), Abs(scope_, x)));
     RunTest({x}, {x_shape}, {y}, {x_shape});
   }
@@ -868,7 +868,7 @@ TEST_F(NaryGradTest, UnsafeDiv) {
     // Return 0 gradient (rather than NaN) for division by zero.
     const auto x = Placeholder(scope_, DT_FLOAT);
     const auto zero = Const<float>(scope_, 0.0);
-    const auto y = UnsafeDiv(scope_, x, zero);
+    const auto y = DivNoNan(scope_, x, zero);
 
     std::vector<Output> grad_outputs;
     TF_EXPECT_OK(AddSymbolicGradients(scope_, {y}, {x}, &grad_outputs));
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 3830416159..222e769881 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -182,7 +182,7 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir,
   variables_path_tensor.scalar<string>()() = variables_path;
 
   std::vector<std::pair<string, Tensor>> inputs = {
-      {variable_filename_const_op_name.ToString(), variables_path_tensor}};
+      {string(variable_filename_const_op_name), variables_path_tensor}};
 
   AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
 
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 1899a32e4d..59b961cdd9 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -32,7 +32,6 @@ cc_library(
     deps = [
         ":embedded_protocol_buffers",
         "//tensorflow/compiler/tf2xla",
-        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
         "//tensorflow/compiler/tf2xla:tf2xla_proto",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
@@ -55,6 +54,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -71,6 +72,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
         "@llvm//:support",  # fixdeps: keep
         "@llvm//:x86_code_gen",  # fixdeps: keep
     ],
@@ -99,6 +101,7 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -193,6 +196,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
         "@llvm//:support",
         "@llvm//:target",
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index 89fefdad54..e77a8fecf0 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -19,9 +19,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_replace.h"
 #include "tensorflow/compiler/aot/embedded_protocol_buffers.h"
 #include "tensorflow/compiler/tf2xla/cpu_function_runtime.h"
-#include "tensorflow/compiler/tf2xla/str_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h"
@@ -29,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -141,7 +142,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape,
   }
   rewrites->push_back({"{{I}}", strings::StrCat(i)});
   rewrites->push_back({"{{TYPE}}", type});
-  rewrites->push_back({"{{DIM_VARS}}", str_util::Join(dim_vars, ", ")});
+  rewrites->push_back({"{{DIM_VARS}}", absl::StrJoin(dim_vars, ", ")});
   rewrites->push_back({"{{DIM_SIZES}}", dim_sizes});
   rewrites->push_back({"{{INDICES}}", indices});
   return Status::OK();
@@ -157,8 +158,9 @@ Status AddRewritesForShape(int i, const xla::Shape& shape,
 // text-templating mechanism.
 string RewriteWithName(const string& name, string code,
                        const std::vector<std::pair<string, string>>& rewrites) {
-  str_util::ReplaceAllPairs(&code, rewrites);
-  return str_util::StringReplace(code, "{{NAME}}", name, /*replace_all=*/true);
+  absl::StrReplaceAll(rewrites, &code);
+  absl::StrReplaceAll({{"{{NAME}}", name}}, &code);
+  return code;
 }
 
 // Generate methods for args (inputs).
@@ -570,11 +572,11 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       {"{{ARG_BYTES_TOTAL}}", strings::StrCat(arg_bytes_total)},
       {"{{ARG_NAMES_CODE}}", arg_names_code},
       {"{{ARG_NUM}}", strings::StrCat(arg_index_table.size())},
-      {"{{ARG_INDEX_TABLE}}", str_util::Join(arg_index_table, ", ")},
+      {"{{ARG_INDEX_TABLE}}", absl::StrJoin(arg_index_table, ", ")},
       {"{{ASSIGN_PROFILE_COUNTERS_SIZE}}", assign_profile_counters_size},
       {"{{CLASS}}", opts.class_name},
       {"{{DECLS_FROM_OBJ_FILE}}",
-       str_util::Join(metadata_result.header_variable_decls, "\n")},
+       absl::StrJoin(metadata_result.header_variable_decls, "\n")},
       {"{{ENTRY}}", compile_result.entry_point},
       {"{{HLO_PROFILE_PRINTER_DATA_SHIM_EXPRESSION}}",
        metadata_result.hlo_profile_printer_data_access_shim},
@@ -594,8 +596,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       {"{{TEMP_BYTES_TOTAL}}", strings::StrCat(temp_bytes_total)},
       {"{{NUM_BUFFERS}}", strings::StrCat(buffer_infos.size())},
       {"{{BUFFER_INFOS_AS_STRING}}",
-       str_util::Join(buffer_infos_as_strings, ",\n")}};
-  str_util::ReplaceAllPairs(header, rewrites);
+       absl::StrJoin(buffer_infos_as_strings, ",\n")}};
+  absl::StrReplaceAll(rewrites, header);
   return Status::OK();
 }
 
@@ -617,7 +619,8 @@ Status GenerateMetadata(const CodegenOpts& opts,
 
   if (opts.gen_program_shape) {
     program_shape =
-        tensorflow::MakeUnique<xla::ProgramShape>(compile_result.program_shape);
+        absl::make_unique<xla::ProgramShape>(compile_result.program_shape);
+
     // The parameter names are currently meaningless, and redundant with the
     // rest of our metadata, so clear them out to avoid confusion and save
     // space.
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index 60d59ae996..e3a53edb73 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -18,13 +18,13 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/match.h"
 #include "llvm/Support/TargetSelect.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -34,9 +34,9 @@ namespace {
 
 using ::tensorflow::cpu_function_runtime::BufferInfo;
 
-void ExpectErrorContains(const Status& status, StringPiece str) {
+void ExpectErrorContains(const Status& status, absl::string_view str) {
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), str))
       << "expected error: " << status.error_message() << " to contain: " << str;
 }
 
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
index 4e27aafec7..1401aae758 100644
--- a/tensorflow/compiler/aot/embedded_protocol_buffers.cc
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_replace.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
@@ -26,8 +28,6 @@ limitations under the License.
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
-#include "tensorflow/compiler/tf2xla/str_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -65,14 +65,13 @@ static string CreateCPPShimExpression(StringPiece qualified_cpp_protobuf_name,
       "    return proto;\n"
       "  }()";
 
-  str_util::ReplaceAllPairs(
-      &code,
+  return absl::StrReplaceAll(
+      code,
       {
           {"{{ARRAY_SYMBOL}}", strings::StrCat(protobuf_array_symbol_name)},
           {"{{ARRAY_SIZE}}", strings::StrCat(protobuf_array_size)},
           {"{{PROTOBUF_NAME}}", strings::StrCat(qualified_cpp_protobuf_name)},
       });
-  return code;
 }
 
 static StatusOr<string> CodegenModule(llvm::TargetMachine* target_machine,
@@ -97,7 +96,7 @@ static StatusOr<std::unique_ptr<llvm::TargetMachine>>
 GetTargetMachineFromTriple(StringPiece target_triple) {
   std::string error;
   std::string normalized_triple =
-      llvm::Triple::normalize(AsStringRef(target_triple));
+      llvm::Triple::normalize(AsStringRef(absl::string_view(target_triple)));
   const llvm::Target* target =
       llvm::TargetRegistry::lookupTarget(normalized_triple, error);
   if (target == nullptr) {
@@ -105,7 +104,7 @@ GetTargetMachineFromTriple(StringPiece target_triple) {
                               error.c_str());
   }
 
-  return WrapUnique(target->createTargetMachine(
+  return absl::WrapUnique(target->createTargetMachine(
       normalized_triple, /*CPU=*/"",
       /*Features=*/"", llvm::TargetOptions(), llvm::None));
 }
@@ -118,7 +117,7 @@ StatusOr<EmbeddedProtocolBuffers> CreateEmbeddedProtocolBuffers(
 
   llvm::LLVMContext llvm_context;
   std::unique_ptr<llvm::Module> module_with_serialized_proto =
-      MakeUnique<llvm::Module>("embedded_data_module", llvm_context);
+      absl::make_unique<llvm::Module>("embedded_data_module", llvm_context);
 
   EmbeddedProtocolBuffers result;
 
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 0ecc3feeb6..7364d63b53 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -226,5 +226,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//third_party/eigen3",
+        "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 0c0c676ece..dd2b151098 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #define EIGEN_USE_CUSTOM_THREAD_POOL
 
+#include "absl/strings/str_split.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -546,7 +546,7 @@ TEST(TFCompileTest, HloProfiling) {
   VLOG(1) << "HLO profile string:\n" << hlo_profile_as_string;
 
   std::vector<string> hlo_profile_lines =
-      tensorflow::str_util::Split(hlo_profile_as_string, '\n');
+      absl::StrSplit(hlo_profile_as_string, '\n');
 
   auto header = HasSubstr("Execution profile for");
   auto total_cycles_profile_line = HasSubstr("[total]");
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index 839e1588b7..f3c44e9dda 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/match.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/aot/codegen.h"
 #include "tensorflow/compiler/aot/compile.h"
 #include "tensorflow/compiler/aot/flags.h"
@@ -34,7 +36,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -55,7 +56,7 @@ const char kUsageHeader[] =
     "\n";
 
 Status ReadProtoFile(const string& fname, protobuf::Message* proto) {
-  if (str_util::EndsWith(fname, ".pbtxt")) {
+  if (absl::EndsWith(fname, ".pbtxt")) {
     return ReadTextProto(Env::Default(), fname, proto);
   } else {
     return ReadBinaryProto(Env::Default(), fname, proto);
@@ -75,7 +76,7 @@ Status Main(const MainFlags& flags) {
     for (const tf2xla::Fetch& fetch : config.fetch()) {
       nodes.insert(fetch.id().node_name());
     }
-    std::cout << str_util::Join(nodes, ",");
+    std::cout << absl::StrJoin(nodes, ",");
     return Status::OK();
   }
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 9e6d7fa0b1..df81f3c23e 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -128,11 +128,11 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:shaped_buffer",
-        "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -191,6 +191,7 @@ cc_library(
         "//tensorflow/core/kernels/data:generator_dataset_op",
         "//tensorflow/core/kernels/data:iterator_ops",
         "//tensorflow/core/kernels/data:prefetch_dataset_op",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -235,6 +236,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -283,6 +285,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/memory",
     ],
     alwayslink = 1,
 )
@@ -303,6 +306,52 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_library(
+    name = "resource_operation_safety_analysis",
+    srcs = ["resource_operation_safety_analysis.cc"],
+    hdrs = ["resource_operation_safety_analysis.h"],
+    deps = [
+        "//tensorflow/compiler/jit/graphcycles",
+        "//tensorflow/compiler/tf2xla:resource_operation_table",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+tf_cc_test(
+    name = "resource_operation_safety_analysis_test",
+    srcs = ["resource_operation_safety_analysis_test.cc"],
+    deps = [
+        ":common",
+        ":resource_operation_safety_analysis",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:resource_variable_ops",
+        "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/compiler/jit/kernels:xla_launch_op",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -315,6 +364,7 @@ cc_library(
         "encapsulate_subgraphs_pass.cc",
         "mark_for_compilation_pass.cc",
         "mark_for_compilation_pass_test_helper.cc",
+        "partially_decluster_pass.cc",
     ],
     hdrs = [
         "build_xla_launch_ops_pass.h",
@@ -322,6 +372,7 @@ cc_library(
         "encapsulate_subgraphs_pass.h",
         "mark_for_compilation_pass.h",
         "mark_for_compilation_pass_test_helper.h",
+        "partially_decluster_pass.h",
     ],
     deps = [
         ":common",
@@ -329,11 +380,10 @@ cc_library(
         ":union_find",
         ":xla_cluster_util",
         "//tensorflow/compiler/jit/graphcycles",
-        "//tensorflow/compiler/jit/kernels:parallel_check_op",
         "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
-        "//tensorflow/compiler/jit/ops:parallel_check_op",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:dump_graph",
+        "//tensorflow/compiler/tf2xla:resource_operation_table",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
@@ -345,6 +395,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:bounds_check",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -353,11 +404,13 @@ cc_library(
     srcs = ["xla_cluster_util.cc"],
     hdrs = ["xla_cluster_util.h"],
     deps = [
+        ":resource_operation_safety_analysis",
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:bounds_check",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -420,14 +473,17 @@ tf_cc_test(
     srcs = [
         "encapsulate_subgraphs_pass_test.cc",
         "mark_for_compilation_pass_test.cc",
+        "partially_decluster_pass_test.cc",
     ],
     deps = [
         ":common",
         ":compilation_passes",
+        ":xla_cluster_util",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
+        "//tensorflow/cc:resource_variable_ops",
         "//tensorflow/cc:sendrecv_ops",
         "//tensorflow/compiler/jit/kernels:xla_launch_op",
         "//tensorflow/compiler/tf2xla:xla_compiler",
@@ -439,6 +495,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -519,6 +576,9 @@ tf_cuda_cc_test(
         ":common",
         ":xla_cluster_util",
         ":xla_fusion_optimizer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:resource_variable_ops",
         "//tensorflow/core:graph",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/compiler/jit/create_xla_launch_op.cc b/tensorflow/compiler/jit/create_xla_launch_op.cc
index a2e6285339..a7f8a5613c 100644
--- a/tensorflow/compiler/jit/create_xla_launch_op.cc
+++ b/tensorflow/compiler/jit/create_xla_launch_op.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/jit/create_xla_launch_op.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/kernels/xla_launch_op.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
@@ -125,7 +126,8 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr,
   const DataTypeVector& arg_types = (*fbody)->arg_types;
   std::vector<bool> const_args(arg_types.size());
   // If we can't analyze the const args. Bail out.
-  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(*((*fbody)->graph), &const_args));
+  TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
+      *((*fbody)->graph), &const_args, /*compile_time_const_nodes=*/nullptr));
 
   for (int i = 0; i < const_args.size(); ++i) {
     if (const_args[i]) {
@@ -223,8 +225,8 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def,
       &fbody->fdef.signature(), flr, fbody->arg_types, input_memory_types,
       fbody->ret_types, output_memory_types, flr->graph_def_version(), &s);
 
-  *kernel = MakeUnique<XlaLocalLaunchBase>(&construction, constant_arg_indices,
-                                           resource_arg_indices, function);
+  *kernel = absl::make_unique<XlaLocalLaunchBase>(
+      &construction, constant_arg_indices, resource_arg_indices, function);
   return s;
 }
 
diff --git a/tensorflow/compiler/jit/create_xla_launch_op_test.cc b/tensorflow/compiler/jit/create_xla_launch_op_test.cc
index b75ab486b8..7386660762 100644
--- a/tensorflow/compiler/jit/create_xla_launch_op_test.cc
+++ b/tensorflow/compiler/jit/create_xla_launch_op_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/create_xla_launch_op.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
@@ -65,11 +66,11 @@ class CreateXlaLaunchOpTest : public ::testing::Test {
     for (const auto& fdef : flib) {
       *(proto.add_function()) = fdef;
     }
-    lib_def_ =
-        MakeUnique<FunctionLibraryDefinition>(OpRegistry::Global(), proto);
+    lib_def_ = absl::make_unique<FunctionLibraryDefinition>(
+        OpRegistry::Global(), proto);
     OptimizerOptions opts;
-    device_mgr_ = MakeUnique<DeviceMgr>(devices_);
-    pflr_ = MakeUnique<ProcessFunctionLibraryRuntime>(
+    device_mgr_ = absl::make_unique<DeviceMgr>(devices_);
+    pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
         opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
     flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 62007e6115..fe28502f69 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/deadness_analysis.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis_internal.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/tensor_id.h"
@@ -21,18 +22,79 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 
 // ALGORITHM OVERVIEW
+// ==================
 //
 // We map every output produced by each node in the TensorFlow graph (including
 // control dependence) into an instance of the Predicate class.  Instances of
 // Predicate denote logical formulas and mapping a node `n` to a predicate
-// `pred` implies that `n` is executed whenver `pred` is true.  Then we can
-// deduce mismatching liveness in the inputs to node by comparing the predicate
-// those inputs are mapped to.
+// `pred` implies that `n` is live whenever `pred` is true.  Then we can deduce
+// mismatching liveness in the inputs to node by comparing the predicate those
+// inputs are mapped to.  The core logic of this pass resides in creating the
+// map from TensorFlow nodes to predicates.
 //
-// Loops are handled pessimistically -- we map Merge nodes with backedges to
-// uninterpreted symbols (the same kind we use to represent Switch and _Recv).
-// Predicate equality has to hold over all possible assignments to these
-// uninterpreted symbols.
+//
+// MAPPING NODES TO PREDICATES, MODULO CYCLES
+// ------------------------------------------
+//
+// If we ignore cycles for a moment, computing predicates is fairly
+// straightforward.  We traverse the graph in RPO, mapping each node to a
+// predicate based on the predicates its inputs are mapped to.  For instance a
+// Merge(X, Y) node will be mapped to OR(PredicateFor(X), PredicateFor(Y)).
+// Roughtly speaking, we abstract interpret each node on the "liveness" domain,
+// where values in the domain represent if a tensor carries a dead signal or
+// not.
+//
+//
+// DEALING WITH CYCLES
+// -------------------
+//
+// We map Merge nodes that are the target of a backedge to AndRecurrence
+// instances.  An AndRecurrence with start() = S and step() = X, printed as
+// {S,&,X}, *roughly* represents the infinite list of predicates
+// [S,S&X,S&X&X,S&X&X, ...].  So {S,&,X} can be used to represent the predicate
+// for Merge in a graph like:
+//
+//     Init
+//       |
+//       v
+//     Merge <-----------+
+//       |               |
+//       v               |
+//      Incr             |
+//       |               |
+//       v               |
+//      Switch <- Cond   |
+//       |               |
+//       v (oidx: 1)     |
+//       |               |
+//       +---------------+
+//
+// Where S is the predicate for Init and X is the predicate that asserts that
+// Cond is true.  {S,&,X} states that Merge is live on the first "iteration" iff
+// S is true, live on the second iteration iff "S&X" is true, live on the third
+// iteration iff "S&X&X" is true etc.  There is a subtlety here, S&X&X would
+// normally be equivalent to S&X which isn't quite what we want to represent.
+// Instead we want {S,&,X} to denote the infinite list [S, S&X,
+// S&X&X',S&X&X'&X'', ...] where X, X', X'' are predicates that assert Cond is
+// true on iteration 0, 1, 2 respectively.  This is made more precise in the
+// comment on the AndRecurrence class.
+//
+// The general algorithm that deals with cycles does two RPO (reverse post
+// order) passes over the graph.  On the first pass it assigns a symbolic
+// predicate to merge nodes with backedges.  On the second pass it tries to
+// pattern matche the predicates for the backedges of these merges and infer an
+// AndRecurrence for the merge.
+//
+// In other words, we do a pessimistic data flow analysis where the data-flow
+// lattice has two elements, Symbolic and NonSymbolic with Symbolic >
+// NonSymbolic. The lattice has height = 2 so two iterations are sufficient to
+// converge.  We don't do an optimistic data flow analysis to make pattern
+// matching easier: if we assigned the predicate of the initial value to the
+// merge during the first pass, on the second pass the backedge may see a
+// simplified value that would be difficult to pattern match.
+//
+// We still use symbolic predicates for merges for which we can't pattern match
+// on the backedge predicate.  This is conservatively correct.
 
 namespace tensorflow {
 
@@ -42,7 +104,7 @@ namespace {
 // above.
 class Predicate {
  public:
-  enum class Kind { kAnd, kOr, kNot, kSymbol };
+  enum class Kind { kAnd, kOr, kNot, kAndRecurrence, kSymbol };
 
   virtual string ToString() const = 0;
   int64 hash() const { return hash_; }
@@ -51,6 +113,12 @@ class Predicate {
   virtual Kind kind() const = 0;
   virtual ~Predicate() {}
 
+  // Invokes func on p and on all of its operands recursively.  Does not invoke
+  // `func` on the same Predicate instance twice.  Aborts the search if `func`
+  // returns true.
+  template <typename FunctionTy>
+  static void Visit(Predicate* p, const FunctionTy& func);
+
  protected:
   explicit Predicate(int64 hash) : hash_(hash) {}
 
@@ -86,7 +154,7 @@ class AndPredicate : public Predicate {
                    std::back_inserter(operands_str),
                    [](Predicate* pred) { return pred->ToString(); });
 
-    return strings::StrCat("(", str_util::Join(operands_str, " & "), ")");
+    return strings::StrCat("(", absl::StrJoin(operands_str, " & "), ")");
   }
 
   Kind kind() const override { return Kind::kAnd; }
@@ -115,7 +183,7 @@ class OrPredicate : public Predicate {
                    std::back_inserter(operands_str),
                    [](Predicate* pred) { return pred->ToString(); });
 
-    return strings::StrCat("(", str_util::Join(operands_str, " | "), ")");
+    return strings::StrCat("(", absl::StrJoin(operands_str, " | "), ")");
   }
 
   Kind kind() const override { return Kind::kOr; }
@@ -145,10 +213,44 @@ class NotPredicate : public Predicate {
   std::array<Predicate*, 1> operands_;
 };
 
+// Represents an infinite list of predicates.
+//
+// An AndRecurrence with start = S and step = X is printed as {S,&,X} and stands
+// for the list of predicates:
+//
+//   S, S & GenSym(X,1), S & GenSym(X,1) & GenSym(X,2), ...
+//
+// where GenSym(<expression>, <id>) renames every SymbolPredicate in
+// <expression> by appending <id> to it, in effect creating a "fresh" symbol.
+// This means {P,&,Q} is not equal to "P on the first iteration; P&Q on
+// subsequent iterations".
+class AndRecurrencePredicate : public Predicate {
+ public:
+  explicit AndRecurrencePredicate(Predicate* start, Predicate* step)
+      : Predicate(HashPredicateSequence(Kind::kAndRecurrence, {start, step})),
+        operands_({start, step}) {}
+
+  Predicate* start() const { return operands_[0]; }
+  Predicate* step() const { return operands_[1]; }
+
+  string ToString() const override {
+    return strings::StrCat("{", start()->ToString(), ",&,", step()->ToString(),
+                           "}");
+  }
+
+  Kind kind() const override { return Kind::kAndRecurrence; }
+
+  gtl::ArraySlice<Predicate*> GetOperands() const override { return operands_; }
+
+ private:
+  std::array<Predicate*, 2> operands_;
+};
+
 // Represents an uninterpreted symbol in a logical predicate.
 //
 // Two predicates are equivalent iff they are equivalent for all assignments to
-// the symbols contained in them.
+// the symbols contained in them, i.e. predicates are forall qualified over
+// symbols.
 class SymbolPredicate : public Predicate {
  public:
   explicit SymbolPredicate(TensorId tensor_id, bool must_be_true)
@@ -184,6 +286,29 @@ class SymbolPredicate : public Predicate {
   }
 };
 
+template <typename FunctionTy>
+/*static*/ void Predicate::Visit(Predicate* p, const FunctionTy& func) {
+  gtl::FlatSet<Predicate*> visited;
+  std::vector<Predicate*> stack;
+
+  stack.push_back(p);
+  visited.insert(p);
+
+  while (!stack.empty()) {
+    Predicate* current = stack.back();
+    stack.pop_back();
+    bool done = func(current);
+    if (done) {
+      return;
+    }
+    for (Predicate* op : current->GetOperands()) {
+      if (visited.insert(op).second) {
+        stack.push_back(op);
+      }
+    }
+  }
+}
+
 // Creates and owns Predicate instances.  Simplifies predicates as it creates
 // them.
 class PredicateFactory {
@@ -209,6 +334,21 @@ class PredicateFactory {
     }
   }
 
+  Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step) {
+    auto it = interned_and_rec_instances_.find({start, step});
+    if (it != interned_and_rec_instances_.end()) {
+      return it->second.get();
+    }
+
+    std::unique_ptr<Predicate> new_pred =
+        Make<AndRecurrencePredicate>(start, step);
+    Predicate* new_pred_ptr = new_pred.get();
+    CHECK(interned_and_rec_instances_
+              .emplace(SignatureForAndRec(start, step), std::move(new_pred))
+              .second);
+    return new_pred_ptr;
+  }
+
   Predicate* MakeSymbolPredicate(TensorId tensor_id, bool must_be_true) {
     SignatureForSymbol signature = {tensor_id, must_be_true};
     auto it = interned_symbol_instances_.find(signature);
@@ -249,6 +389,7 @@ class PredicateFactory {
   using SignatureForAndOr =
       std::pair<Predicate::Kind, gtl::ArraySlice<Predicate*>>;
   using SignatureForNot = Predicate*;
+  using SignatureForAndRec = std::pair<Predicate*, Predicate*>;
   using SignatureForSymbol = std::pair<SafeTensorId, bool>;
 
   struct HashSignatureForAndOr {
@@ -273,6 +414,8 @@ class PredicateFactory {
       interned_and_or_instances_;
   gtl::FlatMap<SignatureForNot, std::unique_ptr<Predicate>>
       interned_not_instances_;
+  gtl::FlatMap<SignatureForAndRec, std::unique_ptr<Predicate>>
+      interned_and_rec_instances_;
   gtl::FlatMap<SignatureForSymbol, std::unique_ptr<Predicate>,
                HashSignatureForSymbol>
       interned_symbol_instances_;
@@ -353,6 +496,7 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
       : graph_(*graph), vlog_(VLOG_IS_ON(2)) {}
 
   Status Populate();
+  Status PopulateWithReversePostOrder(gtl::ArraySlice<Node*> rpo);
   bool HasInputsWithMismatchingDeadness(const Node& node) override;
   void Print() const override;
   gtl::FlatMap<TensorId, string, TensorId::Hasher> PredicateMapAsString() const;
@@ -361,20 +505,40 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
   enum class EdgeKind { kDataAndControl, kDataOnly, kControlOnly };
 
   std::vector<Predicate*> GetIncomingPreds(Node* n, EdgeKind edge_kind);
-  void SetPred(Node* n, int output_idx, Predicate* pred) {
-    CHECK(
-        predicate_map_.insert({TensorId(n->name(), output_idx), pred}).second);
+
+  // Sets the predicate for output `output_idx` of `n` to `pred`.  Sets the i'th
+  // bit of `should_revisit` if `pred` is different from the current predicate
+  // for the `output_idx` output of `n`.
+  void SetPredicate(Node* n, int output_idx, Predicate* pred,
+                    std::vector<bool>* should_revisit) {
+    auto insert_result =
+        predicate_map_.insert({TensorId(n->name(), output_idx), pred});
+    if (!insert_result.second && insert_result.first->second != pred) {
+      VLOG(4) << "For " << n->name() << ":" << output_idx << " from "
+              << insert_result.first->second->ToString() << " "
+              << insert_result.first->second << " to " << pred->ToString()
+              << " " << pred;
+      insert_result.first->second = pred;
+      if (should_revisit != nullptr) {
+        for (const Edge* e : n->out_edges()) {
+          (*should_revisit)[e->dst()->id()] = true;
+        }
+      }
+    }
   }
-  void SetPred(Node* n, gtl::ArraySlice<int> output_idxs, Predicate* pred) {
+
+  void SetPredicate(Node* n, gtl::ArraySlice<int> output_idxs, Predicate* pred,
+                    std::vector<bool>* should_revisit) {
     for (int output_idx : output_idxs) {
-      SetPred(n, output_idx, pred);
+      SetPredicate(n, output_idx, pred, should_revisit);
     }
   }
 
-  Status HandleSwitch(Node* n);
-  Status HandleMerge(Node* n);
-  Status HandleRecv(Node* n);
-  Status HandleGeneric(Node* n);
+  Status HandleSwitch(Node* n, std::vector<bool>* should_revisit);
+  Status HandleMerge(Node* n, std::vector<bool>* should_revisit);
+  Status HandleRecv(Node* n, std::vector<bool>* should_revisit);
+  Status HandleGeneric(Node* n, std::vector<bool>* should_revisit);
+  Status HandleNode(Node* n, std::vector<bool>* should_revisit);
 
   const Graph& graph_;
   gtl::FlatMap<TensorId, Predicate*, TensorId::Hasher> predicate_map_;
@@ -397,14 +561,15 @@ std::vector<Predicate*> DeadnessAnalysisImpl::GetIncomingPreds(
 
     if (should_process) {
       auto it = predicate_map_.find(InputEdgeToTensorId(in_edge));
-      CHECK(it != predicate_map_.end());
+      CHECK(it != predicate_map_.end()) << n->name();
       incoming_preds.push_back(it->second);
     }
   }
   return incoming_preds;
 }
 
-Status DeadnessAnalysisImpl::HandleSwitch(Node* n) {
+Status DeadnessAnalysisImpl::HandleSwitch(Node* n,
+                                          std::vector<bool>* should_revisit) {
   std::vector<Predicate*> input_preds =
       GetIncomingPreds(n, EdgeKind::kDataAndControl);
   const Edge* pred_edge;
@@ -416,84 +581,252 @@ Status DeadnessAnalysisImpl::HandleSwitch(Node* n) {
 
   // Output 0 is alive iff all inputs are alive and the condition is false.
   input_preds.push_back(false_switch);
-  SetPred(n, 0, predicate_factory_.MakeAndPredicate(input_preds));
+  SetPredicate(n, 0, predicate_factory_.MakeAndPredicate(input_preds),
+               should_revisit);
   input_preds.pop_back();
 
   // Output 1 is alive iff all inputs are alive and the condition is true.
   input_preds.push_back(true_switch);
-  SetPred(n, 1, predicate_factory_.MakeAndPredicate(input_preds));
+  SetPredicate(n, 1, predicate_factory_.MakeAndPredicate(input_preds),
+               should_revisit);
   input_preds.pop_back();
 
-  // Control is alive iff any inputs are alive.
-  SetPred(n, Graph::kControlSlot,
-          predicate_factory_.MakeAndPredicate(input_preds));
+  // Control is alive iff all inputs are alive.
+  SetPredicate(n, Graph::kControlSlot,
+               predicate_factory_.MakeAndPredicate(input_preds),
+               should_revisit);
 
   return Status::OK();
 }
 
-Status DeadnessAnalysisImpl::HandleMerge(Node* n) {
+namespace {
+const Edge* FindUniqueBackedge(Node* merge) {
+  CHECK(merge->IsMerge());
+  const Edge* result = nullptr;
+  for (const Edge* e : merge->in_edges()) {
+    if (e->src()->IsNextIteration()) {
+      CHECK_EQ(result, nullptr)
+          << "Multiple backedges to " << merge->DebugString();
+      result = e;
+    }
+  }
+  return result;
+}
+
+// If `backedge_predicate` is equal to `symbolic_predicate` & Step where Step
+// does not contain `symbolic_predicate` as an inner (not top-level) operand
+// then returns `Step`.  Otherwise returns nullptr.
+Predicate* DeduceStepPredicate(PredicateFactory* predicate_factory,
+                               Predicate* symbolic_predicate,
+                               Predicate* backedge_predicate) {
+  CHECK(dynamic_cast<SymbolPredicate*>(symbolic_predicate));
+  if (backedge_predicate->kind() != Predicate::Kind::kAnd) {
+    return nullptr;
+  }
+
+  std::vector<Predicate*> and_ops;
+  gtl::ArraySlice<Predicate*> recurrent_pred_ops =
+      backedge_predicate->GetOperands();
+
+  bool found_sym = false;
+  for (Predicate* and_op : recurrent_pred_ops) {
+    // We want the `symbol_predicate` to be the one of the operands of
+    // `backedge_predicate`,
+    if (and_op == symbolic_predicate) {
+      found_sym = true;
+      continue;
+    }
+
+    // but we don't want it to be present anywhere else in the formula.  E.g. we
+    // don't want the recurrent predicate to be
+    // symbol_predicate&(X|symbol_predicate).
+    bool found_sym_as_inner_operand = false;
+    auto has_self_as_inner_operand = [&](Predicate* p) {
+      if (p == symbolic_predicate) {
+        found_sym_as_inner_operand = true;
+        return true;  // Stop searching, we're done.
+      }
+
+      // Continue searching.
+      return false;
+    };
+
+    Predicate::Visit(and_op, has_self_as_inner_operand);
+    if (found_sym_as_inner_operand) {
+      return nullptr;
+    }
+    and_ops.push_back(and_op);
+  }
+
+  return found_sym ? predicate_factory->MakeAndPredicate(and_ops) : nullptr;
+}
+}  // namespace
+
+Status DeadnessAnalysisImpl::HandleMerge(Node* n,
+                                         std::vector<bool>* should_revisit) {
   // Merge ignores deadness of its control inputs.  A merge that isn't the
-  // target of a backedge has is alive iff any of its data inputs are.  We treat
-  // the liveness of a merge that is the target of a backedge symbolically.
+  // target of a backedge has is alive iff any of its data inputs are.  The
+  // liveness of a merge that is the target of a backedge can sometimes be
+  // represented using a AndRecurrencePredicate.  If neither apply, we represent
+  // the liveness of the merge symbolically.
+
+  bool has_unvisited_backedge = false;
+  for (const Edge* e : n->in_edges()) {
+    if (!e->IsControlEdge() && e->src()->IsNextIteration()) {
+      has_unvisited_backedge |= !predicate_map_.count(InputEdgeToTensorId(e));
+    }
+  }
+
+  auto it = predicate_map_.find(TensorId(n->name(), 0));
+  if (it == predicate_map_.end()) {
+    if (has_unvisited_backedge) {
+      // We're visiting this merge for the first time and it has an unvisited
+      // backedge.
+      Predicate* input_data_pred = predicate_factory_.MakeSymbolPredicate(
+          TensorId(n->name(), 0), /*must_be_true=*/false);
+      SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred,
+                   should_revisit);
+      return Status::OK();
+    }
 
-  bool has_backedge = std::any_of(
-      n->in_edges().begin(), n->in_edges().end(), [](const Edge* e) {
-        return !e->IsControlEdge() && e->src()->IsNextIteration();
-      });
+    // We're visiting this merge for the first time and it is a acyclic merge.
+    Predicate* input_data_pred = predicate_factory_.MakeOrPredicate(
+        GetIncomingPreds(n, EdgeKind::kDataOnly));
+    SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred,
+                 should_revisit);
+    return Status::OK();
+  }
 
-  Predicate* input_data_pred =
-      has_backedge ? predicate_factory_.MakeSymbolPredicate(
-                         TensorId(n->name(), 0), /*must_be_true=*/false)
-                   : predicate_factory_.MakeOrPredicate(
-                         GetIncomingPreds(n, EdgeKind::kDataOnly));
+  if (it->second->kind() == Predicate::Kind::kSymbol) {
+    // Last time we visited this merge we only got a symbolic predicate because
+    // of an unvisited backedge.  Try to pattern match the predicate expression
+    // for that backedge (which should be visited now) into an and recurrence
+    // for the merge node.
+    if (const Edge* unique_backedge = FindUniqueBackedge(n)) {
+      if (Predicate* step = DeduceStepPredicate(
+              &predicate_factory_, it->second,
+              predicate_map_[InputEdgeToTensorId(unique_backedge)])) {
+        // If the predicate for the backedge is "Sym&X" where "Sym" is the
+        // predicate for the merge then the merge has predicate {S,&,X} where S
+        // is the predicate for the merge ignoring the backedge.
+        std::vector<Predicate*> non_recurrent_inputs;
+        for (const Edge* e : n->in_edges()) {
+          if (e != unique_backedge) {
+            non_recurrent_inputs.push_back(
+                predicate_map_[InputEdgeToTensorId(e)]);
+          }
+        }
 
-  SetPred(n, {0, 1, Graph::kControlSlot}, input_data_pred);
+        Predicate* start =
+            predicate_factory_.MakeOrPredicate(non_recurrent_inputs);
+        Predicate* and_rec =
+            predicate_factory_.MakeAndRecurrencePredicate(start, step);
+        SetPredicate(n, {0, 1, Graph::kControlSlot}, and_rec, should_revisit);
+        return Status::OK();
+      }
+    }
+  }
   return Status::OK();
 }
 
-Status DeadnessAnalysisImpl::HandleRecv(Node* n) {
+Status DeadnessAnalysisImpl::HandleRecv(Node* n,
+                                        std::vector<bool>* should_revisit) {
   // In addition to being alive or dead based on the inputs, a _Recv can also
   // acquire a dead signal from a _Send.
   std::vector<Predicate*> input_preds =
       GetIncomingPreds(n, EdgeKind::kDataAndControl);
   input_preds.push_back(predicate_factory_.MakeSymbolPredicate(
       TensorId(n->name(), 0), /*must_be_true=*/false));
-  SetPred(n, {0, Graph::kControlSlot},
-          predicate_factory_.MakeAndPredicate(input_preds));
+  SetPredicate(n, {0, Graph::kControlSlot},
+               predicate_factory_.MakeAndPredicate(input_preds),
+               should_revisit);
   return Status::OK();
 }
 
-Status DeadnessAnalysisImpl::HandleGeneric(Node* n) {
+Status DeadnessAnalysisImpl::HandleGeneric(Node* n,
+                                           std::vector<bool>* should_revisit) {
   // Generally nodes are alive iff all their inputs are alive.
   Predicate* pred = predicate_factory_.MakeAndPredicate(
       GetIncomingPreds(n, EdgeKind::kDataAndControl));
   for (int output_idx = 0; output_idx < n->num_outputs(); output_idx++) {
-    SetPred(n, output_idx, pred);
+    SetPredicate(n, output_idx, pred, should_revisit);
+  }
+  SetPredicate(n, Graph::kControlSlot, pred, should_revisit);
+  return Status::OK();
+}
+
+Status DeadnessAnalysisImpl::HandleNode(Node* n,
+                                        std::vector<bool>* should_revisit) {
+  if (n->IsSwitch()) {
+    TF_RETURN_IF_ERROR(HandleSwitch(n, should_revisit));
+  } else if (n->IsMerge()) {
+    TF_RETURN_IF_ERROR(HandleMerge(n, should_revisit));
+  } else if (n->IsControlTrigger()) {
+    SetPredicate(n, Graph::kControlSlot, predicate_factory_.MakeTrue(),
+                 nullptr);
+  } else if (n->IsRecv() || n->IsHostRecv()) {
+    TF_RETURN_IF_ERROR(HandleRecv(n, should_revisit));
+  } else if (n->IsNextIteration()) {
+    TF_RETURN_IF_ERROR(HandleGeneric(n, should_revisit));
+  } else {
+    TF_RETURN_IF_ERROR(HandleGeneric(n, should_revisit));
   }
-  SetPred(n, Graph::kControlSlot, pred);
   return Status::OK();
 }
 
 Status DeadnessAnalysisImpl::Populate() {
   std::vector<Node*> rpo;
-  GetReversePostOrder(graph_, &rpo, /*stable_comparator=*/{},
+  GetReversePostOrder(graph_, &rpo, /*stable_comparator=*/NodeComparatorName(),
                       /*edge_filter=*/[](const Edge& edge) {
                         return !edge.src()->IsNextIteration();
                       });
+  return PopulateWithReversePostOrder(rpo);
+}
 
+Status DeadnessAnalysisImpl::PopulateWithReversePostOrder(
+    gtl::ArraySlice<Node*> rpo) {
   // This an abstract interpretation over the deadness propagation semantics of
   // the graph executor.
+  //
+  // We iterate over the graph twice, each time in RPO.  On the first iteration
+  // merge nodes with backedges are mapped to symbolic predicates.  On the
+  // second iteration we use the predicates assigned to the backedges in the
+  // previous iteration to infer a more precise predicate for the backedge merge
+  // nodes and all the nodes that transitively use it.
+  //
+  // We don't track the output indices for should_revisit.  Instead, putting a
+  // node in `should_revisit` denotes that the deadness flowing out from any
+  // output from said node may have changed.  This is fine; only switches
+  // propagate different deadness along different output edges, and since the
+  // delta is solely due to the input *values* (and not input deadness), the
+  // delta should not change in the second iteration.
+  std::vector<bool> should_revisit;
+  should_revisit.resize(graph_.num_node_ids());
   for (Node* n : rpo) {
-    if (n->IsSwitch()) {
-      TF_RETURN_IF_ERROR(HandleSwitch(n));
-    } else if (n->IsMerge()) {
-      TF_RETURN_IF_ERROR(HandleMerge(n));
-    } else if (n->IsControlTrigger()) {
-      SetPred(n, Graph::kControlSlot, predicate_factory_.MakeTrue());
-    } else if (n->IsRecv() || n->IsHostRecv()) {
-      TF_RETURN_IF_ERROR(HandleRecv(n));
-    } else {
-      TF_RETURN_IF_ERROR(HandleGeneric(n));
+    VLOG(4) << "Visiting " << n->name();
+    TF_RETURN_IF_ERROR(HandleNode(n, /*should_revisit=*/nullptr));
+    if (n->IsNextIteration()) {
+      // If this is a backedge for a merge node then remember to reprocess the
+      // merge the next time we run.
+      for (const Edge* e : n->out_edges()) {
+        if (e->dst()->IsMerge()) {
+          should_revisit[e->dst()->id()] = true;
+        }
+      }
+    }
+  }
+
+  for (Node* n : rpo) {
+    // The nodes added to should_revisit in the previous loop need to be
+    // revisited now.  Reprocesing these initial nodes may add *their* consumers
+    // to should_revisit, and these newly added nodes will also be processed by
+    // this very same loop.  Since we're traversing the graph in reverse post
+    // order (producers before consumers) and HandleNode(n) can only ever add
+    // n's consumers to should_revisit, we won't "miss" an addition to
+    // should_revisit.
+    if (should_revisit[n->id()]) {
+      VLOG(4) << "Revisiting " << n->name();
+      TF_RETURN_IF_ERROR(HandleNode(n, &should_revisit));
     }
   }
 
@@ -589,6 +922,15 @@ Status ComputePredicates(const Graph& graph,
   *out_predicate_map = impl.PredicateMapAsString();
   return Status::OK();
 }
+
+Status ComputePredicates(const Graph& graph,
+                         gtl::ArraySlice<Node*> reverse_post_order,
+                         PredicateMapTy* out_predicate_map) {
+  DeadnessAnalysisImpl impl(&graph);
+  TF_RETURN_IF_ERROR(impl.PopulateWithReversePostOrder(reverse_post_order));
+  *out_predicate_map = impl.PredicateMapAsString();
+  return Status::OK();
+}
 }  // namespace deadness_analysis_internal
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/deadness_analysis_internal.h b/tensorflow/compiler/jit/deadness_analysis_internal.h
index cdef405110..401d6e406a 100644
--- a/tensorflow/compiler/jit/deadness_analysis_internal.h
+++ b/tensorflow/compiler/jit/deadness_analysis_internal.h
@@ -26,6 +26,14 @@ namespace deadness_analysis_internal {
 // testing purposes only.
 using PredicateMapTy = gtl::FlatMap<TensorId, string, TensorId::Hasher>;
 Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map);
+
+// Returns a map describing the predicate each Tensor was mapped to.  For
+// testing purposes only.  Makes deadness analysis visit the graph in the order
+// specified in `reverse_post_order` which must be a valid RPO for the graph
+// minus NextIteration->Merge edges.
+Status ComputePredicates(const Graph& graph,
+                         gtl::ArraySlice<Node*> reverse_post_order,
+                         PredicateMapTy* out_predicate_map);
 }  // namespace deadness_analysis_internal
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 6881095b51..28a56044d5 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -32,12 +32,14 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace {
 
+using deadness_analysis_internal::ComputePredicates;
+using deadness_analysis_internal::PredicateMapTy;
+
 Status AnalyzeDeadness(Graph* graph,
                        std::unique_ptr<DeadnessAnalysis>* result) {
   FixupSourceAndSinkEdges(graph);
@@ -51,13 +53,73 @@ ops::Switch CreateSwitch(const Scope& root, const string& prefix) {
   return ops::Switch(root.WithOpName(prefix + "/switch"), value, predicate);
 }
 
-Output CreateInductionVariable(const Scope& root, const string& prefix,
-                               const string& frame_name, int32 init) {
-  Output initial_value = ops::Const(root.WithOpName(prefix + "/init"), init);
+TensorId ControlOutputFor(const Output& o) {
+  return {o.node()->name(), Graph::kControlSlot};
+}
+
+void VLogGraphIfAsked(const Graph& graph) {
+  if (VLOG_IS_ON(3)) {
+    GraphDef graph_def;
+    graph.ToGraphDef(&graph_def);
+    string serialized;
+    ::tensorflow::protobuf::TextFormat::PrintToString(graph_def, &serialized);
+    LOG(INFO) << serialized;
+  }
+}
+
+struct InductionVarInfo {
+  Output induction_var;
+  Output loop_cond;
+};
+
+// Creates an induction variable with the following structure (simplified for
+// brevity):
+//
+//            +---------------+
+//            | initial_value |
+//            +---------------+
+//              |
+//              |
+//              v
+//            +---------------+
+//            |     Enter     |
+//            +---------------+
+//              |
+//              |
+//              v
+//            +---------------+
+//         +> |     Merge     | -+
+//         |  +---------------+  |
+//         |    |                |
+//         |    |                |
+//         |    v                |
+//         |  +---------------+  |
+//         |  |  LessThan10   |  |
+//         |  +---------------+  |
+//         |    |                |
+//         |    |                |
+//         |    v                |
+//         |  +---------------+  |
+//    +----+- |    Switch     | <+
+//    |    |  +---------------+
+//    |    |    |
+//    |    |    |
+//    |    |    v
+//    |    |  +---------------+
+//    |    +- |    AddOne     |
+//    |       +---------------+
+//    |       +---------------+
+//    +-----> |     Exit      |
+//            +---------------+
+InductionVarInfo CreateInductionVariable(const Scope& root,
+                                         const string& prefix,
+                                         const string& frame_name,
+                                         const Output& initial_value) {
   Output enter_initial_value = ops::internal::Enter(
       root.WithOpName(prefix + "/enter"), initial_value, frame_name);
 
-  ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_initial_value});
+  ops::Merge iv(root.WithOpName(prefix + "/iv"),
+                {enter_initial_value, enter_initial_value});
   Output increment_by = ops::Const(root.WithOpName(prefix + "/incr"), 1);
   Output final_value = ops::Const(root.WithOpName(prefix + "/final"), 10);
   Output loop_cond_expr =
@@ -66,16 +128,84 @@ Output CreateInductionVariable(const Scope& root, const string& prefix,
       ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr);
   ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
   ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output);
-  Output iv_next =
-      ops::Add(root.WithOpName(prefix + "/ivnext"), iv.output, increment_by);
+  Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"),
+                            latch.output_true, increment_by);
   Output next_iteration =
-      ops::NextIteration(root.WithOpName(prefix + "next_iteration"), iv_next);
+      ops::NextIteration(root.WithOpName(prefix + "/next_iteration"), iv_next);
 
-  root.graph()->AddEdge(next_iteration.node(), 0, iv.output.node(), 1);
+  CHECK(root.graph()
+            ->UpdateEdge(next_iteration.node(), 0, iv.output.node(), 1)
+            .ok());
   root.graph()->AddControlEdge(iv.output.node(), increment_by.node());
   root.graph()->AddControlEdge(iv.output.node(), final_value.node());
 
-  return iv.output;
+  return {iv.output, loop_cond};
+}
+
+InductionVarInfo CreateInductionVariable(const Scope& root,
+                                         const string& prefix,
+                                         const string& frame_name, int32 init) {
+  return CreateInductionVariable(
+      root, prefix, frame_name,
+      ops::Const(root.WithOpName(prefix + "/init"), init));
+}
+
+// Creates an induction variable with the following structure:
+//
+//                           +---------------+
+//                           | initial_value |
+//                           +---------------+
+//                             |
+//                             |
+//                             v
+//                           +---------------+
+//                           |     Enter     |
+//                           +---------------+
+//                             |
+//                             |
+//                             v
+//                           +---------------+
+//                           |     Merge     | <+
+//                           +---------------+  |
+//                             |                |
+//                             |                |
+//                             v                |
+//         +-----------+     +---------------+  |
+//         | loop_cond | --> |    Switch     | -+
+//         +-----------+     +---------------+
+//                             |
+//                             |
+//                             v
+//                           +---------------+
+//                           |     Exit      |
+//                           +---------------+
+struct DependentInductionVar {
+  Output induction_var;
+  ops::Switch latch;
+};
+
+DependentInductionVar CreateDependentLoopInvariantValue(
+    const Scope& root, const string& prefix, const string& frame_name,
+    const Output& loop_cond, const Output& value) {
+  Output enter_value = ops::internal::Enter(root.WithOpName(prefix + "/enter"),
+                                            value, frame_name);
+  ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_value, enter_value});
+  ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
+  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output);
+  Output next_iteration = ops::NextIteration(
+      root.WithOpName(prefix + "/next_iteration"), latch.output_true);
+  CHECK(root.graph()
+            ->UpdateEdge(next_iteration.node(), 0, iv.output.node(), 1)
+            .ok());
+  return {iv.output, latch};
+}
+
+DependentInductionVar CreateDependentLoopInvariantValue(
+    const Scope& root, const string& prefix, const string& frame_name,
+    const Output& loop_cond, int32 value) {
+  return CreateDependentLoopInvariantValue(
+      root, prefix, frame_name, loop_cond,
+      ops::Const(root.WithOpName(prefix + "/init"), value));
 }
 
 TEST(DeadnessAnalysisTest, BasicPositive) {
@@ -337,21 +467,224 @@ TEST(DeadnessAnalysisTest, HostRecv) {
 
 TEST(DeadnessAnalysisTest, Loop) {
   Scope root = Scope::NewRootScope().ExitOnError();
-  Output iv0 = CreateInductionVariable(root, "iv0", "fr0", 0);
-  Output iv1 = CreateInductionVariable(root, "iv1", "fr0", 0);
-  Output iv2 = CreateInductionVariable(root, "iv2", "fr0", 1);
+  Output iv0 = CreateInductionVariable(root, "iv0", "fr0", 0).induction_var;
+  Output iv1 = CreateInductionVariable(root, "iv1", "fr0", 0).induction_var;
+  Output iv2 = CreateInductionVariable(root, "iv2", "fr0", 1).induction_var;
   Output add0 = ops::Add(root.WithOpName("add0"), iv0, iv1);
   Output add1 = ops::Add(root.WithOpName("add1"), iv1, iv2);
 
-  std::unique_ptr<DeadnessAnalysis> result;
-  TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
-
   // NB!  iv0 and iv1 are equivalent and a smarter deadness analysis would have
   // noticed that.  Today we are pessimistic here because we assign an
   // uninterpreted symbol to merges with backedges.
 
-  EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add0.node()));
-  EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add1.node()));
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    std::unique_ptr<DeadnessAnalysis> result;
+    TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+    EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add0.node()));
+    EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add1.node()));
+  }
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    // In theory we should be able to tell that iv0/cond:0 and iv1/cond:0
+    // produce the same deadness.  But we're not that smart today.
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv0)], "{#true,&,*iv0/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv1)], "{#true,&,*iv1/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv2)], "{#true,&,*iv2/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "({#true,&,*iv1/cond:0} & {#true,&,*iv0/cond:0})");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add1)],
+              "({#true,&,*iv1/cond:0} & {#true,&,*iv2/cond:0})");
+  }
+}
+
+TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0);
+  Output dependent_iv0 =
+      CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0)
+          .induction_var;
+  Output dependent_iv1 =
+      CreateDependentLoopInvariantValue(root, "div1", "frame", iv.loop_cond, 0)
+          .induction_var;
+  Output add0 = ops::Add(root.WithOpName("add0"), dependent_iv0, dependent_iv1);
+
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    std::unique_ptr<DeadnessAnalysis> result;
+    TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+    EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add0.node()));
+  }
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)],
+              "{#true,&,*iv0/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv0)],
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv1)],
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+  }
+}
+
+TEST(DeadnessAnalysisTest, LoopInvariantPredicateOnBackedge) {
+  // Create a merge that "looks like" a loop but isn't really.  It has a value
+  // that does not depend on the merge on its backedge.
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0);
+  DependentInductionVar dependent_iv =
+      CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0);
+  FixupSourceAndSinkEdges(root.graph());
+
+  // To make deadness analysis think that dependent_iv is a loop we need an RPO
+  // that visits the merge before the backedge.  This is a legal RPO for
+  // deadness analysis since it ignores NextIteration->Merge edges during RPO.
+  // Right now dependent_iv has an edge from Merge to NextIteration so do the
+  // RPO with this edge in place.  Then remove this edge to get our test case.
+  std::vector<Node*> rpo;
+  GetReversePostOrder(*root.graph(), &rpo, /*stable_comparator=*/{},
+                      /*edge_filter=*/[](const Edge& edge) {
+                        return !edge.src()->IsNextIteration();
+                      });
+  TF_ASSERT_OK(root.graph()->UpdateEdge(
+      iv.induction_var.node(), 0, dependent_iv.latch.output_true.node(), 0));
+
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), rpo, &predicate_map));
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv.induction_var)],
+              "div0/iv:0");
+  }
+}
+
+TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv_outer =
+      CreateInductionVariable(root, "iv_outer", "frame", 0);
+  ops::Switch inner_value(root.WithOpName("outer_is_live"),
+                          ops::Const(root.WithOpName("constant"), 5),
+                          iv_outer.loop_cond);
+  InductionVarInfo iv_inner = CreateInductionVariable(
+      root, "iv_inner", "frame",
+      ops::internal::Enter(root.WithOpName("iv_inner/enter"),
+                           inner_value.output_true, "frame_inner"));
+
+  Output dependent_outer_iv0 =
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv0", "frame",
+                                        iv_outer.loop_cond, 0)
+          .induction_var;
+  Output dependent_outer_iv1 =
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv1", "frame",
+                                        iv_outer.loop_cond, 0)
+          .induction_var;
+
+  Output dependent_inner_iv0 =
+      CreateDependentLoopInvariantValue(root, "dependent_inner_iv0", "frame",
+                                        iv_inner.loop_cond, dependent_outer_iv0)
+          .induction_var;
+  Output dependent_inner_iv1 =
+      CreateDependentLoopInvariantValue(root, "dependent_inner_iv1", "frame",
+                                        iv_inner.loop_cond, dependent_outer_iv1)
+          .induction_var;
+
+  Output add0 = ops::Add(root.WithOpName("add0"), dependent_inner_iv0,
+                         dependent_inner_iv1);
+
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    std::unique_ptr<DeadnessAnalysis> result;
+    TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+    EXPECT_FALSE(result->HasInputsWithMismatchingDeadness(*add0.node()));
+  }
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)],
+              "{#true,&,*iv_outer/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner.induction_var)],
+              "{(*iv_outer/cond:0 & {#true,&,*iv_outer/cond:0}),&,"
+              "*iv_inner/cond:0}");
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv0)],
+              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
+              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)],
+              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
+              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
+              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+  }
+}
+
+TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv_outer_0 =
+      CreateInductionVariable(root, "iv_outer_0", "frame", 0);
+  ops::Switch inner_value_0(root.WithOpName("outer_0_is_live"),
+                            ops::Const(root.WithOpName("constant"), 5),
+                            iv_outer_0.loop_cond);
+  InductionVarInfo iv_inner_0 = CreateInductionVariable(
+      root, "iv_inner_0", "frame",
+      ops::internal::Enter(root.WithOpName("iv_inner_0/enter"),
+                           inner_value_0.output_true, "frame_inner"));
+
+  InductionVarInfo iv_outer_1 =
+      CreateInductionVariable(root, "iv_outer_1", "frame", 1);
+  ops::Switch inner_init_value_1(root.WithOpName("outer_1_is_live"),
+                                 ops::Const(root.WithOpName("constant"), 5),
+                                 iv_outer_1.loop_cond);
+  InductionVarInfo iv_inner_1 = CreateInductionVariable(
+      root, "iv_inner_1", "frame",
+      ops::internal::Enter(root.WithOpName("iv_inner_1/enter"),
+                           inner_init_value_1.output_true, "frame_inner"));
+  Output add0 = ops::Add(root.WithOpName("add0"), iv_inner_0.induction_var,
+                         iv_inner_1.induction_var);
+
+  VLogGraphIfAsked(*root.graph());
+
+  {
+    std::unique_ptr<DeadnessAnalysis> result;
+    TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+    EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*add0.node()));
+  }
+
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_0.induction_var)],
+              "{#true,&,*iv_outer_0/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_0.induction_var)],
+              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
+              "*iv_inner_0/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_1.induction_var)],
+              "{#true,&,*iv_outer_1/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_1.induction_var)],
+              "{(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
+              "*iv_inner_1/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
+              "({(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
+              "*iv_inner_1/cond:0} & "
+              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
+              "*iv_inner_0/cond:0})");
+  }
 }
 
 TEST(DeadnessAnalysisTest, ControlInputs) {
@@ -454,9 +787,8 @@ TEST(DeadnessAnalysisTest, RecvVsSwitchText) {
   std::unique_ptr<DeadnessAnalysis> result;
   TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
 
-  deadness_analysis_internal::PredicateMapTy predicate_map;
-  TF_ASSERT_OK(deadness_analysis_internal::ComputePredicates(*root.graph(),
-                                                             &predicate_map));
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
   TensorId logical_and_output_0 = {logical_and.node()->name(),
                                    Graph::kControlSlot};
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index f150bf1819..2788102620 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/graph.h"
@@ -44,7 +45,6 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -2504,7 +2504,8 @@ Status EncapsulateSubgraphsPass::Run(
 
         const int num_args = input_permutation->size();
         std::vector<bool> const_args(num_args);
-        TF_RETURN_IF_ERROR(BackwardsConstAnalysis(**subgraph, &const_args));
+        TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
+            **subgraph, &const_args, /*compile_time_const_nodes=*/nullptr));
 
         DataTypeVector arg_types(num_args);
         TF_RETURN_IF_ERROR(GetArgTypes(**subgraph, &arg_types));
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index c0543a0079..b3600fc48b 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 
+#include "absl/strings/match.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
@@ -124,8 +124,8 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
   std::unordered_set<string> control_input_a;
   std::unordered_set<string> control_input_b;
   for (int i = 0; i < a.input_size(); ++i) {
-    if (str_util::StartsWith(a.input(i), "^")) {
-      if (!str_util::StartsWith(b.input(i), "^")) {
+    if (absl::StartsWith(a.input(i), "^")) {
+      if (!absl::StartsWith(b.input(i), "^")) {
         if (diff) {
           *diff = strings::StrCat(
               diff_preamble, " mismatch for node ", a.name(), " input ", i,
@@ -768,7 +768,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) {
         Graph* graph = graph_ptr->get();
         for (const Node* n : graph->nodes()) {
           if (n->type_string() == "_Arg" &&
-              str_util::StartsWith(n->name(), "const")) {
+              absl::StartsWith(n->name(), "const")) {
             ++guaranteed_consts;
             EXPECT_TRUE(HasGuaranteeConstAttr(*n));
           } else {
@@ -813,7 +813,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) {
         Graph* graph = graph_ptr->get();
         for (const Node* n : graph->nodes()) {
           if (n->type_string() == "_Arg" &&
-              str_util::StartsWith(n->name(), "const")) {
+              absl::StartsWith(n->name(), "const")) {
             ++guaranteed_consts;
             EXPECT_TRUE(HasGuaranteeConstAttr(*n));
           } else {
diff --git a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
index 4d49a14b24..c37b6112cc 100644
--- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
+++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/build_xla_launch_ops_pass.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
+#include "tensorflow/compiler/jit/partially_decluster_pass.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 
 namespace tensorflow {
@@ -23,15 +24,18 @@ namespace tensorflow {
 REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 10,
                       MarkForCompilationPass);
 
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20,
+                      PartiallyDeclusterPass);
+
 // The EncapsulateSubgraphs pass must run after the MarkForCompilationPass. We
 // also need to run it after the graph been rewritten to have _Send nodes added
 // for fetches. Before the _Send nodes are added, fetch nodes are identified by
 // name, and encapsulation might remove that node from the graph.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20,
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30,
                       EncapsulateSubgraphsPass);
 
 // Must run after EncapsulateSubgraphsPass.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30,
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 40,
                       BuildXlaLaunchOpsPass);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 8f78c110cb..253a5d2547 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -29,16 +29,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-cc_library(
-    name = "parallel_check_op",
-    srcs = ["parallel_check_op.cc"],
-    visibility = ["//tensorflow/compiler/jit:friends"],
-    deps = [
-        "//tensorflow/compiler/jit/legacy_flags:parallel_check_op_flags",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-    ],
-    alwayslink = 1,
-)
diff --git a/tensorflow/compiler/jit/kernels/parallel_check_op.cc b/tensorflow/compiler/jit/kernels/parallel_check_op.cc
deleted file mode 100644
index bd4eefbc0b..0000000000
--- a/tensorflow/compiler/jit/kernels/parallel_check_op.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/jit/legacy_flags/parallel_check_op_flags.h"
-#include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/macros.h"
-
-namespace tensorflow {
-namespace {
-
-// Inputs 2*N tensors, outputs the first N inputs.
-// Logs errors if input tensor i and i + N are not (near) identical
-// in any position.
-class ParallelCheckOp : public OpKernel {
- public:
-  explicit ParallelCheckOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  template <typename T>
-  int CompareTensors(DataType dtype, const char* v0, const char* v1,
-                     int64 num_elts, int input_idx) {
-    int failed = 0;
-    const T* p0 = reinterpret_cast<const T*>(v0);
-    const T* p1 = reinterpret_cast<const T*>(v1);
-    double rtol;
-    legacy_flags::ParallelCheckOpFlags* flags =
-        legacy_flags::GetParallelCheckOpFlags();
-    if (!tensorflow::strings::safe_strtod(flags->parallel_check_rtol.c_str(),
-                                          &rtol)) {
-      LOG(ERROR) << "can't convert parallel_check_rtol "
-                 << flags->parallel_check_rtol << " to double";
-    }
-    double atol;
-    if (!tensorflow::strings::safe_strtod(flags->parallel_check_atol.c_str(),
-                                          &atol)) {
-      LOG(ERROR) << "can't convert parallel_check_atol "
-                 << flags->parallel_check_atol << " to double";
-    }
-    for (int i = 0; i < num_elts; ++i) {
-      bool ok = (p0[i] == p1[i]);
-      VLOG(2) << "output " << input_idx << " element " << i << ": " << p0[i];
-      if (!ok) {
-        if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
-          float tolerance =
-              std::max(atol, std::max(fabs(rtol * p0[i]), fabs(rtol * p1[i])));
-          T diff = p0[i] - p1[i];
-          if (diff < 0) diff = 0 - diff;
-          ok = (diff <= tolerance);
-        }
-        if (ok) continue;
-        LOG(ERROR) << "Op " << name() << " fails equality at output "
-                   << input_idx << " type " << DataTypeString(dtype)
-                   << " element " << i << ": std_val=" << p0[i]
-                   << " test_val=" << p1[i] << " diff=" << (p0[i] - p1[i]);
-        if (++failed > 10) break;
-      }
-    }
-    return failed;
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    VLOG(1) << "Compute " << name();
-    const int num_pairs = ctx->num_inputs() / 2;
-    for (int i = 0; i < num_pairs; ++i) {
-      CHECK_EQ(ctx->input_dtype(i), ctx->input_dtype(i + num_pairs));
-      Tensor t0 = ctx->input(i);
-      Tensor t1 = ctx->input(i + num_pairs);
-      int64 num_elts = t0.NumElements();
-      CHECK_EQ(num_elts, t1.NumElements());
-
-      // Compare inputs elementwise for near-exact equality.
-      const char* v0 = t0.tensor_data().data();
-      const char* v1 = t1.tensor_data().data();
-      int failed = 0;
-      switch (ctx->input_dtype(i)) {
-        case DT_INT32:
-          failed =
-              CompareTensors<int32>(ctx->input_dtype(i), v0, v1, num_elts, i);
-          break;
-        case DT_INT64:
-          failed =
-              CompareTensors<int64>(ctx->input_dtype(i), v0, v1, num_elts, i);
-          break;
-        case DT_FLOAT:
-          failed =
-              CompareTensors<float>(ctx->input_dtype(i), v0, v1, num_elts, i);
-          break;
-        case DT_DOUBLE:
-          failed =
-              CompareTensors<double>(ctx->input_dtype(i), v0, v1, num_elts, i);
-          break;
-        case DT_BOOL:
-          failed =
-              CompareTensors<bool>(ctx->input_dtype(i), v0, v1, num_elts, i);
-          break;
-        default:
-          LOG(FATAL) << "unimpl: " << ctx->input_dtype(i);
-      }
-      if (failed > 0) {
-        LOG(ERROR) << "check failed for " << name() << " output " << i
-                   << " num_elts: " << num_elts;
-        legacy_flags::ParallelCheckOpFlags* flags =
-            legacy_flags::GetParallelCheckOpFlags();
-        if (flags->parallel_check_failfast) {
-          LOG(QFATAL) << "failfast on first parallel-check failure";
-        }
-      } else {
-        VLOG(1) << "check passed for " << name() << " output " << i
-                << " num_elts: " << num_elts;
-      }
-
-      // Propagate the std value.
-      if (IsRefType(ctx->input_dtype(i))) {
-        ctx->forward_ref_input_to_ref_output(i, i);
-      } else {
-        ctx->set_output(i, ctx->input(i));
-      }
-    }
-  }
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ParallelCheckOp);
-};
-
-REGISTER_KERNEL_BUILDER(Name("ParallelCheck").Device(DEVICE_CPU),
-                        ParallelCheckOp);
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 7f4370b5b0..fde4135bf7 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -176,17 +176,18 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
   }
   XlaCompiler::CompileOptions compile_options;
   compile_options.is_entry_computation = true;
-  // Optimization: don't resolve constants. If we resolve constants we never
-  // emit them on the device, meaning that if they are needed by a following
-  // computation the host has to transfer them.
-  compile_options.resolve_compile_time_constants = false;
+  // If we resolve constants we never emit them on the device, meaning that if
+  // they are needed by a following computation the host has to transfer
+  // them. Not resolving constants is expected to be faster than resolving
+  // constants.
+  compile_options.resolve_compile_time_constants = true;
   // Optimization: where possible, have the computation return a naked array
   // rather than a one-element tuple.
   compile_options.always_return_tuple = false;
 
   OP_REQUIRES_OK(
       ctx, cache->Compile(options, function_, constant_args, variables, ctx,
-                          &kernel, &executable, &compile_options));
+                          &kernel, &executable, compile_options));
 
   VLOG(1) << "Executing XLA Computation...";
 
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h
index 8dfc4b382d..bf1e990668 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.h
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_
-#define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_
+#ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LAUNCH_OP_H_
+#define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LAUNCH_OP_H_
 
 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -81,4 +81,4 @@ class XlaLocalLaunchOp : public XlaLocalLaunchBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LOCAL_LAUNCH_OP_H_
+#endif  // TENSORFLOW_COMPILER_JIT_KERNELS_XLA_LAUNCH_OP_H_
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index d33287fcc3..518c39ec15 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -27,7 +27,9 @@ limitations under the License.
 #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/resource_operation_table.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -39,7 +41,10 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
@@ -72,18 +77,40 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok();
 }
 
+bool HasResourceOutput(const Node& node) {
+  return std::find(node.output_types().begin(), node.output_types().end(),
+                   DT_RESOURCE) != node.output_types().end();
+}
+
+bool HasResourceInput(const Node& node) {
+  return std::find(node.input_types().begin(), node.input_types().end(),
+                   DT_RESOURCE) != node.input_types().end();
+}
+
+// Returns true if `node` is a resource operation recognized by tf2xla that
+// operates on something other than resource variables.
+bool IsNonResourceVarResourceOp(const Node& node) {
+  // TODO(b/112837194): We can't cluster these because we only support
+  // snapshotting resource variables (and we can't e.g. snapshot stacks).  This
+  // limitation may be fixable with some work.
+  const XlaResourceOpInfo* op_info = GetResourceOpInfoForOp(node.type_string());
+  return op_info && op_info->resource_kind() != XlaResourceKind::kVariable;
+}
+
 // Make sure we don't recurse infinitely on recursive functions.
 const int kMaxRecursionDepth = 10;
 
 bool IsCompilableCall(const NodeDef& call_def,
-                      const DeviceType& jit_device_type, int depth,
+                      const DeviceType& jit_device_type,
+                      bool allow_resource_ops, int depth,
                       FunctionLibraryRuntime* lib_runtime);
 
 // Tests whether 'while_node' is a completely compilable loop.
 // Every operator in the condition and body functions must be compilable for a
 // while loop to be compilable.
 bool IsCompilableWhile(const Node& while_node,
-                       const DeviceType& jit_device_type, int depth,
+                       const DeviceType& jit_device_type,
+                       bool allow_resource_ops, int depth,
                        FunctionLibraryRuntime* lib_runtime) {
   const NameAttrList* name_attr;
   NodeDef call;
@@ -98,7 +125,8 @@ bool IsCompilableWhile(const Node& while_node,
   call.set_name("while_cond");
   call.set_op(cond_func);
   *call.mutable_attr() = name_attr->attr();
-  if (!IsCompilableCall(call, jit_device_type, depth + 1, lib_runtime)) {
+  if (!IsCompilableCall(call, jit_device_type, allow_resource_ops, depth + 1,
+                        lib_runtime)) {
     VLOG(2) << "Rejecting While " << while_node.name()
             << ": can't compile loop condition: " << cond_func;
     return false;
@@ -113,7 +141,8 @@ bool IsCompilableWhile(const Node& while_node,
   call.set_name("while_body");
   call.set_op(body_func);
   *call.mutable_attr() = name_attr->attr();
-  if (!IsCompilableCall(call, jit_device_type, depth + 1, lib_runtime)) {
+  if (!IsCompilableCall(call, jit_device_type, allow_resource_ops, depth + 1,
+                        lib_runtime)) {
     VLOG(2) << "Rejecting While " << while_node.name()
             << ": can't compile loop body: " << body_func;
     return false;
@@ -125,7 +154,8 @@ bool IsCompilableWhile(const Node& while_node,
 // Every operator in the function must be compilable for a function to be
 // compilable.
 bool IsCompilableCall(const NodeDef& call_def,
-                      const DeviceType& jit_device_type, int depth,
+                      const DeviceType& jit_device_type,
+                      bool allow_resource_ops, int depth,
                       FunctionLibraryRuntime* lib_runtime) {
   if (depth > kMaxRecursionDepth) {
     VLOG(2) << "Rejecting " << call_def.op()
@@ -141,6 +171,10 @@ bool IsCompilableCall(const NodeDef& call_def,
             << ": could not instantiate: " << status;
     return false;
   }
+
+  auto release_handle_on_return = gtl::MakeCleanup(
+      [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
+
   const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle);
   CHECK(fbody);
   const FunctionDef& fdef = fbody->fdef;
@@ -161,12 +195,17 @@ bool IsCompilableCall(const NodeDef& call_def,
     if (node->type_string() == "_Arg" || node->type_string() == "_Retval")
       continue;
     if (node->type_string() == "While") {
-      // Handle functional While loop (not in open source build).
-      return IsCompilableWhile(*node, jit_device_type, depth + 1, lib_runtime);
+      // Handle functional While loop.
+      return IsCompilableWhile(*node, jit_device_type, allow_resource_ops,
+                               depth + 1, lib_runtime);
+    }
+    if (!allow_resource_ops &&
+        (HasResourceInput(*node) || HasResourceOutput(*node))) {
+      return false;
     }
     if (!HasXLAKernel(*node, jit_device_type) &&
-        !IsCompilableCall(node->def(), jit_device_type, depth + 1,
-                          lib_runtime)) {
+        !IsCompilableCall(node->def(), jit_device_type, allow_resource_ops,
+                          depth + 1, lib_runtime)) {
       VLOG(2) << "Rejecting " << call_def.op() << ": unsupported op "
               << node->name() << ": " << node->def().ShortDebugString();
       return false;
@@ -175,14 +214,6 @@ bool IsCompilableCall(const NodeDef& call_def,
   return true;
 }
 
-// Tests whether `node` has a DT_RESOURCE typed input or output.
-bool HasResourceInputOrOutput(const Node& node) {
-  return std::find(node.input_types().begin(), node.input_types().end(),
-                   DT_RESOURCE) != node.input_types().end() ||
-         std::find(node.output_types().begin(), node.output_types().end(),
-                   DT_RESOURCE) != node.output_types().end();
-}
-
 // Returns true if the op can be decomposed into XLA ops for which
 // there are fusable elemental implementations.
 //
@@ -345,6 +376,10 @@ Status FindCompilationCandidates(
                                         flib_def, opts));
   FunctionLibraryRuntime* lib_runtime =
       pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
+  std::vector<bool> compile_time_const_nodes(graph.num_node_ids(), false);
+  TF_RETURN_IF_ERROR(
+      BackwardsConstAnalysis(graph, /*compile_time_const_arg_indices=*/nullptr,
+                             &compile_time_const_nodes));
 
   int64& fuel =
       legacy_flags::GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
@@ -388,19 +423,46 @@ Status FindCompilationCandidates(
         XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration));
     DeviceType jit_device_type(registration->compilation_device_name);
     if (!HasXLAKernel(*node, jit_device_type) &&
-        !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime)) {
+        !IsCompilableCall(node->def(), jit_device_type,
+                          registration->compile_resource_ops, 0, lib_runtime)) {
       VLOG(2) << "Rejecting " << node->name() << ": unsupported op "
               << node->type_string();
       continue;
     }
     if (!registration->compile_resource_ops &&
-        HasResourceInputOrOutput(*node)) {
-      VLOG(2) << "Rejecting: " << node->name() << ": resource input/output "
+        (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) {
+      // We don't have a way of returning values of type DT_RESOURCE from XLA
+      // computations so we avoid auto-clustering nodes producing DT_RESOURCE.
+      // XlaLaunchOp also cannot snapshot resources that are not resource
+      // variables so we avoid clustering resource operations that operate on
+      // non-resource variables.
+      VLOG(2) << "Rejecting: " << node->name() << ": resource output "
               << node->type_string();
       continue;
     }
+    if (compile_time_const_nodes[node->id()] &&
+        !registration->requires_compilation) {
+      const OpDef* op_def;
+      TF_RETURN_IF_ERROR(
+          OpRegistry::Global()->LookUpOpDef(node->type_string(), &op_def));
+      if (op_def->is_stateful()) {
+        // We need to be able to constant fold the nodes in
+        // compile_time_const_nodes given constant inputs (required by XLA) and
+        // therefore can't auto-cluster stateful ops since these can never be
+        // constant folded.
+        VLOG(2) << "Rejecting " << node->name()
+                << ": must-be-constant stateful op";
+        continue;
+      }
+    }
+    // We don't auto-cluster functional control flow nodes containing resource
+    // operations because safety checks are trickier in this case.
+    // registration->compile_resource_ops is true for XLA_CPU/XLA_GPU but not
+    // for CPU/GPU.
     if (node->type_string() == "While" &&
-        !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime)) {
+        !IsCompilableWhile(*node, jit_device_type,
+                           registration->compile_resource_ops, 0,
+                           lib_runtime)) {
       continue;
     }
     // _Arg nodes in a top-level function represent feeds.
@@ -420,6 +482,31 @@ Status FindCompilationCandidates(
   return Status::OK();
 }
 
+// Determine the global jit level which is ON if either the
+// GraphOptimizationPassOptions has the jit ON, or if the --tf_xla_auto_jit flag
+// is true.
+OptimizerOptions::GlobalJitLevel GetGlobalJitLevel(
+    const GraphOptimizationPassOptions& options) {
+  OptimizerOptions::GlobalJitLevel global_jit_level =
+      options.session_options->config.graph_options()
+          .optimizer_options()
+          .global_jit_level();
+  if (global_jit_level == OptimizerOptions::DEFAULT) {
+    // To set compilation to be on by default, change the following line.
+    global_jit_level = OptimizerOptions::OFF;
+  }
+  legacy_flags::MarkForCompilationPassFlags* flags =
+      legacy_flags::GetMarkForCompilationPassFlags();
+  if (flags->tf_xla_auto_jit == -1 ||
+      (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) {
+    // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides
+    // the setting in ConfigProto.
+    global_jit_level =
+        static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
+  }
+  return global_jit_level;
+}
+
 struct Cluster {
   // Identifies the node that represents this cluster in the cycle detection
   // graph.
@@ -434,7 +521,11 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) {
   CHECK(XlaOpRegistry::GetCompilationDevice(device->device_type(),
                                             &registration));
   DeviceType jit_device_type(registration->compilation_device_name);
-  return IsCompilableCall(ndef, jit_device_type, 0, flr);
+
+  // We can always *compile* resource operations, even if we are sometimes
+  // unable to auto-cluster them.
+  const bool compile_resource_ops = true;
+  return IsCompilableCall(ndef, jit_device_type, compile_resource_ops, 0, flr);
 }
 
 Status MarkForCompilationPass::Run(
@@ -442,22 +533,9 @@ Status MarkForCompilationPass::Run(
   // TODO(phawkins): precompute the "GetCompilationDevice" properties of each
   // device ahead of time.
   OptimizerOptions::GlobalJitLevel global_jit_level =
-      options.session_options->config.graph_options()
-          .optimizer_options()
-          .global_jit_level();
-  if (global_jit_level == OptimizerOptions::DEFAULT) {
-    // To set compilation to be on by default, change the following line.
-    global_jit_level = OptimizerOptions::OFF;
-  }
+      GetGlobalJitLevel(options);
   legacy_flags::MarkForCompilationPassFlags* flags =
       legacy_flags::GetMarkForCompilationPassFlags();
-  if (flags->tf_xla_auto_jit == -1 ||
-      (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) {
-    // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides
-    // the setting in ConfigProto.
-    global_jit_level =
-        static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
-  }
   bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
   bool fusion_only = flags->tf_xla_fusion_only;
 
@@ -525,9 +603,9 @@ Status MarkForCompilationPass::Run(
     bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU;
     bool should_compile =
         (ignore_registration || registration->enable_jit_by_default) &&
-        global_jit_level > 0;
+        global_jit_level != OptimizerOptions::OFF;
     if (!should_compile) {
-      if (global_jit_level <= 0) {
+      if (global_jit_level == OptimizerOptions::OFF) {
         VLOG(2) << "Rejecting " << node->name() << ": global jit disabled.";
       } else {
         VLOG(2) << "Rejecting " << node->name() << ": JIT for device disabled.";
@@ -538,6 +616,60 @@ Status MarkForCompilationPass::Run(
   return RunImpl(options, is_compilable);
 }
 
+static string RatioToString(int numerator, int denominator) {
+  return strings::Printf("%d / %d (%.2f%%)", numerator, denominator,
+                         (100.0 * numerator) / denominator);
+}
+
+static void VLogClusteringSummary(const Graph& g) {
+  if (!VLOG_IS_ON(2)) {
+    return;
+  }
+
+  std::map<StringPiece, int> cluster_name_to_size;
+  std::map<StringPiece, std::map<StringPiece, int>>
+      cluster_name_to_op_histogram;
+  std::map<StringPiece, int> unclustered_op_histogram;
+  int clustered_node_count = 0;
+
+  for (Node* n : g.nodes()) {
+    absl::optional<StringPiece> cluster_name = GetXlaClusterForNode(*n);
+    if (cluster_name) {
+      clustered_node_count++;
+      cluster_name_to_size[*cluster_name]++;
+      cluster_name_to_op_histogram[*cluster_name][n->type_string()]++;
+    } else {
+      unclustered_op_histogram[n->type_string()]++;
+    }
+  }
+
+  int unclustered_node_count = g.num_nodes() - clustered_node_count;
+
+  VLOG(2) << "*** Clustering info for graph of size " << g.num_nodes();
+  VLOG(2) << " Built " << cluster_name_to_size.size() << " clusters, size "
+          << RatioToString(clustered_node_count, g.num_nodes());
+
+  for (const auto& cluster_name_size_pair : cluster_name_to_size) {
+    StringPiece cluster_name = cluster_name_size_pair.first;
+    int size = cluster_name_size_pair.second;
+    VLOG(2) << "  " << cluster_name << " "
+            << RatioToString(size, g.num_nodes());
+    for (const auto& op_count_pair :
+         cluster_name_to_op_histogram[cluster_name]) {
+      VLOG(3) << "   " << op_count_pair.first << ": " << op_count_pair.second
+              << " instances";
+    }
+  }
+
+  if (!unclustered_op_histogram.empty()) {
+    VLOG(2) << " Unclustered nodes: "
+            << RatioToString(unclustered_node_count, g.num_nodes());
+    for (const auto& pair : unclustered_op_histogram) {
+      VLOG(3) << "  " << pair.first << ": " << pair.second << " instances";
+    }
+  }
+}
+
 // Is 'node' an operator that consumes only the shape of its input, not the
 // data itself?
 static bool IsShapeConsumerOp(const Node& node) {
@@ -545,6 +677,43 @@ static bool IsShapeConsumerOp(const Node& node) {
          node.type_string() == "Size";
 }
 
+static Status IgnoreResourceOpForSafetyAnalysis(const Node& n, bool* ignore) {
+  // If a resource operation is assigned to XLA_CPU or XLA_GPU explicitly then
+  // ignore it during resource operation safety analysis.  We need this hack
+  // because of two reasons:
+  //
+  //  1. Operations assigned to XLA_CPU and XLA_GPU have to always be compiled.
+  //  2. We don't support live-out values of type DT_RESOURCE and live-in values
+  //     of type DT_RESOURCE that are not resource variables.
+  //
+  // Together these imply we cannot let resource variable safety analysis
+  // constrain e.g. a TensorArrayV3->TensorArrayAssignV3 edge to be in different
+  // clusters: both of them will have to be clustered because of (1) and we
+  // won't be able to keep the edge between the two as neither the input to the
+  // second XLA cluster nor the output from the first XLA cluster are supported
+  // because of (2).
+  //
+  // TODO(b/113100872): This can be fixed if the TensorFlow representation for
+  // TensorArray and Stack on the XLA_{C|G}PU devices were the same in XLA; then
+  // (2) would no longer hold.
+
+  if (n.assigned_device_name().empty()) {
+    *ignore = false;
+    return Status::OK();
+  }
+  DeviceType device_type("");
+  TF_RETURN_IF_ERROR(
+      DeviceToDeviceType(n.assigned_device_name(), &device_type));
+
+  const XlaOpRegistry::DeviceRegistration* registration;
+  if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
+    *ignore = true;
+  } else {
+    *ignore = registration->compile_resource_ops;
+  }
+  return Status::OK();
+}
+
 // Sequence number generator to ensure clusters have unique names.
 static std::atomic<int64> cluster_sequence_num;
 
@@ -573,6 +742,8 @@ Status MarkForCompilationPass::RunImpl(
 
   GraphCycles cycles;
   TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(graph, &cycles));
+  TF_RETURN_IF_ERROR(AdjustCycleDetectionGraphForResourceOps(
+      graph, options.flib_def, IgnoreResourceOpForSafetyAnalysis, &cycles));
 
   // Each compilation candidate belongs to a cluster. The cluster's
   // representative
@@ -585,6 +756,8 @@ Status MarkForCompilationPass::RunImpl(
     worklist.push_back(&clusters[node->id()]);
   }
 
+  OptimizerOptions::GlobalJitLevel global_jit_level =
+      GetGlobalJitLevel(options);
   legacy_flags::MarkForCompilationPassFlags* flags =
       legacy_flags::GetMarkForCompilationPassFlags();
 
@@ -609,7 +782,7 @@ Status MarkForCompilationPass::RunImpl(
     string to_scope;
     for (int to : cycles.Successors(from)) {
       if (to >= graph->num_node_ids()) {
-        // Node is a "frame" node that is present only in the cycle detection
+        // Node is a fictitious node that is present only in the cycle detection
         // graph. No clustering is possible.
         continue;
       }
@@ -624,13 +797,15 @@ Status MarkForCompilationPass::RunImpl(
       }
       // Look for an _XlaScope on both nodes.  If both nodes have a
       // scope and the scopes do not match, do not cluster along this
-      // edge.  If even one of the nodes lacks an _XlaScope attribute,
+      // edge. This restriction is overridden if the global_jit_level is ON. If
+      // even one of the nodes lacks an _XlaScope attribute,
       // then it is treated as a "bridge" and a cluster may be created
       // along it.  We may want to restrict this behavior to require
       // all nodes marked with _XlaCompile=true to also have a
       // _XlaScope property set (and raise an error otherwise); but
       // for now we don't do this.
-      if (GetNodeAttr(node_from->attrs(), kXlaScopeAttr, &from_scope).ok() &&
+      if (global_jit_level == OptimizerOptions::OFF &&
+          GetNodeAttr(node_from->attrs(), kXlaScopeAttr, &from_scope).ok() &&
           GetNodeAttr(node_to->attrs(), kXlaScopeAttr, &to_scope).ok() &&
           from_scope != to_scope) {
         continue;
@@ -726,6 +901,9 @@ Status MarkForCompilationPass::RunImpl(
     dump_graph::DumpGraphToFile("mark_for_compilation", **options.graph,
                                 options.flib_def);
   }
+
+  VLogClusteringSummary(*graph);
+
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index a780d4a936..807ab51fd3 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -15,10 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h"
 
+#include "absl/strings/match.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
@@ -26,11 +28,11 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -48,9 +50,35 @@ std::unordered_map<string, string> GetClusters(const Graph& graph) {
       ids[node->name()] = cluster;
     }
   }
+
+  if (VLOG_IS_ON(2)) {
+    VLOG(2) << "Clusters:";
+    for (const auto& p : ids) {
+      VLOG(2) << " " << p.first << " -> " << p.second;
+    }
+  }
   return ids;
 }
 
+gtl::FlatMap<string, std::vector<string>> GetClusterSets(
+    const Graph& g, std::vector<string>* cluster_names = nullptr) {
+  CHECK(cluster_names == nullptr || cluster_names->empty());
+  gtl::FlatMap<string, std::vector<string>> cluster_sets;
+  for (const auto& p : GetClusters(g)) {
+    cluster_sets[p.second].push_back(p.first);
+  }
+  for (auto& p : cluster_sets) {
+    if (cluster_names != nullptr) {
+      cluster_names->push_back(p.first);
+    }
+    std::sort(p.second.begin(), p.second.end());
+  }
+  if (cluster_names != nullptr) {
+    std::sort(cluster_names->begin(), cluster_names->end());
+  }
+  return cluster_sets;
+}
+
 TEST(XlaCompilationTest, Chains) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
@@ -199,7 +227,7 @@ TEST(XlaCompilationTest, FunctionCalls) {
                                 {}, {{{"n_c"}, "UncompilableUnary", {"n_a"}}});
   FunctionDef noinline = compilable;
   noinline.mutable_signature()->set_name("NoInlineFn");
-  AddAttr("_noinline", bool(true), noinline.mutable_attr());
+  AddAttr("_noinline", static_cast<bool>(true), noinline.mutable_attr());
 
   FunctionDefLibrary flib;
   *flib.add_function() = compilable;
@@ -372,6 +400,44 @@ TEST(XlaCompilationTest, Loops) {
   EXPECT_EQ(0, clusters.size());
 }
 
+TEST(XlaCompilationTest, CyclesWithAllDifferentScopesGlobalJitOverridden) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_FLOAT)
+                                         .WithAttr("value", Tensor())
+                                         .WithAttr(kXlaScopeAttr, "ScopeA"));
+    Node* b = ops::UnaryOp(
+        "Relu", a,
+        builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "ScopeB"));
+    ops::BinaryOp(
+        "MatMul", a, b,
+        builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC"));
+    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  FunctionDefLibrary flib;
+  FunctionLibraryDefinition flib_def(graph->op_registry(), flib);
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()
+      ->mutable_optimizer_options()
+      ->set_global_jit_level(OptimizerOptions::ON_2);
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(
+      &graph, &flib_def, &session_options));
+  auto clusters = GetClusters(*graph);
+
+  // The computation is: C = A + relu(A)
+  // where A sits in ScopeA, relu(A) sits in ScopeB, and C sits in ScopeC.
+  // In this case, the GlobalJitLevel overrides the scopes to cluster while
+  // ignoring scopes.
+  EXPECT_EQ(3, clusters.size());
+  EXPECT_EQ(clusters["A"], clusters["B"]);
+  EXPECT_EQ(clusters["A"], clusters["C"]);
+}
+
 TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
@@ -463,38 +529,104 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) {
   EXPECT_EQ(clusters["B"], clusters["C"]);
 }
 
-REGISTER_OP("ResourceInput").Input("a: resource").Output("o: float");
-REGISTER_OP("ResourceOutput").Input("a: float").Output("o: resource");
-
 namespace {
+Node* MakeRead(const Scope& scope, const string& id) {
+  Output var_handle =
+      ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
+  Output read =
+      ops::ReadVariableOp(scope.WithOpName("Read" + id), var_handle, DT_FLOAT);
+  return read.node();
+}
 
-class DummyOp : public XlaOpKernel {
-  using XlaOpKernel::XlaOpKernel;
-  void Compile(XlaOpKernelContext* ctx) override {}
-};
-
-REGISTER_XLA_OP(Name("ResourceInput"), DummyOp);
-REGISTER_XLA_OP(Name("ResourceOutput"), DummyOp);
+Node* MakeWrite(const Scope& scope, const string& id) {
+  Output var_handle =
+      ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
+  Output value_to_write =
+      ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f);
+  ops::AssignVariableOp assign_op(scope.WithOpName("Assignment" + id),
+                                  var_handle, value_to_write);
+  return assign_op.operation.node();
+}
 
+Node* MakeNeutral(const Scope& scope, const string& id) {
+  return ops::Const(scope.WithOpName("Const" + id), 42.0f).node();
+}
 }  // namespace
 
-TEST(XlaCompilationTest, Resources) {
+TEST(XlaCompilationTest, ResourcesClusteringAllowed) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(read, write);
+
+  FixupSourceAndSinkEdges(root.graph());
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  GraphDef graphdef;
-  {
-    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
-    Node* a =
-        ops::SourceOp("UncompilableNullary", builder.opts().WithName("A"));
-    Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
-    // We should not form clusters with resource ops by default.
-    Node* c = ops::UnaryOp("ResourceOutput", b, builder.opts().WithName("C"));
-    Node* d = ops::UnaryOp("ResourceInput", c, builder.opts().WithName("D"));
-    ops::UnaryOp("Relu", d, builder.opts().WithName("E"));
-    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
-  }
+  TF_EXPECT_OK(root.ToGraph(graph.get()));
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
-  auto clusters = GetClusters(*graph);
-  EXPECT_EQ(0, clusters.size());  // Nothing should be compiled.
+  gtl::FlatMap<string, std::vector<string>> cluster_sets =
+      GetClusterSets(*graph);
+  ASSERT_EQ(cluster_sets.size(), 1);
+  std::vector<string> expected_clustered_nodes = {"AssignmentW", "ReadR",
+                                                  "ValueToAssignW"};
+  ASSERT_EQ(cluster_sets.begin()->second, expected_clustered_nodes);
+}
+
+TEST(XlaCompilationTest, ResourcesClusteringDisallowed) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(write, read);
+
+  FixupSourceAndSinkEdges(root.graph());
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_EXPECT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+  gtl::FlatMap<string, std::vector<string>> cluster_sets =
+      GetClusterSets(*graph);
+  ASSERT_EQ(cluster_sets.size(), 1);
+  std::vector<string> expected_clustered_nodes = {"AssignmentW",
+                                                  "ValueToAssignW"};
+  ASSERT_EQ(cluster_sets.begin()->second, expected_clustered_nodes);
+}
+
+TEST(XlaCompilationTest, ChainOfOps) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* write_0 = MakeWrite(root, "W0");
+  Node* neutral_0 = MakeNeutral(root, "N0");
+  Node* read_0 = MakeRead(root, "R0");
+  Node* write_1 = MakeWrite(root, "W1");
+  Node* neutral_1 = MakeNeutral(root, "N1");
+  Node* read_1 = MakeRead(root, "R1");
+
+  root.graph()->AddControlEdge(write_0, neutral_0);
+  root.graph()->AddControlEdge(neutral_0, read_0);
+  root.graph()->AddControlEdge(read_0, write_1);
+  root.graph()->AddControlEdge(write_1, neutral_1);
+  root.graph()->AddControlEdge(neutral_1, read_1);
+
+  FixupSourceAndSinkEdges(root.graph());
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_EXPECT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::vector<string> cluster_names;
+  gtl::FlatMap<string, std::vector<string>> cluster_sets =
+      GetClusterSets(*graph, &cluster_names);
+
+  ASSERT_EQ(cluster_sets.size(), 2);
+
+  std::vector<string> expected_clustered_nodes_a = {"AssignmentW0", "ConstN0",
+                                                    "ValueToAssignW0"};
+  ASSERT_EQ(cluster_sets[cluster_names[0]], expected_clustered_nodes_a);
+
+  std::vector<string> expected_clustered_nodes_b = {
+      "AssignmentW1", "ConstN1", "ReadR0", "ValueToAssignW1"};
+  ASSERT_EQ(cluster_sets[cluster_names[1]], expected_clustered_nodes_b);
 }
 
 TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
@@ -524,11 +656,11 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
 
   Status status = MarkForCompilationPassTestHelper::MarkForCompilation(&graph);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(str_util::StrContains(status.ToString(),
-                                    "Edge from c to a would create a cycle.\n"
-                                    "+-> a\n"
-                                    "|   b\n"
-                                    "+-- c\n"));
+  EXPECT_TRUE(absl::StrContains(status.ToString(),
+                                "Edge from c to a would create a cycle.\n"
+                                "+-> a\n"
+                                "|   b\n"
+                                "+-- c\n"));
 }
 
 TEST(XlaCompilationTest, Retval) {
@@ -693,5 +825,27 @@ TEST(XlaCompilationTest, ClusterControlTrigger) {
   EXPECT_EQ(clusters, expected_clusters);
 }
 
+TEST(XlaCompilationTest, RandomShape) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output shape_shape = ops::Const(root.WithOpName("shape_shape"), {2}, {1});
+  Output shape =
+      ops::RandomUniformInt(root.WithOpName("shape"), shape_shape,
+                            ops::Const(root.WithOpName("minval"), 1),
+                            ops::Const(root.WithOpName("maxval"), 20));
+  Output reshape_input =
+      ops::Placeholder(root.WithOpName("reshape_input"), DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({500, 500})));
+  Output reshape =
+      ops::Reshape(root.WithOpName("reshape"), reshape_input, shape);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["shape"], "");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
index a84b82e479..65669877f7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
@@ -14,10 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h"
+#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 /*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation(
-    std::unique_ptr<Graph>* graph, FunctionLibraryDefinition* flib_def) {
+    std::unique_ptr<Graph>* graph, FunctionLibraryDefinition* flib_def,
+    SessionOptions* session_options) {
   // Assign all nodes to the CPU device.
   static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
   for (Node* n : (*graph)->nodes()) {
@@ -26,12 +28,19 @@ namespace tensorflow {
 
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = graph;
+  opt_options.session_options = session_options;
   opt_options.flib_def = flib_def;
   MarkForCompilationPass pass;
   return pass.RunImpl(opt_options);
 }
 
 /*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation(
+    std::unique_ptr<Graph>* graph, FunctionLibraryDefinition* flib_def) {
+  SessionOptions session_options;
+  return MarkForCompilation(graph, flib_def, &session_options);
+}
+
+/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation(
     std::unique_ptr<Graph>* graph) {
   FunctionDefLibrary flib;
   FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib);
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h
index b9a0531cb0..216baaf933 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h
@@ -25,6 +25,11 @@ class MarkForCompilationPassTestHelper {
   // `graph` to the CPU device.  To make testing easier, ignores device
   // registration, _XlaCompile attributes, input deadness and global jit level.
   static Status MarkForCompilation(std::unique_ptr<Graph>* graph,
+                                   FunctionLibraryDefinition* flib_def,
+                                   SessionOptions* session_options);
+
+  // Like `MarkForCompilation` but creates a default SessionOptions.
+  static Status MarkForCompilation(std::unique_ptr<Graph>* graph,
                                    FunctionLibraryDefinition* flib_def);
 
   // Like `MarkForCompilation` but creates `flib_def` from the op registry.
diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index c9e46bc147..13804c6a05 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -10,10 +10,3 @@ cc_library(
     deps = ["//tensorflow/core:framework"],
     alwayslink = 1,
 )
-
-cc_library(
-    name = "parallel_check_op",
-    srcs = ["parallel_check_op.cc"],
-    deps = ["//tensorflow/core:framework"],
-    alwayslink = 1,
-)
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
new file mode 100644
index 0000000000..3a9a8c4988
--- /dev/null
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -0,0 +1,177 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/partially_decluster_pass.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/core/framework/memory_types.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+
+namespace tensorflow {
+namespace {
+Status FindNodesToDecluster(const Graph& graph, gtl::FlatSet<Node*>* result,
+                            gtl::ArraySlice<Node*> post_order) {
+  // Find nodes that have at least one user outside their cluster that expects
+  // hostmem output.  These nodes should be cloned to outside the cluster to
+  // avoid the device-host copy we'd otherwise need.
+
+  MemoryTypeVector input_mtypes, output_mtypes;
+
+  for (Node* n : post_order) {
+    absl::optional<StringPiece> from_cluster = GetXlaClusterForNode(*n);
+    if (!from_cluster) {
+      continue;
+    }
+
+    // We assume the only XLA-auto-clusterable operations with side effects are
+    // resource variable updates.  We can't execute these twice.
+    if (HasResourceInputOrOutput(*n)) {
+      continue;
+    }
+
+    DeviceType device_type("");
+    TF_RETURN_IF_ERROR(
+        DeviceToDeviceType(n->assigned_device_name(), &device_type));
+    TF_RETURN_IF_ERROR(MemoryTypesForNode(graph.op_registry(), device_type,
+                                          n->def(), &input_mtypes,
+                                          &output_mtypes));
+    for (const Edge* e : n->out_edges()) {
+      Node* dst = e->dst();
+
+      if (e->IsControlEdge()) {
+        continue;
+      }
+
+      bool edge_incurs_extra_device_to_host_copy;
+      if (output_mtypes[e->src_output()] == DEVICE_MEMORY) {
+        // If the output of the *TensorFlow* operation is in DEVICE_MEMORY then
+        // keep the node clustered -- XLA will also produce the output in device
+        // memory and we will get some benefit from clustering.
+        edge_incurs_extra_device_to_host_copy = false;
+      } else {
+        MemoryTypeVector dst_input_mtypes, dst_output_mtypes;
+        DeviceType dst_device_type("");
+        TF_RETURN_IF_ERROR(
+            DeviceToDeviceType(dst->assigned_device_name(), &dst_device_type));
+        TF_RETURN_IF_ERROR(MemoryTypesForNode(graph.op_registry(), device_type,
+                                              dst->def(), &dst_input_mtypes,
+                                              &dst_output_mtypes));
+        edge_incurs_extra_device_to_host_copy =
+            dst_input_mtypes[e->dst_input()] == HOST_MEMORY;
+      }
+
+      if (!edge_incurs_extra_device_to_host_copy) {
+        continue;
+      }
+
+      // Check if `dst` is in a different cluster, unclustered, or about to be
+      // partially declustered (here we rely on the post-order traversal order).
+      // If yes, decluster `n` to avoid the device-to-host memcpy.
+      absl::optional<StringPiece> dst_cluster =
+          result->count(dst) ? absl::nullopt : GetXlaClusterForNode(*dst);
+      if (from_cluster != dst_cluster) {
+        CHECK(result->insert(n).second);
+        break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status PartiallyDeclusterNode(Graph* graph, Node* n) {
+  StringPiece cluster_name = *GetXlaClusterForNode(*n);
+  gtl::InlinedVector<const Edge*, 6> out_edges_to_clone;
+  for (const Edge* out_edge : n->out_edges()) {
+    if (out_edge->IsControlEdge()) {
+      continue;
+    }
+
+    Node* dst = out_edge->dst();
+    absl::optional<StringPiece> dst_cluster_name = GetXlaClusterForNode(*dst);
+    if (dst_cluster_name != cluster_name) {
+      out_edges_to_clone.push_back(out_edge);
+    }
+  }
+
+  CHECK(!out_edges_to_clone.empty()) << n->DebugString();
+
+  NodeDef ndef = n->def();
+  ndef.set_name(strings::StrCat(n->name(), "/declustered"));
+  RemoveFromXlaCluster(&ndef);
+  Status s;
+  Node* cloned_node = graph->AddNode(ndef, &s);
+  cloned_node->set_assigned_device_name(n->assigned_device_name());
+  TF_RETURN_IF_ERROR(s);
+
+  for (const Edge* in_edge : n->in_edges()) {
+    graph->AddEdge(in_edge->src(), in_edge->src_output(), cloned_node,
+                   in_edge->dst_input());
+  }
+
+  for (const Edge* out_edge_to_clone : out_edges_to_clone) {
+    graph->AddEdge(cloned_node, out_edge_to_clone->src_output(),
+                   out_edge_to_clone->dst(), out_edge_to_clone->dst_input());
+    graph->RemoveEdge(out_edge_to_clone);
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status PartiallyDeclusterPass::Run(
+    const GraphOptimizationPassOptions& options) {
+  // NB!  In this pass we assume the only XLA-auto-clusterable operations that
+  // may have side effects are resource variable operations so we don't cluster
+  // those.  The pass will have to be updated if this assumption becomes
+  // invalid.
+
+  Graph* graph = options.graph->get();
+
+  // When deciding whether to decluster a particular node, we base our decision
+  // on if we've decided that some of its consumers have to be declustered too.
+  // Iterating the graph in post-order guarantees that consumers have been
+  // visited before producers.
+  std::vector<Node*> post_order;
+  GetPostOrder(*graph, &post_order, /*stable_comparator=*/NodeComparatorName(),
+               /*edge_filter=*/[](const Edge& edge) {
+                 return !edge.src()->IsNextIteration();
+               });
+
+  gtl::FlatSet<Node*> nodes_to_partially_decluster;
+  TF_RETURN_IF_ERROR(FindNodesToDecluster(
+      **options.graph, &nodes_to_partially_decluster, post_order));
+
+  if (VLOG_IS_ON(3)) {
+    for (Node* n : post_order) {
+      if (nodes_to_partially_decluster.count(n)) {
+        VLOG(3) << n->DebugString();
+      }
+    }
+  }
+
+  for (Node* n : post_order) {
+    if (nodes_to_partially_decluster.count(n)) {
+      TF_RETURN_IF_ERROR(PartiallyDeclusterNode(graph, n));
+    }
+  }
+
+  nodes_to_partially_decluster.clear();
+  TF_RETURN_IF_ERROR(FindNodesToDecluster(
+      **options.graph, &nodes_to_partially_decluster, post_order));
+  CHECK(nodes_to_partially_decluster.empty());
+
+  return Status::OK();
+}
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.h b/tensorflow/compiler/jit/partially_decluster_pass.h
new file mode 100644
index 0000000000..6949b5028e
--- /dev/null
+++ b/tensorflow/compiler/jit/partially_decluster_pass.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_
+#define TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+namespace tensorflow {
+
+// Clones nodes from within a cluster to outside the cluster if profitable.
+//
+// Today this only clones to avoid device-to-host copies, but in the future we
+// may consider other reasons to clone.  For instance, we convert this:
+//
+//         .....
+//           |
+//           v
+//      A_Clustered ====> C_Unclustered
+//           |
+//           v
+//      B_Clustered
+//
+// to:
+//
+//         .....
+//          | |
+//          | +-------------+
+//          |               |
+//          v               v
+//      A_Clustered   A_Unclustered ====> C_Unclustered
+//           |
+//           v
+//      B_Clustered
+//
+// where the ===> arrow has a hostmem source and destination and would entail a
+// device to host copy if the source and destination were not in the same XLA
+// cluster.
+class PartiallyDeclusterPass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_PARTIALLY_DECLUSTER_PASS_H_
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
new file mode 100644
index 0000000000..f61a955c22
--- /dev/null
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -0,0 +1,283 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/partially_decluster_pass.h"
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+REGISTER_OP("FakeNullary").Output("out: float");
+
+REGISTER_OP("FakeBinary")
+    .Input("host_in: float")
+    .Input("device_in: float")
+    .Output("host_out: float")
+    .Output("device_out: float");
+
+REGISTER_OP("FakeResourceVar").Output("out: resource");
+
+REGISTER_OP("FakeResourceUpdate")
+    .Input("in: resource")
+    .Output("out: resource")
+    .Output("something_else: float");
+
+class FakeBinaryOp : public OpKernel {
+ public:
+  explicit FakeBinaryOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override { CHECK(false); }
+};
+
+class FakeResourceVarUpdateOp : public OpKernel {
+ public:
+  explicit FakeResourceVarUpdateOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override { CHECK(false); }
+};
+
+REGISTER_KERNEL_BUILDER(Name("FakeBinary")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("host_in")
+                            .HostMemory("host_out"),
+                        FakeBinaryOp);
+
+REGISTER_KERNEL_BUILDER(Name("FakeResourceVarUpdate")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("something_else"),
+                        FakeResourceVarUpdateOp);
+
+Status PartiallyDecluster(std::unique_ptr<Graph>* graph) {
+  FixupSourceAndSinkEdges(graph->get());
+  // Assign all nodes to the CPU device.
+  static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
+  for (Node* n : (*graph)->nodes()) {
+    n->set_assigned_device_name(kCpuDevice);
+  }
+
+  GraphOptimizationPassOptions opt_options;
+  opt_options.graph = graph;
+  PartiallyDeclusterPass pass;
+  return pass.Run(opt_options);
+}
+
+const Node* FindNodeByName(const Graph& graph, const string& name) {
+  for (const Node* node : graph.nodes()) {
+    if (node->name() == name) {
+      return node;
+    }
+  }
+  return nullptr;
+}
+
+bool GetInputsForNode(const Graph& graph, const string& node_name,
+                      std::vector<Node*>* inputs) {
+  const Node* node = FindNodeByName(graph, node_name);
+  if (node == nullptr) {
+    return false;
+  }
+  for (const Edge* e : node->in_edges()) {
+    inputs->push_back(e->src());
+  }
+  std::sort(inputs->begin(), inputs->end(), NodeComparatorName());
+  return true;
+}
+
+TEST(PartiallyDeclusterPassTest, ClusteredAndUnclustered) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName("ClusteredProducer"));
+    ops::BinaryOp("FakeBinary", clustered_producer, input,
+                  builder.opts().WithName("UnclusteredConsumer"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", {clustered_producer, 1}, input,
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  std::vector<Node*> unclustered_consumer_inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "UnclusteredConsumer",
+                               &unclustered_consumer_inputs));
+  ASSERT_EQ(unclustered_consumer_inputs.size(), 2);
+  EXPECT_EQ(unclustered_consumer_inputs[0]->name(),
+            "ClusteredProducer/declustered");
+  EXPECT_EQ(unclustered_consumer_inputs[1]->name(), "Input");
+
+  std::vector<Node*> clustered_consumer_inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "ClusteredConsumer",
+                               &clustered_consumer_inputs));
+  ASSERT_EQ(clustered_consumer_inputs.size(), 2);
+  EXPECT_EQ(clustered_consumer_inputs[0]->name(), "ClusteredProducer");
+  EXPECT_EQ(clustered_consumer_inputs[1]->name(), "Input");
+}
+
+TEST(PartiallyDeclusterPassTest, DifferentClusters) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName("ClusteredProducer"));
+    Node* consumer_in_different_cluster =
+        ops::BinaryOp("FakeBinary", clustered_producer, input,
+                      builder.opts().WithName("ConsumerInDifferentCluster"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", input, {clustered_producer, 1},
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  std::vector<Node*> inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs));
+  ASSERT_EQ(inputs.size(), 2);
+  EXPECT_EQ(inputs[0]->name(), "ClusteredProducer/declustered");
+  EXPECT_EQ(inputs[1]->name(), "Input");
+}
+
+TEST(PartiallyDeclusterPassTest, DontDeclusterIfUserIsDeviceMem) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName("ClusteredProducer"));
+    // The first input is hostmem and the second input is devicemem.
+    Node* consumer_in_different_cluster =
+        ops::BinaryOp("FakeBinary", input, clustered_producer,
+                      builder.opts().WithName("ConsumerInDifferentCluster"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", input, {clustered_producer, 1},
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  std::vector<Node*> inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs));
+  ASSERT_EQ(inputs.size(), 2);
+  EXPECT_EQ(inputs[0]->name(), "ClusteredProducer");
+  EXPECT_EQ(inputs[1]->name(), "Input");
+}
+
+TEST(PartiallyDeclusterPassTest, DontDuplicateResourceVarOps) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* resource_var = ops::SourceOp("FakeResourceVar",
+                                       builder.opts().WithName("ResourceVar"));
+    Node* clustered_producer =
+        ops::UnaryOp("FakeResourceUpdate", resource_var,
+                     builder.opts().WithName("ClusteredProducer"));
+    Node* consumer_in_different_cluster =
+        ops::BinaryOp("FakeBinary", {clustered_producer, 1}, input,
+                      builder.opts().WithName("ConsumerInDifferentCluster"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", input, {clustered_producer, 1},
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    consumer_in_different_cluster->AddAttr(kXlaClusterAttr, "cluster_1");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  std::vector<Node*> inputs;
+  ASSERT_TRUE(GetInputsForNode(*graph, "ConsumerInDifferentCluster", &inputs));
+  ASSERT_EQ(inputs.size(), 2);
+  EXPECT_EQ(inputs[0]->name(), "ClusteredProducer");
+  EXPECT_EQ(inputs[1]->name(), "Input");
+}
+
+TEST(PartiallyDeclusterPassTest, DeclusterDependentNodes) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer_0 =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName("ClusteredProducer0"));
+    Node* clustered_producer_1 =
+        ops::BinaryOp("FakeBinary", clustered_producer_0, input,
+                      builder.opts().WithName("ClusteredProducer1"));
+    ops::BinaryOp("FakeBinary", clustered_producer_1, input,
+                  builder.opts().WithName("UnclusteredConsumer"));
+    Node* clustered_consumer =
+        ops::BinaryOp("FakeBinary", {clustered_producer_1, 1}, input,
+                      builder.opts().WithName("ClusteredConsumer"));
+    clustered_producer_0->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_producer_1->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_consumer->AddAttr(kXlaClusterAttr, "cluster_0");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  std::vector<Node*> unclustered_consumer_inputs, declustered_producer_1_inputs;
+
+  ASSERT_TRUE(GetInputsForNode(*graph, "UnclusteredConsumer",
+                               &unclustered_consumer_inputs));
+  ASSERT_EQ(unclustered_consumer_inputs.size(), 2);
+  EXPECT_EQ(unclustered_consumer_inputs[0]->name(),
+            "ClusteredProducer1/declustered");
+  EXPECT_EQ(unclustered_consumer_inputs[1]->name(), "Input");
+
+  ASSERT_TRUE(GetInputsForNode(*graph, "ClusteredProducer1/declustered",
+                               &declustered_producer_1_inputs));
+  ASSERT_EQ(declustered_producer_1_inputs.size(), 2);
+  EXPECT_EQ(declustered_producer_1_inputs[0]->name(),
+            "ClusteredProducer0/declustered");
+  EXPECT_EQ(declustered_producer_1_inputs[1]->name(), "Input");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
new file mode 100644
index 0000000000..1ba4a5ef73
--- /dev/null
+++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
@@ -0,0 +1,336 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// ALGORITHM OVERVIEW
+// ==================
+//
+// An XLA cluster hoists all resource reads to be beginning of the cluster
+// execution and all the resource writes to the end.  This means it cannot
+// enforce arbitrary ordering dependencies (via control or data edges) between
+// resource operations.  Since all resource reads happen before all resource
+// writes, edges constraining resource reads to happen before resource writes
+// are fine, but all other kinds of edges are problematic.  This analysis
+// computes the set of pairs of resource operations that cannot be put in the
+// same cluster because XLA cannot respect the dependencies between them in the
+// TensorFlow program.
+//
+// TODO(b/112856632): We can, in theory, support Read->Read and Write->Write
+// dependencies.
+//
+// Specifically the result computed by this analysis contains the edge {W, R}
+// iff all of these hold true:
+//
+//   - In the graph (g - {edges from NextIteration to Merge}) there is a path
+//     from W to R.
+//   - IsEdgeSafe(W, R) == False [defined below]
+//   - W != R (note: some resource operations both read from and write to
+//     resource variables).
+//
+// The result is incorrect around loops because we ignore edges from
+// NextIteration to Merge, but that should be fine because we don't cluster
+// these edges.  For instance, in:
+//
+// Init -----> Merge <-------+
+//               |           |
+//               v           |
+//             Read          |
+//               |           |
+//               v           |
+//             Write         |
+//               |           |
+//               v           |
+//           NextIteration --+
+//
+// we won't put (Read, Write) in the returned set.  This is fine if
+// auto-clustering can only cluster the Read->Write edge, but it is a problem if
+// it clusters the Write->NextIteration->Merge->Read edges instead.  The same
+// problem is present for the functional version of the loop above.  We rely on
+// auto-clustering to not cluster control flow edges like NextIteration->Merge.
+// This is enough to avoid the explicit-control-flow problem shown above.  One
+// way to think about this is that we only care about cases where two nodes, A
+// and B, would normally have been put in the same cluster but cannot legally be
+// in the same cluster because of resourcevar-dependencies.  If A and B would
+// normally have been put in the same cluster then all paths between A and B
+// would have to be clusterable (otherwise we'd have introduced a cycle).  Ergo
+// there could not have been a NextIteration->Merge edge between A and B since
+// we don't cluster these edges.
+//
+// We also rely on auto-clustering to not cluster functional control flow nodes
+// that contain resource operations.
+//
+// IMPLEMENTATION
+// --------------
+//
+// We traverse the graph minus backedges in reverse post order, mapping each
+// node to the set of resource operation reaching that node.  Since we visit
+// producers before consumers, we can construct the set of reaching operations
+// by taking the union of the operations reaching the input nodes.  These
+// "reaching resource operations" can then be used to create the pairs of
+// incompatible nodes using `IsEdgeSafe`.
+
+#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
+
+#include "absl/memory/memory.h"
+#include "absl/strings/str_join.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/tf2xla/resource_operation_table.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+// Returns true if `n` may call a function.
+Status MayCallFunction(const Node& n, const FunctionLibraryDefinition* flib_def,
+                       bool* out_result) {
+  if (flib_def->Contains(n.type_string())) {
+    *out_result = true;
+  } else {
+    *out_result =
+        std::any_of(n.def().attr().begin(), n.def().attr().end(),
+                    [](const std::pair<string, AttrValue>& name_attr_pair) {
+                      return name_attr_pair.second.has_func();
+                    });
+  }
+
+  return Status::OK();
+}
+
+// Maps `n` to the XlaResourceOpKind corresponding to its operation.  If `n` is
+// not a resource operation recognized by XLA then sets `out_resource_op_kind`
+// to nullopt.
+Status XlaResourceOpKindForNode(
+    const Node& n, const FunctionLibraryDefinition* flib_def,
+    const std::function<Status(const Node&, bool*)>& resource_ops_to_ignore,
+    absl::optional<XlaResourceOpKind>* out_resource_op_kind) {
+  bool should_ignore = false;
+  if (resource_ops_to_ignore) {
+    TF_RETURN_IF_ERROR(resource_ops_to_ignore(n, &should_ignore));
+  }
+  if (should_ignore) {
+    *out_resource_op_kind = absl::nullopt;
+    return Status::OK();
+  }
+
+  const XlaResourceOpInfo* op_info = GetResourceOpInfoForOp(n.type_string());
+  if (op_info) {
+    *out_resource_op_kind = op_info->kind();
+    return Status::OK();
+  }
+
+  // We conservatively assume that functions will both read and write resource
+  // variables.  In the future we may consider doing some form of
+  // inter-procedural analysis.
+  bool may_call_function;
+  TF_RETURN_IF_ERROR(MayCallFunction(n, flib_def, &may_call_function));
+  if (may_call_function) {
+    *out_resource_op_kind = XlaResourceOpKind::kReadWrite;
+  } else {
+    *out_resource_op_kind = absl::nullopt;
+  }
+
+  return Status::OK();
+}
+
+// Returns true if a control or data dependence from a TensorFlow operation of
+// resource op kind `from` to a TensorFlow operation of resource op kind `to`
+// can be represented by an XLA cluster and needs no special handling around
+// auto-jit.
+bool IsEdgeSafe(XlaResourceOpKind from, XlaResourceOpKind to) {
+  // XLA clusters forces all reads to happen before all writes, which means the
+  // kinds of edges it can faithfully represent are: Read->Write, Read->Modify,
+  // Modify->Write, Read->Read, Write->Write.
+  //
+  // TODO(b/112856632): We can, in theory, support Read->Read and Write->Write
+  // dependencies.
+  return from == XlaResourceOpKind::kRead && to == XlaResourceOpKind::kWrite;
+}
+
+using ResourceOp = std::pair<int, XlaResourceOpKind>;
+
+string ResourceOpToString(const ResourceOp& resource_op) {
+  return strings::StrCat(
+      resource_op.first, ": ",
+      XlaResourceOpInfo::XlaResourceOpKindToString(resource_op.second));
+}
+
+// A copy-on-write set used to store the set of ResourceOps reaching a node in a
+// TensorFlow graph.
+//
+// TODO(sanjoy): It may be useful to pull this out into its own header at some
+// point.
+class ResourceOpSet {
+ private:
+  using Impl = gtl::FlatSet<ResourceOp>;
+
+ public:
+  ResourceOpSet() = default;
+
+  // Adds all ResourceOp s in `other` to this set.
+  void Add(const ResourceOpSet& other) {
+    CHECK(!frozen_);
+    if (other.impl_ == impl_) {
+      other.frozen_ = true;
+      return;
+    }
+
+    if (!impl_) {
+      other.frozen_ = true;
+      impl_ = other.impl_;
+      return;
+    }
+
+    for (ResourceOp resource_op : other) {
+      Add(resource_op);
+    }
+  }
+
+  void Add(const ResourceOp& resource_op) {
+    CHECK(!frozen_);
+    if (!IsCopy() && Contains(resource_op)) {
+      // We can avoid the copy if the item we want to insert already exists.
+      return;
+    }
+
+    EnsureIsCopied();
+    impl_->insert(resource_op);
+  }
+
+  Impl::const_iterator begin() const {
+    return impl_ ? impl_->begin() : GetEmptyImpl()->begin();
+  }
+
+  Impl::const_iterator end() const {
+    return impl_ ? impl_->end() : GetEmptyImpl()->end();
+  }
+
+  bool Contains(const ResourceOp& resource_op) const {
+    return impl_ != nullptr && impl_->count(resource_op);
+  }
+
+ private:
+  bool IsCopy() const { return storage_ != nullptr; }
+
+  void EnsureIsCopied() {
+    if (storage_ == nullptr) {
+      storage_ = absl::make_unique<Impl>();
+      for (ResourceOp op : *this) {
+        storage_->insert(op);
+      }
+      impl_ = storage_.get();
+    }
+  }
+
+  static Impl* GetEmptyImpl() {
+    static Impl* empty_impl = new Impl;
+    return empty_impl;
+  }
+
+  Impl* impl_ = nullptr;
+  std::unique_ptr<Impl> storage_;
+
+  // frozen_ is true if there is another set pointing to this set's impl_.  We
+  // can no longer add elements to this set in that case since the sets pointing
+  // to this set expect the contents of this set to be stable.
+  mutable bool frozen_ = false;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ResourceOpSet);
+};
+
+string ResourceOpSetToString(const ResourceOpSet& resource_op_set) {
+  std::vector<string> elements_debug_string;
+  std::transform(resource_op_set.begin(), resource_op_set.end(),
+                 std::back_inserter(elements_debug_string), ResourceOpToString);
+  return strings::StrCat("{", absl::StrJoin(elements_debug_string, ","), "}");
+}
+
+string NodeToString(const Node& n, XlaResourceOpKind resource_op_kind) {
+  return strings::StrCat(
+      "[", n.name(), ": ", n.type_string(), "(",
+      XlaResourceOpInfo::XlaResourceOpKindToString(resource_op_kind), ")", "]");
+}
+}  // namespace
+
+Status ComputeIncompatibleResourceOperationPairs(
+    const Graph& g, const FunctionLibraryDefinition* flib_def,
+    const std::function<Status(const Node&, bool*)>& resource_ops_to_ignore,
+    std::vector<std::pair<int, int>>* result) {
+  CHECK(result->empty());
+
+  std::vector<Node*> rpo;
+  GetReversePostOrder(g, &rpo, /*stable_comparator=*/NodeComparatorName(),
+                      /*edge_filter=*/[](const Edge& edge) {
+                        return !edge.src()->IsNextIteration();
+                      });
+
+  auto resource_op_set_for_node =
+      absl::make_unique<ResourceOpSet[]>(g.num_node_ids());
+
+  const bool vlog = VLOG_IS_ON(2);
+
+  for (Node* n : rpo) {
+    absl::optional<XlaResourceOpKind> op_kind;
+    TF_RETURN_IF_ERROR(XlaResourceOpKindForNode(
+        *n, flib_def, resource_ops_to_ignore, &op_kind));
+
+    ResourceOpSet* resource_op_set = &resource_op_set_for_node[n->id()];
+
+    // Merge the reaching resource operations for all the incoming edges to
+    // create the set of all possible resource ops reaching `n`.
+    for (const Edge* e : n->in_edges()) {
+      if (n->IsMerge() && e->src()->IsNextIteration()) {
+        // Ignore back-edges (see file comment).
+        continue;
+      }
+
+      const ResourceOpSet& incoming_op_set =
+          resource_op_set_for_node[e->src()->id()];
+      resource_op_set->Add(incoming_op_set);
+    }
+
+    // Add to the "incompatible resource ops" set if necessary.
+    if (op_kind) {
+      for (ResourceOp incoming_op : *resource_op_set) {
+        if (IsEdgeSafe(incoming_op.second, *op_kind)) {
+          continue;
+        }
+
+        if (vlog) {
+          VLOG(2) << "Unsafe edge: "
+                  << NodeToString(*g.FindNodeId(incoming_op.first),
+                                  incoming_op.second)
+                  << " -> " << NodeToString(*n, *op_kind);
+        }
+        result->push_back({incoming_op.first, n->id()});
+      }
+
+      resource_op_set->Add({n->id(), *op_kind});
+    }
+
+    if (vlog) {
+      VLOG(3) << n->name() << " -> " << ResourceOpSetToString(*resource_op_set);
+    }
+  }
+
+  std::sort(result->begin(), result->end());
+  CHECK(std::unique(result->begin(), result->end()) == result->end());
+
+  return Status::OK();
+}
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.h b/tensorflow/compiler/jit/resource_operation_safety_analysis.h
new file mode 100644
index 0000000000..ae8cfeecad
--- /dev/null
+++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_
+#define TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_
+
+#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+// An XLA cluster hoists all resource reads to be beginning of the cluster
+// execution and all the resource writes to the end.  This means it cannot
+// enforce arbitrary ordering dependencies (via control or data edges) between
+// resource operations.  Since all resource reads happen before all resource
+// writes, edges constraining resource reads to happen before resource writes
+// are fine, but all other kinds of edges are problematic.  This analysis
+// returns the set of pairs of resource operations that cannot be put in the
+// same cluster because XLA cannot respect the dependencies between them in the
+// TensorFlow program.
+//
+// The restrictions are not transitive: it is fine to put A and C in the same
+// cluster even if the returned set contains (A,B) and (B,C).
+//
+// In other words, if these pairs are seen as edges in an undirected graph of
+// the nodes in `g` then auto-clustering is at least as constrained as the graph
+// coloring problem on this graph.
+//
+//
+// For instance if we auto-cluster all operations in this TensorFlow graph:
+//
+//         ReadVariablepOp0  ->  ReadVariableOp1
+//                                      |
+//                                      v
+//                              AssignVariableOp0  ->  AssignVariableOp1
+//
+// we will lose the ReadVariablepOp0 -> ReadVariableOp1 and the
+// AssignVariableOp0 -> AssignVariableOp1 dependencies.  I.e. it is possible for
+// XlaLaunchOp to issue ReadVariableOp1 before ReadVariablepOp0 since it reads
+// all the resource variables when the cluster starts executing without any
+// particular ordering between them; same holds for the AssignVariableOp0 ->
+// AssignVariableOp1 edge.  The ReadVariableOp1 -> AssignVariableOp0 edge will
+// be respected by XlaLaunchOp though because all reads happen before all
+// writes.
+//
+//
+// NB!  The result computed by this analysis assumes that we don't auto-cluster
+// back-edges (i.e. the edges from NextIteration to Merge).
+//
+// NB!  The result computed by this analysis assumes that we don't auto-cluster
+// functional control flow nodes containing resource operations.
+//
+// If `resource_ops_to_ignore` is set then nodes for which it returns true are
+// ignored (we pretend these nodes are not resource operations).
+Status ComputeIncompatibleResourceOperationPairs(
+    const Graph& g, const FunctionLibraryDefinition* flib_def,
+    const std::function<Status(const Node&, bool*)>& resource_ops_to_ignore,
+    std::vector<std::pair<int, int>>* result);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_RESOURCE_OPERATION_SAFETY_ANALYSIS_H_
diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc
new file mode 100644
index 0000000000..e54b547abc
--- /dev/null
+++ b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc
@@ -0,0 +1,540 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/resource_variable_ops.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+Node* MakeRead(const Scope& scope, const string& id) {
+  Output var_handle =
+      ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
+  Output read =
+      ops::ReadVariableOp(scope.WithOpName("Read" + id), var_handle, DT_FLOAT);
+  return read.node();
+}
+
+Node* MakeWrite(const Scope& scope, const string& id) {
+  Output var_handle =
+      ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
+  Output value_to_write =
+      ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f);
+  ops::AssignVariableOp assign_op(scope.WithOpName("Assignee" + id), var_handle,
+                                  value_to_write);
+  return assign_op.operation.node();
+}
+
+Node* MakeModify(const Scope& scope, const string& id) {
+  Output var_handle =
+      ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
+  Output value_to_write = ops::Const(scope.WithOpName("Increment" + id), 1.0f);
+  ops::AssignAddVariableOp assign_add_op(scope.WithOpName("Increment" + id),
+                                         var_handle, value_to_write);
+  return assign_add_op.operation.node();
+}
+
+Node* MakeNeutral(const Scope& scope, const string& id) {
+  return ops::Const(scope.WithOpName("Const" + id), 42.0f).node();
+}
+
+Status ComputeIncompatiblePairs(Graph* g,
+                                std::vector<std::pair<int, int>>* result) {
+  FixupSourceAndSinkEdges(g);
+  return ComputeIncompatibleResourceOperationPairs(*g, &g->flib_def(), {},
+                                                   result);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, WriteRead) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(write, read);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> write_read_pair = {write->id(), read->id()};
+  EXPECT_EQ(incompatible_pairs[0], write_read_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, ReadWrite) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(read, write);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  EXPECT_EQ(incompatible_pairs.size(), 0);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, ReadWriteNoEdges) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  MakeRead(root, "R");
+  MakeWrite(root, "W");
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  EXPECT_EQ(incompatible_pairs.size(), 0);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, ReadModify) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* modify = MakeModify(root, "M");
+
+  root.graph()->AddControlEdge(read, modify);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  EXPECT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> read_modify_pair = {read->id(), modify->id()};
+  EXPECT_EQ(incompatible_pairs[0], read_modify_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, ModifyRead) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* modify = MakeModify(root, "M");
+
+  root.graph()->AddControlEdge(modify, read);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> modify_read_pair = {modify->id(), read->id()};
+  EXPECT_EQ(incompatible_pairs[0], modify_read_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, ModifyWrite) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* modify = MakeModify(root, "M");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(modify, write);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  EXPECT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> modify_write_pair = {modify->id(), write->id()};
+  EXPECT_EQ(incompatible_pairs[0], modify_write_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, WriteModify) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* modify = MakeModify(root, "M");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(write, modify);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> write_modify_pair = {write->id(), modify->id()};
+  EXPECT_EQ(incompatible_pairs[0], write_modify_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, ReadModifyWrite) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* modify = MakeModify(root, "M");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(read, modify);
+  root.graph()->AddControlEdge(modify, write);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  EXPECT_EQ(incompatible_pairs.size(), 2);
+  std::pair<int, int> modify_write_pair = {modify->id(), write->id()};
+  std::pair<int, int> read_modify_pair = {read->id(), modify->id()};
+  EXPECT_EQ(incompatible_pairs[0], read_modify_pair);
+  EXPECT_EQ(incompatible_pairs[1], modify_write_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, WriteModifyRead) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* modify = MakeModify(root, "M");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(write, modify);
+  root.graph()->AddControlEdge(modify, read);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 3);
+
+  std::pair<int, int> write_modify_pair = {write->id(), modify->id()};
+  std::pair<int, int> modify_read_pair = {modify->id(), read->id()};
+  std::pair<int, int> write_read_pair = {write->id(), read->id()};
+  EXPECT_EQ(incompatible_pairs[0], modify_read_pair);
+  EXPECT_EQ(incompatible_pairs[1], write_read_pair);
+  EXPECT_EQ(incompatible_pairs[2], write_modify_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, WriteReadModify) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* read = MakeRead(root, "R");
+  Node* modify = MakeModify(root, "M");
+  Node* write = MakeWrite(root, "W");
+
+  root.graph()->AddControlEdge(write, read);
+  root.graph()->AddControlEdge(read, modify);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 3);
+
+  std::pair<int, int> write_modify_pair = {write->id(), modify->id()};
+  std::pair<int, int> write_read_pair = {write->id(), read->id()};
+  std::pair<int, int> read_modify_pair = {read->id(), modify->id()};
+  EXPECT_EQ(incompatible_pairs[0], read_modify_pair);
+  EXPECT_EQ(incompatible_pairs[1], write_read_pair);
+  EXPECT_EQ(incompatible_pairs[2], write_modify_pair);
+}
+
+FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) {
+  FunctionDefLibrary flib_def;
+  FunctionDef func = FunctionDefHelper::Create(
+      /*function_name=*/name, /*in_def=*/{}, /*out_def=*/{"out: float"},
+      /*attr_def*/
+      {}, /*node_def=*/{FunctionDefHelper::Const("one", 1.0f)},
+      /*ret_def=*/{{"out", "out:output:0"}});
+  *flib_def.add_function() = std::move(func);
+  return flib_def;
+}
+
+Node* MakeCall(Graph* graph, const string& callee_name, const string& node_name,
+               Status* status) {
+  NodeDef call_node;
+  call_node.set_name(node_name);
+  call_node.set_op(callee_name);
+  return graph->AddNode(call_node, status);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, CallRead) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("Const_func");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
+  Node* read = MakeRead(root, "R");
+  Status status;
+  Node* call = MakeCall(root.graph(), "Const_func", "C", &status);
+  TF_ASSERT_OK(status);
+
+  root.graph()->AddControlEdge(call, read);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> call_read_edge = {call->id(), read->id()};
+  EXPECT_EQ(incompatible_pairs[0], call_read_edge);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, ReadCall) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("Const_func");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
+  Node* read = MakeRead(root, "R");
+  Status status;
+  Node* call = MakeCall(root.graph(), "Const_func", "C", &status);
+  TF_ASSERT_OK(status);
+
+  root.graph()->AddControlEdge(read, call);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> read_call_edge = {read->id(), call->id()};
+  EXPECT_EQ(incompatible_pairs[0], read_call_edge);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, CallWrite) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("Const_func");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
+  Node* write = MakeWrite(root, "W");
+  Status status;
+  Node* call = MakeCall(root.graph(), "Const_func", "C", &status);
+  TF_ASSERT_OK(status);
+
+  root.graph()->AddControlEdge(call, write);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> call_write_edge = {call->id(), write->id()};
+  EXPECT_EQ(incompatible_pairs[0], call_write_edge);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, WriteCall) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("Const_func");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
+  Node* write = MakeWrite(root, "W");
+  Status status;
+  Node* call = MakeCall(root.graph(), "Const_func", "C", &status);
+  TF_ASSERT_OK(status);
+
+  root.graph()->AddControlEdge(write, call);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> write_call_edge = {write->id(), call->id()};
+  EXPECT_EQ(incompatible_pairs[0], write_call_edge);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, SymbolicGradientRead) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("Const_func");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
+  Node* read = MakeRead(root, "R");
+  NameAttrList fn;
+  fn.set_name("Const_func");
+  Node* symbolic_gradient =
+      ops::SymbolicGradient(root, /*input=*/{ops::Const(root, 1.0f)},
+                            /*Tout=*/{DT_FLOAT}, fn)
+          .output[0]
+          .node();
+
+  root.graph()->AddControlEdge(symbolic_gradient, read);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> symbolic_gradient_read_edge = {symbolic_gradient->id(),
+                                                     read->id()};
+  EXPECT_EQ(incompatible_pairs[0], symbolic_gradient_read_edge);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, WriteSymbolicGradient) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("Const_func");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
+  Node* write = MakeWrite(root, "W");
+  NameAttrList fn;
+  fn.set_name("Const_func");
+  Node* symbolic_gradient =
+      ops::SymbolicGradient(root, /*input=*/{ops::Const(root, 1.0f)},
+                            /*Tout=*/{DT_FLOAT}, fn)
+          .output[0]
+          .node();
+
+  root.graph()->AddControlEdge(write, symbolic_gradient);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+  std::pair<int, int> write_symbolic_gradient_edge = {write->id(),
+                                                      symbolic_gradient->id()};
+  EXPECT_EQ(incompatible_pairs[0], write_symbolic_gradient_edge);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, ChainOfOps) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* write_0 = MakeWrite(root, "W0");
+  Node* neutral_0 = MakeNeutral(root, "N0");
+  Node* read_0 = MakeRead(root, "R0");
+  Node* write_1 = MakeWrite(root, "W1");
+  Node* neutral_1 = MakeNeutral(root, "N1");
+  Node* read_1 = MakeRead(root, "R1");
+
+  root.graph()->AddControlEdge(write_0, neutral_0);
+  root.graph()->AddControlEdge(neutral_0, read_0);
+  root.graph()->AddControlEdge(read_0, write_1);
+  root.graph()->AddControlEdge(write_1, neutral_1);
+  root.graph()->AddControlEdge(neutral_1, read_1);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 5);
+  std::pair<int, int> write_0_read_0_pair = {write_0->id(), read_0->id()};
+  std::pair<int, int> write_0_read_1_pair = {write_0->id(), read_1->id()};
+  std::pair<int, int> write_1_read_1_pair = {write_1->id(), read_1->id()};
+  std::pair<int, int> write_0_write_1_pair = {write_0->id(), write_1->id()};
+  std::pair<int, int> read_0_read_1_pair = {read_0->id(), read_1->id()};
+
+  EXPECT_EQ(incompatible_pairs[0], write_0_read_0_pair);
+  EXPECT_EQ(incompatible_pairs[1], write_0_write_1_pair);
+  EXPECT_EQ(incompatible_pairs[2], write_0_read_1_pair);
+  EXPECT_EQ(incompatible_pairs[3], read_0_read_1_pair);
+  EXPECT_EQ(incompatible_pairs[4], write_1_read_1_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, DagOfOps) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* write_0 = MakeWrite(root, "W0");
+  Node* write_1 = MakeWrite(root, "W1");
+  Node* neutral = MakeNeutral(root, "N");
+  Node* read_0 = MakeRead(root, "R0");
+  Node* read_1 = MakeRead(root, "R1");
+
+  root.graph()->AddControlEdge(write_0, neutral);
+  root.graph()->AddControlEdge(write_1, neutral);
+  root.graph()->AddControlEdge(neutral, read_0);
+  root.graph()->AddControlEdge(neutral, read_1);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 4);
+  std::pair<int, int> write_0_read_0_pair = {write_0->id(), read_0->id()};
+  std::pair<int, int> write_0_read_1_pair = {write_0->id(), read_1->id()};
+  std::pair<int, int> write_1_read_0_pair = {write_1->id(), read_0->id()};
+  std::pair<int, int> write_1_read_1_pair = {write_1->id(), read_1->id()};
+
+  EXPECT_EQ(incompatible_pairs[0], write_0_read_0_pair);
+  EXPECT_EQ(incompatible_pairs[1], write_0_read_1_pair);
+  EXPECT_EQ(incompatible_pairs[2], write_1_read_0_pair);
+  EXPECT_EQ(incompatible_pairs[3], write_1_read_1_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, DagOfOpsWithRepeatedPaths) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Node* write_0 = MakeWrite(root, "W0");
+  Node* write_1 = MakeWrite(root, "W1");
+  Node* neutral = MakeNeutral(root, "N");
+  Node* read_0 = MakeRead(root, "R0");
+  Node* read_1 = MakeRead(root, "R1");
+
+  root.graph()->AddControlEdge(write_0, neutral);
+  root.graph()->AddControlEdge(write_1, neutral);
+  root.graph()->AddControlEdge(neutral, read_0);
+  root.graph()->AddControlEdge(neutral, read_1);
+  root.graph()->AddControlEdge(write_1, read_1);
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 4);
+  std::pair<int, int> write_0_read_0_pair = {write_0->id(), read_0->id()};
+  std::pair<int, int> write_0_read_1_pair = {write_0->id(), read_1->id()};
+  std::pair<int, int> write_1_read_0_pair = {write_1->id(), read_0->id()};
+  std::pair<int, int> write_1_read_1_pair = {write_1->id(), read_1->id()};
+
+  EXPECT_EQ(incompatible_pairs[0], write_0_read_0_pair);
+  EXPECT_EQ(incompatible_pairs[1], write_0_read_1_pair);
+  EXPECT_EQ(incompatible_pairs[2], write_1_read_0_pair);
+  EXPECT_EQ(incompatible_pairs[3], write_1_read_1_pair);
+}
+
+TEST(ResourceOperationSafetyAnalysisTest, Loop) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output init_value = ops::Placeholder(root.WithOpName("init"), DT_FLOAT);
+  Output loop_cond = ops::Placeholder(root.WithOpName("init"), DT_BOOL);
+  Output enter_value =
+      ops::internal::Enter(root.WithOpName("enter"), init_value, "fr");
+  ops::Merge iv(root.WithOpName("iv"), {enter_value, enter_value});
+  ops::Switch latch(root.WithOpName("latch"), iv.output, loop_cond);
+  ops::internal::Exit exit(root.WithOpName("exit"), iv.output);
+  Output next_iteration =
+      ops::NextIteration(root.WithOpName("next_iteration"), latch.output_true);
+  TF_ASSERT_OK(
+      root.graph()->UpdateEdge(next_iteration.node(), 0, iv.output.node(), 1));
+
+  Node* write = MakeWrite(root, "W");
+  Node* read = MakeRead(root, "R");
+
+  root.graph()->AddControlEdge(iv.output.node(), write);
+  root.graph()->AddControlEdge(write, read);
+  root.graph()->AddControlEdge(read, next_iteration.node());
+
+  std::vector<std::pair<int, int>> incompatible_pairs;
+  TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
+
+  ASSERT_EQ(incompatible_pairs.size(), 1);
+
+  std::pair<int, int> write_read_pair = {write->id(), read->id()};
+  EXPECT_EQ(incompatible_pairs[0], write_read_pair);
+}
+
+bool IsResourceArgDef(const OpDef::ArgDef& arg_def) {
+  return arg_def.type() == DT_RESOURCE;
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index a5628b12a2..4f2fabd658 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/kernels/bounds_check.h"
@@ -185,4 +186,49 @@ Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles) {
   return Status::OK();
 }
 
+absl::optional<StringPiece> GetXlaClusterForNode(const Node& node) {
+  const AttrValue* attr_value = node.attrs().Find(kXlaClusterAttr);
+  if (attr_value == nullptr) {
+    return absl::nullopt;
+  }
+  Status s = AttrValueHasType(*attr_value, "string");
+  if (!s.ok()) {
+    return absl::nullopt;
+  }
+  return attr_value->s();
+}
+
+bool HasResourceInputOrOutput(const Node& node) {
+  return std::find(node.input_types().begin(), node.input_types().end(),
+                   DT_RESOURCE) != node.input_types().end() ||
+         std::find(node.output_types().begin(), node.output_types().end(),
+                   DT_RESOURCE) != node.output_types().end();
+}
+
+void RemoveFromXlaCluster(NodeDef* node_def) {
+  node_def->mutable_attr()->erase(kXlaClusterAttr);
+}
+
+Status AdjustCycleDetectionGraphForResourceOps(
+    const Graph* graph, const FunctionLibraryDefinition* flib_def,
+    const std::function<Status(const Node&, bool*)>& resource_ops_to_ignore,
+    GraphCycles* cycles) {
+  std::vector<std::pair<int, int>> unsafe_deps;
+  TF_RETURN_IF_ERROR(ComputeIncompatibleResourceOperationPairs(
+      *graph, flib_def, resource_ops_to_ignore, &unsafe_deps));
+
+  // An edge {P,Q} in `unsafe_deps` denotes that P and Q, both of which are
+  // operations that interact with resource variables, must not be put in the
+  // same cluster.  We enforce this constraint by creating a phantom node, X,
+  // and adding edges P->X and X->Q.  MarkForCompilation then cannot cluster P
+  // and Q together since that would create a cycle with X.
+
+  for (std::pair<int, int> unsafe_dep : unsafe_deps) {
+    int phantom_node_id = cycles->NewNode();
+    CHECK(cycles->InsertEdge(unsafe_dep.first, phantom_node_id));
+    CHECK(cycles->InsertEdge(phantom_node_id, unsafe_dep.second));
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h
index bcce082aaf..b0439a63ca 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.h
+++ b/tensorflow/compiler/jit/xla_cluster_util.h
@@ -18,6 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/core/graph/algorithm.h"
 
@@ -44,6 +45,23 @@ bool HasForwardedRefInput(const Node& node);
 // the enclosing graph.
 Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles);
 
+// Returns the XLA cluster in which `node` is placed if it is in an XLA cluster,
+// otherwise returns nullopt.
+absl::optional<StringPiece> GetXlaClusterForNode(const Node& node);
+
+// Removes `node_def` its XLA cluster (by clearing its _XlaCluster attribute).
+void RemoveFromXlaCluster(NodeDef* node_def);
+
+// Returns true if `node` has a DT_RESOURCE typed input or output.
+bool HasResourceInputOrOutput(const Node& node);
+
+// Adds edges to `cycles` to prevent clustering resource operations that cannot
+// be legally clustered.
+Status AdjustCycleDetectionGraphForResourceOps(
+    const Graph* graph, const FunctionLibraryDefinition* flib_def,
+    const std::function<Status(const Node&, bool*)>& resource_ops_to_ignore,
+    GraphCycles* cycles);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
diff --git a/tensorflow/compiler/jit/xla_cluster_util_test.cc b/tensorflow/compiler/jit/xla_cluster_util_test.cc
index 2cb351e1ec..65bbf3efe8 100644
--- a/tensorflow/compiler/jit/xla_cluster_util_test.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 7140d47a94..ef6b0e67d3 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -230,7 +230,7 @@ Status XlaCompilationCache::Compile(
     const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
     const XlaCompiler::CompilationResult** compilation_result,
     xla::LocalExecutable** executable,
-    const XlaCompiler::CompileOptions* compile_options) {
+    const XlaCompiler::CompileOptions& compile_options) {
   return CompileImpl(options, function, constant_args, variable_args, ctx,
                      compilation_result, executable, compile_options, false);
 }
@@ -241,7 +241,7 @@ Status XlaCompilationCache::CompileSingleOp(
     const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
     const XlaCompiler::CompilationResult** compilation_result,
     xla::LocalExecutable** executable,
-    const XlaCompiler::CompileOptions* compile_options) {
+    const XlaCompiler::CompileOptions& compile_options) {
   const NodeDef& def = ctx->op_kernel().def();
   NameAttrList name;
   name.set_name(def.op());
@@ -256,7 +256,7 @@ Status XlaCompilationCache::CompileImpl(
     const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
     const XlaCompiler::CompilationResult** compilation_result,
     xla::LocalExecutable** executable,
-    const XlaCompiler::CompileOptions* compile_options,
+    const XlaCompiler::CompileOptions& compile_options,
     bool compile_single_op) {
   CHECK_NE(executable, nullptr);
   VLOG(1) << "XlaCompilationCache::Compile " << DebugString();
@@ -324,13 +324,12 @@ Status XlaCompilationCache::CompileImpl(
     entry->compiled = true;
 
     if (compile_single_op) {
-      entry->compilation_status = compiler.CompileSingleOp(
-          compile_options ? *compile_options : XlaCompiler::CompileOptions(),
-          signature.name, ctx, args, &entry->compilation_result);
+      entry->compilation_status =
+          compiler.CompileSingleOp(compile_options, signature.name, ctx, args,
+                                   &entry->compilation_result);
     } else {
       entry->compilation_status = compiler.CompileFunction(
-          compile_options ? *compile_options : XlaCompiler::CompileOptions(),
-          function, args, &entry->compilation_result);
+          compile_options, function, args, &entry->compilation_result);
     }
     TF_RETURN_IF_ERROR(entry->compilation_status);
     CHECK_EQ(entry->executable.get(), nullptr);
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index fc5f008f4f..10ad87e38c 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -70,7 +70,7 @@ class XlaCompilationCache : public ResourceBase {
                  OpKernelContext* ctx,
                  const XlaCompiler::CompilationResult** compilation_result,
                  xla::LocalExecutable** executable,
-                 const XlaCompiler::CompileOptions* compile_options);
+                 const XlaCompiler::CompileOptions& compile_options);
 
   // As above, but calls XlaCompiler::CompileSingleOp instead of
   // XlaCompiler::CompileFunction.
@@ -80,7 +80,7 @@ class XlaCompilationCache : public ResourceBase {
       const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
       const XlaCompiler::CompilationResult** compilation_result,
       xla::LocalExecutable** executable,
-      const XlaCompiler::CompileOptions* compile_options);
+      const XlaCompiler::CompileOptions& compile_options);
 
   xla::LocalClient* client() const { return client_; }
   const DeviceType& device_type() const { return device_type_; }
@@ -96,7 +96,7 @@ class XlaCompilationCache : public ResourceBase {
                      OpKernelContext* ctx,
                      const XlaCompiler::CompilationResult** compilation_result,
                      xla::LocalExecutable** executable,
-                     const XlaCompiler::CompileOptions* compile_options,
+                     const XlaCompiler::CompileOptions& compile_options,
                      bool compile_single_op);
 
   // Takes `result` which has been compiled from a Tensorflow subgraph to a
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index dd84fb34c1..3ba48e8c31 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -177,7 +177,7 @@ Status XlaCompileOnDemandOp::Compile(
 
   std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
   return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx,
-                                result, executable, &compile_options);
+                                result, executable, compile_options);
 }
 
 void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) {
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 2a2691a6a4..70e6d0be0f 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdlib.h>
 #include <unordered_set>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
@@ -101,7 +102,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   }
 
   std::unique_ptr<XlaDeviceAllocator> alloc =
-      xla::MakeUnique<XlaDeviceAllocator>();
+      absl::make_unique<XlaDeviceAllocator>();
   XlaDeviceAllocator* alloc_ptr = alloc.get();
   state.allocators_[{backend, device_ordinal}] = std::move(alloc);
   return alloc_ptr;
@@ -327,7 +328,7 @@ xla::StatusOr<XlaDeviceContext*> XlaDevice::GetDeviceContextLocked() {
   // to those methods; see the bug for details. Our only saving grace at the
   // moment is that this race doesn't seem to occur in practice.
   if (use_gpu_device_info_) {
-    auto gpu_device_info = MakeUnique<GpuDeviceInfo>();
+    auto gpu_device_info = absl::make_unique<GpuDeviceInfo>();
     gpu_device_info->stream = stream_.get();
     gpu_device_info->default_context = device_context_;
     set_tensorflow_gpu_device_info(gpu_device_info.get());
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 0a0c089241..ee07c5c964 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -91,7 +91,8 @@ Status XlaTransferManager::TransferLiteralToDevice(
   const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer();
   VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " "
           << shaped_buffer.ToString();
-  if (UseMultipleStreams()) {
+  if (UseMultipleStreams() && !transfer_manager_->CanShapedBufferBeAccessedNow(
+                                  stream_->parent(), shaped_buffer)) {
     // Initially wait for the compute stream so that memory allocations are
     // synchronized.
     host_to_device_stream_->ThenWaitFor(stream_.get());
@@ -123,11 +124,11 @@ void XlaTransferManager::TransferLiteralFromDevice(
   TensorReference ref(device_tensor);
   transfer_manager_->TransferLiteralFromDevice(
       device_to_host_stream_.get(), shaped_buffer, literal,
-      [=, &shaped_buffer, &literal](xla::Status status) {
+      [=, &shaped_buffer](xla::Status status) {
         ref.Unref();
         done([&]() -> Status {
-          VLOG(1) << "Transfer from device as literal: " << literal.ToString()
-                  << " " << shaped_buffer.ToString();
+          VLOG(1) << "Transfer from device as literal: "
+                  << shaped_buffer.ToString();
           return status;
         }());
       });
@@ -183,18 +184,6 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
       return;
     }
     status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
-    if (status.ok()) {
-      xla_tensor->set_host_tensor(*cpu_tensor);
-      host_to_device_stream_->ThenDoHostCallback([this, done]() {
-        // We must not call the done closure directly from DoHostCallback
-        // to avoid a deadlock. If done() is the callback that ends an
-        // Executor's run, the Executor may call XlaDevice::Sync() inside the
-        // callback. This deadlocks, because XlaDevice::Sync() waits for all
-        // stream activity to complete.
-        thread_pool_->Schedule([done]() { done(Status::OK()); });
-      });
-      return;
-    }
   } else {
     se::DeviceMemoryBase dev_dst_ptr =
         XlaTensor::DeviceMemoryFromTensor(*device_tensor);
@@ -207,8 +196,9 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
           host_to_device_stream_.get(), block_status.error_message().c_str());
     }
   }
-  xla_tensor->set_host_tensor(*cpu_tensor);
-
+  if (status.ok()) {
+    xla_tensor->set_host_tensor(*cpu_tensor);
+  }
   done(status);
 }
 
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index da3e329247..13da5d2f94 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -215,6 +215,8 @@ class XlaAssignVariableOp : public AsyncOpKernel {
                           AnonymousIteratorHandleOp);                          \
   REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE),              \
                           IteratorGetNextOp);                                  \
+  REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE),          \
+                          IteratorGetNextSyncOp);                              \
   REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle")                       \
                               .Device(DEVICE)                                  \
                               .HostMemory("string_handle"),                    \
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
index 4b499b1613..915c5afa79 100644
--- a/tensorflow/compiler/jit/xla_fusion_optimizer.cc
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
@@ -208,6 +208,8 @@ Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster,
 
   GraphCycles cycles;
   TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(&graph, &cycles));
+  TF_RETURN_IF_ERROR(AdjustCycleDetectionGraphForResourceOps(
+      &graph, &graph.flib_def(), /*resource_ops_to_ignore=*/{}, &cycles));
 
   // TODO(hpucha): Make clustering more robust. There are two known issues that
   // we need to mitigate: (a) Non-resource variables can cause deadlocks
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc
index 5736760a87..b77b207908 100644
--- a/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/xla_fusion_optimizer.h"
+#include "tensorflow/cc/ops/resource_variable_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
@@ -179,5 +181,28 @@ TEST_F(XlaFusionOptimizerTest, CompilableCycles) {
   EXPECT_EQ(clusters["A"], clusters["C"]);
 }
 
+TEST_F(XlaFusionOptimizerTest, ResourcesClusteringDisallowed) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output var_handle =
+      ops::VarHandleOp(root.WithOpName("Var"), DT_FLOAT, TensorShape({}));
+  Output to_assign = ops::Const(root.WithOpName("Const"), 10.0f);
+  Output begin = ops::Const(root.WithOpName("begin"), 0);
+  Output end = ops::Const(root.WithOpName("end"), 1);
+  Output strides = ops::Const(root.WithOpName("strides"), 1);
+  ops::ResourceStridedSliceAssign assign_1(
+      root.WithOpName("assign_1"), var_handle, begin, end, strides, to_assign);
+  ops::ResourceStridedSliceAssign assign_2(
+      root.WithOpName("assign_2"), var_handle, begin, end, strides, to_assign);
+  root.graph()->AddControlEdge(assign_1.operation.node(),
+                               assign_2.operation.node());
+  grappler::GrapplerItem item;
+  root.graph()->ToGraphDef(&item.graph);
+
+  XlaFusionOptimizer optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  auto clusters = GetClusters(output);
+  EXPECT_NE(clusters["assign_1"], clusters["assign_2"]);
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 4efbb2d5d7..2ffce9298d 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -175,7 +176,7 @@ void XlaComputationLaunchContext::PopulateInputs(
           << " not the same as on-host shape "
           << xla::ShapeUtil::HumanStringWithLayout(shape);
       se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
-      arg_buffers_[i] = xla::MakeUnique<ShapedBuffer>(
+      arg_buffers_[i] = absl::make_unique<ShapedBuffer>(
           /*on_host_shape=*/shape, /*on_device_shape=*/shape,
           client_->platform(), client_->default_device_ordinal());
       arg_buffers_[i]->set_buffer(dmem, /*index=*/{});
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 4232f514b3..7ac275fab8 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -167,4 +167,4 @@ xla::ScopedShapedBuffer ExtractSubShapedBuffer(
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index 8d36d0fa0a..4c9bb2e27b 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -70,7 +71,7 @@ class XlaTensor {
   // Mutates the XlaTensor to set the ShapedBuffer.
   void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) {
     shaped_buffer_ =
-        xla::MakeUnique<xla::ScopedShapedBuffer>(std::move(shaped_buffer));
+        absl::make_unique<xla::ScopedShapedBuffer>(std::move(shaped_buffer));
   }
 
   // Some tensors on the device may have known values on the host. We use these
@@ -127,4 +128,4 @@ class XlaTensor {
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index ae98b3f0f9..94e08b6efe 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -388,6 +388,19 @@ tf_xla_py_test(
 )
 
 tf_xla_py_test(
+    name = "reshape_op_test",
+    size = "small",
+    srcs = ["reshape_op_test.py"],
+    deps = [
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+tf_xla_py_test(
     name = "dynamic_stitch_test",
     size = "small",
     srcs = ["dynamic_stitch_test.py"],
@@ -715,6 +728,7 @@ tf_xla_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -1177,3 +1191,19 @@ tf_xla_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
+
+tf_xla_py_test(
+    name = "xla_ops_test",
+    size = "small",
+    srcs = ["xla_ops_test.py"],
+    disabled_backends = ["cpu_ondemand"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/compiler/tests/adadelta_test.py b/tensorflow/compiler/tests/adadelta_test.py
index 3e3c09c66e..b7b7fda293 100644
--- a/tensorflow/compiler/tests/adadelta_test.py
+++ b/tensorflow/compiler/tests/adadelta_test.py
@@ -33,7 +33,7 @@ class AdadeltaOptimizerTest(xla_test.XLATestCase):
   def testBasic(self):
     num_updates = 4  # number of ADADELTA steps to perform
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         for grad in [0.2, 0.1, 0.01]:
           for lr in [1.0, 0.5, 0.1]:
             var0_init = [1.0, 2.0]
diff --git a/tensorflow/compiler/tests/adagrad_da_test.py b/tensorflow/compiler/tests/adagrad_da_test.py
index dc1625793a..69fb3ec296 100644
--- a/tensorflow/compiler/tests/adagrad_da_test.py
+++ b/tensorflow/compiler/tests/adagrad_da_test.py
@@ -33,7 +33,7 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
 
   def testAdagradDAWithoutRegularizationBasic1(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         global_step = resource_variable_ops.ResourceVariable(
             0, dtype=dtypes.int64)
         var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
@@ -69,7 +69,7 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
 
   def testAdagradDAwithoutRegularizationBasic2(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         global_step = resource_variable_ops.ResourceVariable(
             0, dtype=dtypes.int64)
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
@@ -100,7 +100,7 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
 
   def testAdagradDAWithL1(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         global_step = resource_variable_ops.ResourceVariable(
             0, dtype=dtypes.int64)
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
@@ -131,7 +131,7 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
 
   def testAdagradDAWithL1_L2(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         global_step = resource_variable_ops.ResourceVariable(
             0, dtype=dtypes.int64)
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py
index d775850a80..ab69319c59 100644
--- a/tensorflow/compiler/tests/adagrad_test.py
+++ b/tensorflow/compiler/tests/adagrad_test.py
@@ -32,7 +32,7 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
 
   def testBasic(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -57,7 +57,7 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
 
   def testTensorLearningRate(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -83,7 +83,7 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
 
   def testSharing(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
diff --git a/tensorflow/compiler/tests/adamax_test.py b/tensorflow/compiler/tests/adamax_test.py
index c4fdbc5974..3ed1d41b71 100644
--- a/tensorflow/compiler/tests/adamax_test.py
+++ b/tensorflow/compiler/tests/adamax_test.py
@@ -49,7 +49,7 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
 
   def testBasic(self):
     for i, dtype in enumerate(self.float_types):
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         variable_scope.get_variable_scope().set_use_resource(True)
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
@@ -100,7 +100,7 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
 
   def testTensorLearningRate(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         variable_scope.get_variable_scope().set_use_resource(True)
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
diff --git a/tensorflow/compiler/tests/addsign_test.py b/tensorflow/compiler/tests/addsign_test.py
index 9ec5a964cb..1bc07ace23 100644
--- a/tensorflow/compiler/tests/addsign_test.py
+++ b/tensorflow/compiler/tests/addsign_test.py
@@ -63,7 +63,7 @@ class AddSignTest(xla_test.XLATestCase):
                  alpha=1.0,
                  beta=0.9):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         # Initialize variables for numpy implementation.
         m0, m1 = 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype)
diff --git a/tensorflow/compiler/tests/argminmax_test.py b/tensorflow/compiler/tests/argminmax_test.py
index 9d3a889b1f..4155342787 100644
--- a/tensorflow/compiler/tests/argminmax_test.py
+++ b/tensorflow/compiler/tests/argminmax_test.py
@@ -40,7 +40,7 @@ class ArgMinMaxTest(xla_test.XLATestCase):
       op_input: numpy input array to use as input to 'op'.
       expected: numpy array representing the expected output of 'op'.
     """
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         pinp = array_ops.placeholder(
             dtypes.as_dtype(op_input.dtype), op_input.shape, name="a")
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 0aafda7fb4..ed4940f204 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -36,7 +36,7 @@ class BinaryOpsTest(xla_test.XLATestCase):
   """Test cases for binary operators."""
 
   def _testBinary(self, op, a, b, expected, equality_test=None):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         pa = array_ops.placeholder(dtypes.as_dtype(a.dtype), a.shape, name="a")
         pb = array_ops.placeholder(dtypes.as_dtype(b.dtype), b.shape, name="b")
@@ -1167,6 +1167,16 @@ class BinaryOpsTest(xla_test.XLATestCase):
     for dtype in self.numeric_types:
       self._testBinary(
           array_ops.tile,
+          np.array([[6], [3], [4]], dtype=dtype),
+          np.array([2, 0], dtype=np.int32),
+          expected=np.empty([6, 0], dtype=dtype))
+      self._testBinary(
+          array_ops.tile,
+          np.array([[6, 3, 4]], dtype=dtype),
+          np.array([2, 0], dtype=np.int32),
+          expected=np.empty([2, 0], dtype=dtype))
+      self._testBinary(
+          array_ops.tile,
           np.array([[6]], dtype=dtype),
           np.array([1, 2], dtype=np.int32),
           expected=np.array([[6, 6]], dtype=dtype))
@@ -1362,5 +1372,40 @@ class BinaryOpsTest(xla_test.XLATestCase):
                              [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]],
                             dtype=dtype))
 
+  def testBroadcastTo(self):
+    for dtype in self.all_types:
+      x = np.random.randint(0, high=100, size=[2, 3])
+      self._testBinary(
+          array_ops.broadcast_to,
+          x,
+          np.array([2, 3], dtype=np.int32),
+          expected=x)
+      self._testBinary(
+          array_ops.broadcast_to,
+          x,
+          np.array([6, 6], dtype=np.int32),
+          expected=np.tile(x, [3, 2]))
+      self._testBinary(
+          array_ops.broadcast_to,
+          x,
+          np.array([7, 4, 3], dtype=np.int32),
+          expected=np.tile(x, [7, 2, 1]))
+      self._testBinary(
+          array_ops.broadcast_to,
+          x,
+          np.array([7, 0, 3], dtype=np.int32),
+          expected=np.zeros([7, 0, 3], dtype=dtype))
+      self._testBinary(
+          array_ops.broadcast_to,
+          x,
+          np.array([7, 1, 2, 9], dtype=np.int32),
+          expected=np.tile(x, [7, 1, 1, 3]))
+      self._testBinary(
+          array_ops.broadcast_to,
+          np.zeros([2, 0], dtype=dtype),
+          np.array([4, 0], dtype=np.int32),
+          expected=np.zeros([4, 0], dtype=dtype))
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/bucketize_op_test.py b/tensorflow/compiler/tests/bucketize_op_test.py
index ef4d5f6322..5c24db539b 100644
--- a/tensorflow/compiler/tests/bucketize_op_test.py
+++ b/tensorflow/compiler/tests/bucketize_op_test.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import test
 class BucketizationOpTest(xla_test.XLATestCase):
 
   def testInt(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       p = array_ops.placeholder(dtypes.int32)
       with self.test_scope():
         op = math_ops._bucketize(p, boundaries=[0, 3, 8, 11])
@@ -38,7 +38,7 @@ class BucketizationOpTest(xla_test.XLATestCase):
                           sess.run(op, {p: [-5, 0, 2, 3, 5, 8, 10, 11, 12]}))
 
   def testFloat(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       p = array_ops.placeholder(dtypes.float32)
       with self.test_scope():
         op = math_ops._bucketize(p, boundaries=[0., 3., 8., 11.])
@@ -48,7 +48,7 @@ class BucketizationOpTest(xla_test.XLATestCase):
           sess.run(op, {p: [-5., 0., 2., 3., 5., 8., 10., 11., 12.]}))
 
   def test2DInput(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       p = array_ops.placeholder(dtypes.float32)
       with self.test_scope():
         op = math_ops._bucketize(p, boundaries=[0, 3, 8, 11])
@@ -58,7 +58,7 @@ class BucketizationOpTest(xla_test.XLATestCase):
                                  {p: [[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]}))
 
   def testInvalidBoundariesOrder(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       p = array_ops.placeholder(dtypes.int32)
       with self.test_scope():
         op = math_ops._bucketize(p, boundaries=[0, 8, 3, 11])
@@ -67,7 +67,7 @@ class BucketizationOpTest(xla_test.XLATestCase):
         sess.run(op, {p: [-5, 0]})
 
   def testBoundariesNotList(self):
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesRegexp(TypeError, "Expected list.*"):
         p = array_ops.placeholder(dtypes.int32)
         with self.test_scope():
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index a4e7f75081..a57d1dc81e 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -56,7 +56,7 @@ class CategoricalTest(xla_test.XLATestCase):
     Returns:
       Frequencies from sampled classes; shape [batch_size, num_classes].
     """
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       random_seed.set_random_seed(1618)
       op = random_ops.multinomial(logits, num_samples,
                                   output_dtype=dtypes.int32)
@@ -79,7 +79,7 @@ class CategoricalTest(xla_test.XLATestCase):
 
   def _testRngIsNotConstant(self, rng, dtype, output_dtype):
     # Tests that 'rng' does not always return the same value.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         x = rng(dtype, output_dtype)
 
@@ -107,7 +107,7 @@ class CategoricalTest(xla_test.XLATestCase):
   def testCategoricalIsInRange(self):
     for dtype in self.float_types:
       for output_dtype in self.output_dtypes():
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           with self.test_scope():
             x = random_ops.multinomial(
                 array_ops.ones(shape=[1, 20], dtype=dtype), 1000,
diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py
index ed532db0ee..d1896a50f7 100644
--- a/tensorflow/compiler/tests/cholesky_op_test.py
+++ b/tensorflow/compiler/tests/cholesky_op_test.py
@@ -54,7 +54,7 @@ class CholeskyOpTest(xla_test.XLATestCase):
 
   def _verifyCholesky(self, x, atol=1e-6):
     # Verify that LL^T == x.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(
           dtypes.as_dtype(x.dtype), shape=x.shape)
       with self.test_scope():
diff --git a/tensorflow/compiler/tests/clustering_test.py b/tensorflow/compiler/tests/clustering_test.py
index e42ebf8f9e..88bd58b2da 100644
--- a/tensorflow/compiler/tests/clustering_test.py
+++ b/tensorflow/compiler/tests/clustering_test.py
@@ -38,7 +38,7 @@ class ClusteringTest(xla_test.XLATestCase):
     val1 = np.array([4, 3, 2, 1], dtype=np.float32)
     val2 = np.array([5, 6, 7, 8], dtype=np.float32)
     expected = val1 + val2
-    with self.test_session():
+    with self.cached_session():
       with self.test_scope():
         input1 = constant_op.constant(val1, name="const1")
         input2 = constant_op.constant(val2, name="const2")
@@ -50,7 +50,7 @@ class ClusteringTest(xla_test.XLATestCase):
     val1 = np.array([4, 3, 2, 1]).astype(np.float32)
     val2 = np.array([5, 6, 7, 8]).astype(np.float32)
     expected = val1 + val2
-    with self.test_session():
+    with self.cached_session():
       with ops.device(CPU_DEVICE):
         input1 = constant_op.constant(val1, name="const1")
         input2 = constant_op.constant(val2, name="const2")
@@ -68,7 +68,7 @@ class ClusteringTest(xla_test.XLATestCase):
     # where x and z are placed on the CPU and y and w are placed on the XLA
     # device. If y and w are clustered for compilation, then the graph will
     # deadlock since the clustered graph will contain a self-loop.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with ops.device(CPU_DEVICE):
         x = array_ops.placeholder(dtypes.float32, [2])
       with self.test_scope():
@@ -81,7 +81,7 @@ class ClusteringTest(xla_test.XLATestCase):
     self.assertAllClose(result, [12., 2.], rtol=1e-3)
 
   def testHostMemory(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.int32)
       with self.test_scope():
         y = x + 1
diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py
index d9ad428147..37e5318bb5 100644
--- a/tensorflow/compiler/tests/concat_ops_test.py
+++ b/tensorflow/compiler/tests/concat_ops_test.py
@@ -33,7 +33,7 @@ from tensorflow.python.platform import googletest
 class ConcatTest(xla_test.XLATestCase):
 
   def testHStack(self):
-    with self.test_session():
+    with self.cached_session():
       p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       p2 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       with self.test_scope():
@@ -49,7 +49,7 @@ class ConcatTest(xla_test.XLATestCase):
     self.assertAllEqual(result[4:, :], params[p2])
 
   def testVStack(self):
-    with self.test_session():
+    with self.cached_session():
       p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       p2 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       with self.test_scope():
@@ -65,7 +65,7 @@ class ConcatTest(xla_test.XLATestCase):
     self.assertAllEqual(result[:, 4:], params[p2])
 
   def testInt32(self):
-    with self.test_session():
+    with self.cached_session():
       p1 = np.random.rand(2, 3).astype("i")
       p2 = np.random.rand(2, 3).astype("i")
       x1 = constant_op.constant(p1)
@@ -88,7 +88,7 @@ class ConcatTest(xla_test.XLATestCase):
       dtype_feed = dtypes.float32
     else:
       dtype_feed = dtype
-    with self.test_session():
+    with self.cached_session():
       p = []
       for i in np.arange(num_tensors):
         input_shape = shape
@@ -130,7 +130,7 @@ class ConcatTest(xla_test.XLATestCase):
     self._testRandom(dtypes.int32)
 
   def _testGradientsSimple(self):
-    with self.test_session():
+    with self.cached_session():
       inp = []
       inp_tensors = []
       with self.test_scope():
@@ -157,7 +157,7 @@ class ConcatTest(xla_test.XLATestCase):
     self._testGradientsSimple()
 
   def _testGradientsFirstDim(self):
-    with self.test_session():
+    with self.cached_session():
       inp = []
       inp_tensors = []
       with self.test_scope():
@@ -185,7 +185,7 @@ class ConcatTest(xla_test.XLATestCase):
     self._testGradientsFirstDim()
 
   def _testGradientsLastDim(self):
-    with self.test_session():
+    with self.cached_session():
       inp = []
       inp_tensors = []
       with self.test_scope():
@@ -220,7 +220,7 @@ class ConcatTest(xla_test.XLATestCase):
     # Random dim to concat on
     concat_dim = np.random.randint(5)
     concat_dim_sizes = np.random.randint(1, 5, size=num_tensors)
-    with self.test_session():
+    with self.cached_session():
       inp = []
       inp_tensors = []
       with self.test_scope():
@@ -254,7 +254,7 @@ class ConcatTest(xla_test.XLATestCase):
   def DISABLED_testZeroSize(self):
     # Verify that concat doesn't crash and burn for zero size inputs
     np.random.seed(7)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         for shape0 in (), (2,):
           axis = len(shape0)
@@ -276,14 +276,14 @@ class ConcatTest(xla_test.XLATestCase):
   def testConcatTuple(self):
     c1 = np.random.rand(4, 4).astype(np.float32)
     c2 = np.random.rand(4, 4).astype(np.float32)
-    with self.test_session():
+    with self.cached_session():
       with self.test_scope():
         concat_list_t = array_ops.concat([c1, c2], 0)
         concat_tuple_t = array_ops.concat((c1, c2), 0)
       self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval())
 
   def testConcatNoScalars(self):
-    with self.test_session():
+    with self.cached_session():
       with self.test_scope():
         scalar = constant_op.constant(7)
         dim = array_ops.placeholder(dtypes.int32)
@@ -295,7 +295,7 @@ class ConcatTest(xla_test.XLATestCase):
 class ConcatOffsetTest(xla_test.XLATestCase):
 
   def testBasic(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         cdim = constant_op.constant(1, dtypes.int32)
         s0 = constant_op.constant([2, 3, 5], dtypes.int32)
@@ -309,7 +309,7 @@ class ConcatOffsetTest(xla_test.XLATestCase):
 class PackTest(xla_test.XLATestCase):
 
   def testBasic(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         s0 = constant_op.constant([2, 3, 5], dtypes.int32)
         s1 = constant_op.constant([2, 7, 5], dtypes.int32)
@@ -319,7 +319,7 @@ class PackTest(xla_test.XLATestCase):
         self.assertAllEqual(ans, [[2, 3, 5], [2, 7, 5], [2, 20, 5]])
 
   def testScalars(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         s0 = constant_op.constant(2, dtypes.int32)
         s1 = constant_op.constant(3, dtypes.int32)
@@ -329,7 +329,7 @@ class PackTest(xla_test.XLATestCase):
         self.assertAllEqual(ans, [2, 3, 5])
 
   def testEmpty(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         s0 = constant_op.constant([[]], dtypes.int32)
         s1 = constant_op.constant([[]], dtypes.int32)
diff --git a/tensorflow/compiler/tests/conv2d_test.py b/tensorflow/compiler/tests/conv2d_test.py
index f9db103f6d..af00ff287d 100644
--- a/tensorflow/compiler/tests/conv2d_test.py
+++ b/tensorflow/compiler/tests/conv2d_test.py
@@ -87,7 +87,7 @@ class Conv2DTest(xla_test.XLATestCase, parameterized.TestCase):
     dilations = test_utils.PermuteDimsBetweenDataFormats(
         dilations, data_format_src, data_format_dst)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
       t2 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
       with self.test_scope():
@@ -288,7 +288,7 @@ class Conv2DBackpropInputTest(xla_test.XLATestCase, parameterized.TestCase):
       dilations = test_utils.PermuteDimsBetweenDataFormats(
           dilations, data_format_src, data_format_dst)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       t1 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
       t2 = array_ops.placeholder(dtypes.float32, shape=out_backprop_sizes)
       with self.test_scope():
@@ -586,7 +586,7 @@ class Conv2DBackpropFilterTest(xla_test.XLATestCase, parameterized.TestCase):
       dilations = test_utils.PermuteDimsBetweenDataFormats(
           dilations, data_format_src, data_format_dst)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
       t2 = array_ops.placeholder(dtypes.float32, shape=out_backprop_sizes)
       with self.test_scope():
diff --git a/tensorflow/compiler/tests/conv3d_test.py b/tensorflow/compiler/tests/conv3d_test.py
index 31ee41f04f..33fd983b54 100644
--- a/tensorflow/compiler/tests/conv3d_test.py
+++ b/tensorflow/compiler/tests/conv3d_test.py
@@ -36,7 +36,7 @@ from tensorflow.python.platform import googletest
 class Conv3DBackpropFilterV2GradTest(xla_test.XLATestCase):
 
   def testGradient(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       for padding in ["SAME", "VALID"]:
         for stride in [1, 2]:
           np.random.seed(1)
@@ -69,7 +69,7 @@ class Conv3DBackpropFilterV2GradTest(xla_test.XLATestCase):
 class Conv3DTransposeTest(xla_test.XLATestCase):
 
   def testConv3DTransposeSingleStride(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       strides = [1, 1, 1, 1, 1]
 
       # Input, output: [batch, depth, height, width, channel]
@@ -119,7 +119,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
                 self.assertAllClose(target, value[n, d, h, w, k])
 
   def testConv3DTransposeSame(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       strides = [1, 2, 2, 2, 1]
 
       # Input, output: [batch, depth, height, width, depth]
@@ -157,7 +157,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
                 self.assertAllClose(target, value[n, d, h, w, k])
 
   def testConv3DTransposeValid(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       strides = [1, 2, 2, 2, 1]
 
       # Input, output: [batch, depth, height, width, depth]
@@ -217,7 +217,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
     np.random.seed(1)  # Make it reproducible.
     x_val = np.random.random_sample(x_shape).astype(np.float64)
     f_val = np.random.random_sample(f_shape).astype(np.float64)
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       x = constant_op.constant(x_val, name="x", dtype=dtypes.float32)
       f = constant_op.constant(f_val, name="f", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py
index 865f60ccab..04f3b3ef49 100644
--- a/tensorflow/compiler/tests/dense_layer_test.py
+++ b/tensorflow/compiler/tests/dense_layer_test.py
@@ -86,7 +86,7 @@ class DenseLayerTest(test.TestCase):
     XlaLaunch op by XLA.
     """
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder(shape=[2, 2, 3], dtype=np.float32)
       with jit_scope():
         y = layers.dense(x, 3)
@@ -113,7 +113,7 @@ class DenseLayerTest(test.TestCase):
     cluster, causing dense layer to be split into TWO XlaLaunch ops.
     """
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32)
       with jit_scope():
         y = layers.dense(x, 3)
diff --git a/tensorflow/compiler/tests/depthwise_conv_op_test.py b/tensorflow/compiler/tests/depthwise_conv_op_test.py
index 98dc73e189..6ef8a68ca5 100644
--- a/tensorflow/compiler/tests/depthwise_conv_op_test.py
+++ b/tensorflow/compiler/tests/depthwise_conv_op_test.py
@@ -151,7 +151,7 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
                   dtype=data_type).reshape(tensor_in_sizes)
     x2 = np.array([f * 1.0 for f in range(1, total_size_2 + 1)],
                   dtype=data_type).reshape(filter_in_sizes)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       if data_type == np.float32:
         tolerance = 1e-4
       else:
@@ -247,7 +247,7 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
                   dtype=np.float32).reshape(tensor_in_sizes)
     x2 = np.array([f * 1.0 for f in range(1, total_size_2 + 1)],
                   dtype=np.float32).reshape(filter_in_sizes)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       t1 = array_ops.placeholder(shape=tensor_in_sizes, dtype=np.float32)
       t2 = array_ops.placeholder(shape=filter_in_sizes, dtype=np.float32)
       with self.test_scope():
@@ -321,7 +321,7 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
     x2 = np.random.rand(*output_sizes).astype(np.float32)
 
     def _GetVal(use_xla):
-      with self.test_session():
+      with self.cached_session():
         t0 = constant_op.constant(input_sizes, shape=[len(input_sizes)])
         t1 = array_ops.placeholder(np.float32, shape=filter_sizes)
         t2 = array_ops.placeholder(np.float32, shape=output_sizes)
@@ -356,7 +356,7 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
     x2 = np.random.rand(*output_sizes).astype(np.float32)
 
     def _GetVal(use_xla):
-      with self.test_session():
+      with self.cached_session():
         t0 = array_ops.placeholder(np.float32, shape=input_sizes)
         t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)])
         t2 = array_ops.placeholder(np.float32, shape=output_sizes)
diff --git a/tensorflow/compiler/tests/dynamic_slice_ops_test.py b/tensorflow/compiler/tests/dynamic_slice_ops_test.py
index 154e36b10e..5f01e128f0 100644
--- a/tensorflow/compiler/tests/dynamic_slice_ops_test.py
+++ b/tensorflow/compiler/tests/dynamic_slice_ops_test.py
@@ -30,7 +30,7 @@ from tensorflow.python.platform import test
 class DynamicUpdateSliceOpsTest(xla_test.XLATestCase):
 
   def _assertOpOutputMatchesExpected(self, op, args, expected):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         placeholders = [
             array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape)
diff --git a/tensorflow/compiler/tests/dynamic_stitch_test.py b/tensorflow/compiler/tests/dynamic_stitch_test.py
index edd78153b5..50b04daa6b 100644
--- a/tensorflow/compiler/tests/dynamic_stitch_test.py
+++ b/tensorflow/compiler/tests/dynamic_stitch_test.py
@@ -30,7 +30,7 @@ from tensorflow.python.platform import googletest
 class DynamicStitchTest(xla_test.XLATestCase):
 
   def _AssertDynamicStitchResultIs(self, indices, data, expected):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       index_placeholders = [
           array_ops.placeholder(dtypes.as_dtype(arg.dtype)) for arg in indices
       ]
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index ff097f80f1..e32f3d4b7f 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -101,7 +101,7 @@ class EagerTest(xla_test.XLATestCase):
       self.assertAllEqual(15, product)
 
     # Run some ops graphly
-    with context.graph_mode(), self.test_session() as sess:
+    with context.graph_mode(), self.cached_session() as sess:
       with self.test_scope():
         three = constant_op.constant(3)
         five = constant_op.constant(5)
@@ -443,7 +443,6 @@ class EagerFunctionTest(xla_test.XLATestCase):
       self.assertAllEqual((2, 3, 4), dz.shape.as_list())
 
   def testNestedDefun(self):
-    self.skipTest('Nested defuns do not work on TPU at the moment')
     with self.test_scope():
 
       @function.defun
diff --git a/tensorflow/compiler/tests/extract_image_patches_op_test.py b/tensorflow/compiler/tests/extract_image_patches_op_test.py
index 5529fdbb09..37061e91d1 100644
--- a/tensorflow/compiler/tests/extract_image_patches_op_test.py
+++ b/tensorflow/compiler/tests/extract_image_patches_op_test.py
@@ -44,7 +44,7 @@ class ExtractImagePatches(xla_test.XLATestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with self.test_session():
+    with self.cached_session():
       image_placeholder = array_ops.placeholder(dtypes.float32)
       with self.test_scope():
         out_tensor = array_ops.extract_image_patches(
diff --git a/tensorflow/compiler/tests/fake_quant_ops_test.py b/tensorflow/compiler/tests/fake_quant_ops_test.py
index c48ab178bf..2178c44556 100644
--- a/tensorflow/compiler/tests/fake_quant_ops_test.py
+++ b/tensorflow/compiler/tests/fake_quant_ops_test.py
@@ -107,7 +107,7 @@ class FakeQuantWithMinMaxArgsTest(xla_test.XLATestCase):
         ],
         dtype=np.float32)
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         input_placeholder = array_ops.placeholder(
             dtypes.float32, inputs.shape, name="inputs")
@@ -198,7 +198,7 @@ class FakeQuantWithMinMaxArgsGradientTest(xla_test.XLATestCase):
         [0.0, 0.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0, 0.0],
         dtype=np.float32)
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         gradient_placeholder = array_ops.placeholder(
             dtypes.float32, gradients.shape, name="gradients")
@@ -306,7 +306,7 @@ class FakeQuantWithMinMaxVarsTest(xla_test.XLATestCase):
         ],
         dtype=np.float32)
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         input_placeholder = array_ops.placeholder(
             dtypes.float32, inputs.shape, name="inputs")
@@ -406,7 +406,7 @@ class FakeQuantWithMinMaxVarsGradientTest(xla_test.XLATestCase):
     expected_backprops_wrt_min = 1.0 + 2.0
     expected_backprops_wrt_max = 10.0 + 11.0
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         gradient_placeholder = array_ops.placeholder(
             dtypes.float32, gradients.shape, name="gradients")
diff --git a/tensorflow/compiler/tests/fft_test.py b/tensorflow/compiler/tests/fft_test.py
index c64ea249ec..b3e13fbaa6 100644
--- a/tensorflow/compiler/tests/fft_test.py
+++ b/tensorflow/compiler/tests/fft_test.py
@@ -71,7 +71,7 @@ class FFTTest(xla_test.XLATestCase):
       data = np.reshape(data.astype(np.float32).view(np.complex64), shape)
       data = to_32bit(complex_to_input(data))
       expected = to_32bit(input_to_expected(data))
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.test_scope():
           ph = array_ops.placeholder(
               dtypes.as_dtype(data.dtype), shape=data.shape)
@@ -93,7 +93,7 @@ class FFTTest(xla_test.XLATestCase):
         data, nperseg=ws, noverlap=ws - hs, boundary=None, window=window)[2]
     expected = np.swapaxes(expected, -1, -2)
     expected *= window.sum()  # scipy divides by window sum
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         ph = array_ops.placeholder(
             dtypes.as_dtype(data.dtype), shape=data.shape)
diff --git a/tensorflow/compiler/tests/fifo_queue_test.py b/tensorflow/compiler/tests/fifo_queue_test.py
index 0f64cc87cd..8c7edfd277 100644
--- a/tensorflow/compiler/tests/fifo_queue_test.py
+++ b/tensorflow/compiler/tests/fifo_queue_test.py
@@ -31,13 +31,13 @@ from tensorflow.python.platform import test
 class FIFOQueueTest(xla_test.XLATestCase):
 
   def testEnqueue(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32)
       enqueue_op = q.enqueue((10.0,))
       enqueue_op.run()
 
   def testEnqueueWithShape(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32, shapes=(3, 2))
       enqueue_correct_op = q.enqueue(([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],))
       enqueue_correct_op.run()
@@ -46,7 +46,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
       self.assertEqual(1, q.size().eval())
 
   def testMultipleDequeues(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       q = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()])
       self.evaluate(q.enqueue([1]))
       self.evaluate(q.enqueue([2]))
@@ -55,7 +55,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
       self.assertAllEqual(set([1, 2, 3]), set([a, b, c]))
 
   def testQueuesDontShare(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       q = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()])
       self.evaluate(q.enqueue(1))
       q2 = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()])
@@ -64,13 +64,13 @@ class FIFOQueueTest(xla_test.XLATestCase):
       self.assertAllEqual(self.evaluate(q.dequeue()), 1)
 
   def testEnqueueDictWithoutNames(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32)
       with self.assertRaisesRegexp(ValueError, "must have names"):
         q.enqueue({"a": 12.0})
 
   def testParallelEnqueue(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
       enqueue_ops = [q.enqueue((x,)) for x in elems]
@@ -95,7 +95,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
       self.assertItemsEqual(elems, results)
 
   def testParallelDequeue(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
       enqueue_ops = [q.enqueue((x,)) for x in elems]
@@ -119,7 +119,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
       self.assertItemsEqual(elems, results)
 
   def testDequeue(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0]
       enqueue_ops = [q.enqueue((x,)) for x in elems]
@@ -133,7 +133,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       q = data_flow_ops.FIFOQueue(3, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0]
       enqueue_ops = [q.enqueue((x,)) for x in elems]
@@ -163,7 +163,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
         self.assertEqual([elem], result)
 
   def testMultiEnqueueAndDequeue(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       q = data_flow_ops.FIFOQueue(10, (dtypes_lib.int32, dtypes_lib.float32))
       elems = [(5, 10.0), (10, 20.0), (15, 30.0)]
       enqueue_ops = [q.enqueue((x, y)) for x, y in elems]
@@ -179,12 +179,12 @@ class FIFOQueueTest(xla_test.XLATestCase):
         self.assertEqual([y], y_val)
 
   def testQueueSizeEmpty(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32)
       self.assertEqual([0], q.size().eval())
 
   def testQueueSizeAfterEnqueueAndDequeue(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32)
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue()
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index 1da97fd512..7ca50b02d9 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -112,7 +112,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
   def testFtrlwithoutRegularization(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
@@ -146,7 +146,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
   def testFtrlwithoutRegularization2(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
@@ -174,7 +174,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
   def testFtrlWithL1(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
@@ -202,7 +202,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
   def testFtrlWithL1_L2(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
@@ -236,7 +236,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     weights will tend to have smaller magnitudes with this parameter set.
     """
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([4.0, 3.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
@@ -273,9 +273,9 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
   def testEquivAdagradwithoutRegularization(self):
     steps = 5
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         val0, val1 = self.equivAdagradTest_FtrlPart(steps, dtype)
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype)
 
     self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4, half_rtol=1e-2)
@@ -284,9 +284,9 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
   def testEquivGradientDescentwithoutRegularization(self):
     steps = 5
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         val0, val1 = self.equivGradientDescentTest_FtrlPart(steps, dtype)
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         val2, val3 = self.equivGradientDescentTest_GradientDescentPart(
             steps, dtype)
 
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index 04fba44446..b1891b918c 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -40,7 +40,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32)
     expected = APlus2B(aval, bval)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -66,7 +66,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([4, 3, 2, 1]).reshape([2, 2]).astype(np.float32)
     expected = APlus2B(aval, bval)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -90,7 +90,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32)
     expected = Func(aval, bval)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -105,7 +105,7 @@ class FunctionTest(xla_test.XLATestCase):
 
   def testCompileTimeConstantsInDefun(self):
     """Tests that XLA handles compile-time constants in defuns."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       @function.Defun(dtypes.float32, dtypes.int32, dtypes.int32)
       def Foo(a, c, d):
@@ -140,7 +140,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([4, 3, 2, 1]).reshape([2, 2]).astype(np.float32)
     expected = aval + bval * 2
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         a = array_ops.placeholder(dtypes.float32, name="a")
         b = array_ops.placeholder(dtypes.float32, name="b")
diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py
index 132e42ac7a..8c018cccb8 100644
--- a/tensorflow/compiler/tests/fused_batchnorm_test.py
+++ b/tensorflow/compiler/tests/fused_batchnorm_test.py
@@ -83,7 +83,7 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase):
     y_ref, mean_ref, var_ref = self._reference_training(
         x_val, scale_val, offset_val, epsilon, data_format_src)
 
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       # To avoid constant folding
       x_val_converted = test_utils.ConvertBetweenDataFormats(
           x_val, data_format_src, data_format)
@@ -126,7 +126,7 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase):
     y_ref, mean_ref, var_ref = self._reference_training(
         x_val, scale_val, offset_val, epsilon, data_format_src)
 
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       # To avoid constant folding
       x_val_converted = test_utils.ConvertBetweenDataFormats(
           x_val, data_format_src, data_format)
@@ -210,7 +210,7 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase):
     grad_x_ref, grad_scale_ref, grad_offset_ref = self._reference_grad(
         x_val, grad_val, scale_val, mean_val, var_val, epsilon, data_format_src)
 
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       grad_val_converted = test_utils.ConvertBetweenDataFormats(
           grad_val, data_format_src, data_format)
       x_val_converted = test_utils.ConvertBetweenDataFormats(
@@ -260,7 +260,7 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase):
     var_val = np.random.random_sample(scale_shape).astype(np.float32)
     data_format_src = "NHWC"
 
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       grad_val_converted = test_utils.ConvertBetweenDataFormats(
           grad_val, data_format_src, data_format)
       x_val_converted = test_utils.ConvertBetweenDataFormats(
diff --git a/tensorflow/compiler/tests/gather_nd_op_test.py b/tensorflow/compiler/tests/gather_nd_op_test.py
index 23b0aed34f..7161f4ab33 100644
--- a/tensorflow/compiler/tests/gather_nd_op_test.py
+++ b/tensorflow/compiler/tests/gather_nd_op_test.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import test
 class GatherNdTest(xla_test.XLATestCase):
 
   def _runGather(self, params, indices):
-    with self.test_session():
+    with self.cached_session():
       paramsp = array_ops.placeholder(params.dtype)
       indicesp = array_ops.placeholder(indices.dtype)
       with self.test_scope():
@@ -46,7 +46,7 @@ class GatherNdTest(xla_test.XLATestCase):
               np.array([[4], [4], [0]], np.int32)))
 
   def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
-    with self.test_session():
+    with self.cached_session():
       params = np.ones((3, 3), dtype=np.float32)
 
       indices_empty = np.empty((0, 2), dtype=np.int32)
diff --git a/tensorflow/compiler/tests/gather_test.py b/tensorflow/compiler/tests/gather_test.py
index e9c8ef7c91..089d95daab 100644
--- a/tensorflow/compiler/tests/gather_test.py
+++ b/tensorflow/compiler/tests/gather_test.py
@@ -42,7 +42,7 @@ class GatherTest(xla_test.XLATestCase):
     return data
 
   def testScalar1D(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       data = np.array([0, 1, 2, 3, 7, 5])
       for dtype in self.all_tf_types:
         for indices in 4, [4], [1, 2, 2, 4, 5]:
@@ -55,7 +55,7 @@ class GatherTest(xla_test.XLATestCase):
           self.assertAllEqual(np_val, gather_val)
 
   def testScalar2D(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
                        [12, 13, 14]])
       for dtype in self.all_tf_types:
@@ -69,7 +69,7 @@ class GatherTest(xla_test.XLATestCase):
           self.assertAllEqual(expected, gather_val)
 
   def testSimpleTwoD32(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
                        [12, 13, 14]])
       for dtype in self.all_tf_types:
@@ -87,7 +87,7 @@ class GatherTest(xla_test.XLATestCase):
     if np.int64 not in self.int_types:
       return
 
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
                        [12, 13, 14]])
       # The indices must be in bounds for any axis.
@@ -114,7 +114,7 @@ class GatherTest(xla_test.XLATestCase):
         for axis in 0, 1, 2, 3, -1, -2:
           params = self._buildParams(np.random.randn(*shape), dtype)
           indices = np.random.randint(shape[axis], size=indices_shape)
-          with self.test_session() as sess, self.test_scope():
+          with self.cached_session() as sess, self.test_scope():
             tf_params = array_ops.placeholder(dtype=dtype)
             tf_indices = constant_op.constant(indices, dtype=dtypes.int32)
             gather = array_ops.gather(tf_params, tf_indices, axis=axis)
@@ -123,7 +123,7 @@ class GatherTest(xla_test.XLATestCase):
             self.assertAllEqual(gather_np, gather_value)
 
   def testIndicesWithDifferentDimensions(self):
-    with self.test_session():
+    with self.cached_session():
       for dtype in self.numeric_tf_types:
         params = array_ops.placeholder(dtype=dtype)
         indices = array_ops.placeholder(dtype=np.int32)
@@ -137,7 +137,7 @@ class GatherTest(xla_test.XLATestCase):
             [[7]], gather.eval(feed_dict={params: [4, 7, 2], indices: [[1]]}))
 
   def testGatherPrecision(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       data = np.array([[0, 0, 0, 0], [0, 2 * (1 + np.exp2(-8)), 0, 0],
                        [0, 0, 0, 0], [0.015789, 0.0985, 0.55789, 0.3842]])
       indices = np.array([1, 2, 3, 1])
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index bf986ade06..6fe5a66e0e 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -54,7 +54,7 @@ class RGBToHSVTest(xla_test.XLATestCase):
       inp = GenerateNumpyRandomRGB(shape).astype(nptype)
 
       # Convert to HSV and back, as a batch and individually
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         batch0 = array_ops.placeholder(nptype, shape=shape)
         with self.test_scope():
           batch1 = image_ops.rgb_to_hsv(batch0)
@@ -78,7 +78,7 @@ class RGBToHSVTest(xla_test.XLATestCase):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
     for nptype in self.float_types:
       rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255.
-      with self.test_session():
+      with self.cached_session():
         placeholder = array_ops.placeholder(nptype)
         with self.test_scope():
           hsv = image_ops.rgb_to_hsv(placeholder)
@@ -97,7 +97,7 @@ class RGBToHSVTest(xla_test.XLATestCase):
           for r, g, b in rgb_flat
       ])
       hsv_np = hsv_np.reshape(4, 4, 4, 3)
-      with self.test_session():
+      with self.cached_session():
         placeholder = array_ops.placeholder(nptype)
         with self.test_scope():
           hsv_op = image_ops.rgb_to_hsv(placeholder)
@@ -108,7 +108,7 @@ class RGBToHSVTest(xla_test.XLATestCase):
 class AdjustContrastTest(xla_test.XLATestCase):
 
   def _testContrast(self, x_np, y_np, contrast_factor):
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(x_np.dtype, shape=x_np.shape)
       flt_x = image_ops.convert_image_dtype(x, dtypes.float32)
       with self.test_scope():
@@ -146,7 +146,7 @@ class AdjustContrastTest(xla_test.XLATestCase):
     return y_np
 
   def _adjustContrastTf(self, x_np, contrast_factor):
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(np.float32)
       with self.test_scope():
         y = image_ops.adjust_contrast(x, contrast_factor)
@@ -180,7 +180,7 @@ class AdjustHueTest(xla_test.XLATestCase):
     y_data = [0, 13, 1, 54, 226, 59, 8, 234, 150, 255, 39, 1]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(x_np.dtype, shape=x_shape)
       flt_x = image_ops.convert_image_dtype(x, dtypes.float32)
       with self.test_scope():
@@ -198,7 +198,7 @@ class AdjustHueTest(xla_test.XLATestCase):
     y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(x_np.dtype, shape=x_shape)
       flt_x = image_ops.convert_image_dtype(x, dtypes.float32)
       with self.test_scope():
@@ -216,7 +216,7 @@ class AdjustHueTest(xla_test.XLATestCase):
     y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(x_np.dtype, shape=x_shape)
       flt_x = image_ops.convert_image_dtype(x, dtypes.float32)
       with self.test_scope():
@@ -244,7 +244,7 @@ class AdjustHueTest(xla_test.XLATestCase):
     return y_v.reshape(x_np.shape)
 
   def _adjustHueTf(self, x_np, delta_h):
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(dtypes.float32)
       with self.test_scope():
         y = gen_image_ops.adjust_hue(x, delta_h)
@@ -324,7 +324,7 @@ class AdjustSaturationTest(xla_test.XLATestCase):
     y_rgb_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
     y_np = np.array(y_rgb_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(x_np.dtype, shape=x_shape)
       y = self._adjust_saturation(x, saturation_factor)
       y_tf = y.eval({x: x_np})
@@ -339,7 +339,7 @@ class AdjustSaturationTest(xla_test.XLATestCase):
     y_data = [0, 5, 13, 0, 106, 226, 30, 0, 234, 89, 255, 0]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(x_np.dtype, shape=x_shape)
       y = self._adjust_saturation(x, saturation_factor)
       y_tf = y.eval({x: x_np})
@@ -378,7 +378,7 @@ class AdjustSaturationTest(xla_test.XLATestCase):
         "gb_same",
         "rgb_same",
     ]
-    with self.test_session():
+    with self.cached_session():
       for x_shape in x_shapes:
         for test_style in test_styles:
           x_np = np.random.rand(*x_shape) * 255.
@@ -410,13 +410,14 @@ class ResizeBilinearTest(xla_test.XLATestCase):
                                       image_np,
                                       target_shape,
                                       expected=None,
-                                      large_tolerance=False):
+                                      large_tolerance=False,
+                                      align_corners=True):
     if expected is None:
       self.fail("expected must be specified")
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       image = array_ops.placeholder(image_np.dtype)
       resized = gen_image_ops.resize_bilinear(
-          image, target_shape, align_corners=True)
+          image, target_shape, align_corners=align_corners)
       out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]})
       if large_tolerance:
         self.assertAllClose(
@@ -433,7 +434,7 @@ class ResizeBilinearTest(xla_test.XLATestCase):
       self.fail("input_shape must be specified")
     if expected is None:
       self.fail("expected must be specified")
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       dtype = dtype or np.float32
       grads = array_ops.placeholder(np.float32)
       resized = gen_image_ops.resize_bilinear_grad(
@@ -579,6 +580,27 @@ class ResizeBilinearTest(xla_test.XLATestCase):
                 dtype=np.float32)),
         large_tolerance=True)
 
+  def testNonAlignCorners3x2To6x4(self):
+    input_data = [[64, 32], [32, 64], [50, 100]]
+    expected_data = [[64.0, 48.0, 32.0, 32.0], [48.0, 48.0, 48.0, 48.0],
+                     [32.0, 48.0, 64.0, 64.0], [41.0, 61.5, 82.0, 82.0],
+                     [50.0, 75.0, 100.0, 100.0], [50.0, 75.0, 100.0, 100.0]]
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array(input_data, dtype=dtype), [6, 4],
+          expected=np.array(expected_data, dtype=np.float32),
+          align_corners=False)
+
+  def testNonAlignCorners6x4To3x2(self):
+    input_data = [[127, 127, 64, 64], [127, 127, 64, 64], [64, 64, 127, 127],
+                  [64, 64, 127, 127], [50, 50, 100, 100], [50, 50, 100, 100]]
+    expected_data = [[127, 64], [64, 127], [50, 100]]
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array(input_data, dtype=dtype), [3, 2],
+          expected=np.array(expected_data, dtype=dtype),
+          align_corners=False)
+
 
 class NonMaxSuppressionTest(xla_test.XLATestCase):
 
@@ -596,7 +618,7 @@ class NonMaxSuppressionTest(xla_test.XLATestCase):
       iou_threshold_np = np.array(0.5, dtype=np.float32)
       score_threshold_np = np.array(0.0, dtype=np.float32)
 
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
         scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
         iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
@@ -639,7 +661,7 @@ class NonMaxSuppressionTest(xla_test.XLATestCase):
       iou_threshold_np = np.array(0.5, dtype=np.float32)
       score_threshold_np = np.array(0.0, dtype=np.float32)
 
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
         scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
         iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
@@ -686,7 +708,7 @@ class NonMaxSuppressionTest(xla_test.XLATestCase):
       iou_threshold_np = np.array(0.5, dtype=np.float32)
       score_threshold_np = np.array(0.4, dtype=np.float32)
 
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
         scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
         iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
diff --git a/tensorflow/compiler/tests/listdiff_op_test.py b/tensorflow/compiler/tests/listdiff_op_test.py
index 45a04f0cf5..58622114e4 100644
--- a/tensorflow/compiler/tests/listdiff_op_test.py
+++ b/tensorflow/compiler/tests/listdiff_op_test.py
@@ -33,7 +33,7 @@ class ListDiffTest(xla_test.XLATestCase):
   def _testListDiff(self, x, y, out, idx):
     for dtype in [dtypes.int32, dtypes.int64]:
       for index_dtype in [dtypes.int32, dtypes.int64]:
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           x_tensor = ops.convert_to_tensor(x, dtype=dtype)
           y_tensor = ops.convert_to_tensor(y, dtype=dtype)
           with self.test_scope():
diff --git a/tensorflow/compiler/tests/lrn_ops_test.py b/tensorflow/compiler/tests/lrn_ops_test.py
index 253b45902f..c6ad67993e 100644
--- a/tensorflow/compiler/tests/lrn_ops_test.py
+++ b/tensorflow/compiler/tests/lrn_ops_test.py
@@ -58,7 +58,7 @@ class LRNTest(xla_test.XLATestCase):
     return output
 
   def _RunAndVerify(self, dtype):
-    with self.test_session():
+    with self.cached_session():
       # random shape
       shape = np.random.randint(1, 16, size=4)
       # Make depth at least 2 to make it meaningful
@@ -110,7 +110,7 @@ class LRNTest(xla_test.XLATestCase):
     alpha = 1.0 * np.random.rand()
     beta = 1.0 * np.random.rand()
 
-    with self.test_session():
+    with self.cached_session():
       in_image = constant_op.constant(in_image_vals, shape=shape)
       out_image = constant_op.constant(out_image_vals, shape=shape)
       out_grads = constant_op.constant(out_grads_vals, shape=shape)
diff --git a/tensorflow/compiler/tests/lstm_test.py b/tensorflow/compiler/tests/lstm_test.py
index 31093c6571..265c0b6d14 100644
--- a/tensorflow/compiler/tests/lstm_test.py
+++ b/tensorflow/compiler/tests/lstm_test.py
@@ -73,7 +73,7 @@ class LSTMTest(test.TestCase):
 
   def _RunLSTMCell(self, basename, init_weights, m_prev_scalar, c_prev_scalar,
                    pad_scalar):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       num_inputs = 1
       num_nodes = 1
 
@@ -156,7 +156,7 @@ class LSTMTest(test.TestCase):
 
   def _RunLSTMLayer(self, basename, init_weights, m_init_scalar, c_init_scalar,
                     pad_scalar):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       num_inputs = 1
       num_nodes = 1
       seq_length = 3
diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py
index 0d9f99f8a6..9222db4b7e 100644
--- a/tensorflow/compiler/tests/matrix_band_part_test.py
+++ b/tensorflow/compiler/tests/matrix_band_part_test.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import test
 class MatrixBandPartTest(xla_test.XLATestCase):
 
   def _testMatrixBandPart(self, dtype, shape):
-    with self.test_session():
+    with self.cached_session():
       batch_shape = shape[:-2]
       mat = np.ones(shape).astype(dtype)
       batch_mat = np.tile(mat, batch_shape + [1, 1])
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
index 2bb8a97bda..94cd3eeb31 100644
--- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -54,7 +54,7 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase):
 
   def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol):
     clean_a = np.tril(a) if lower else np.triu(a)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder_a = MakePlaceholder(a)
       placeholder_ca = MakePlaceholder(clean_a)
       placeholder_b = MakePlaceholder(b)
diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py
index c2592c54cf..f77521a7c4 100644
--- a/tensorflow/compiler/tests/momentum_test.py
+++ b/tensorflow/compiler/tests/momentum_test.py
@@ -41,7 +41,7 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
 
   def testBasic(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -95,7 +95,7 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
 
   def testNesterovMomentum(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([0.1, 0.2], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([0.3, 0.4], dtype=dtype)
         var0_np = np.array([0.1, 0.2], dtype=dtype)
@@ -120,7 +120,7 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
diff --git a/tensorflow/compiler/tests/nary_ops_test.py b/tensorflow/compiler/tests/nary_ops_test.py
index da08225e9f..a1c07fce73 100644
--- a/tensorflow/compiler/tests/nary_ops_test.py
+++ b/tensorflow/compiler/tests/nary_ops_test.py
@@ -32,7 +32,7 @@ from tensorflow.python.platform import googletest
 class NAryOpsTest(xla_test.XLATestCase):
 
   def _testNAry(self, op, args, expected, equality_fn=None):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         placeholders = [
             array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape)
@@ -126,7 +126,7 @@ class NAryOpsTest(xla_test.XLATestCase):
             [[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], dtype=np.float32))
 
   def testOneHot(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       indices = array_ops.constant(np.array([[2, 3], [0, 1]], dtype=np.int32))
       op = array_ops.one_hot(indices,
                              np.int32(4),
@@ -148,7 +148,7 @@ class NAryOpsTest(xla_test.XLATestCase):
       self.assertAllEqual(output, expected)
 
   def testSplitV(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         output = session.run(
             array_ops.split(np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 0, 1, 2]],
diff --git a/tensorflow/compiler/tests/nullary_ops_test.py b/tensorflow/compiler/tests/nullary_ops_test.py
index 2f9122645d..f985c5d2d9 100644
--- a/tensorflow/compiler/tests/nullary_ops_test.py
+++ b/tensorflow/compiler/tests/nullary_ops_test.py
@@ -29,14 +29,14 @@ from tensorflow.python.platform import googletest
 class NullaryOpsTest(xla_test.XLATestCase):
 
   def _testNullary(self, op, expected):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         output = op()
       result = session.run(output)
       self.assertAllClose(result, expected, rtol=1e-3)
 
   def testNoOp(self):
-    with self.test_session():
+    with self.cached_session():
       with self.test_scope():
         output = control_flow_ops.no_op()
       # This should not crash.
diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py
index d68d32057a..7635f89249 100644
--- a/tensorflow/compiler/tests/oom_test.py
+++ b/tensorflow/compiler/tests/oom_test.py
@@ -46,7 +46,7 @@ class OutOfMemoryTest(xla_test.XLATestCase):
     def test_loop():
       size = int(2e8)
       while True:
-        with self.test_session():
+        with self.cached_session():
           # Force the compiled code to not be constant by feeding in a
           # parameter.
           p = array_ops.placeholder(dtypes.float32, shape=[2, 1, 1])
diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py
index a75d99189b..77bb839409 100644
--- a/tensorflow/compiler/tests/placeholder_test.py
+++ b/tensorflow/compiler/tests/placeholder_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import googletest
 class PlaceholderTest(xla_test.XLATestCase):
 
   def test_placeholder_with_default_default(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       v = resource_variable_ops.ResourceVariable(4.0)
       ph = array_ops.placeholder_with_default(v, shape=[])
       out = ph * 2
@@ -36,7 +36,7 @@ class PlaceholderTest(xla_test.XLATestCase):
       self.assertEqual(8.0, sess.run(out))
 
   def test_placeholder_with_default_fed(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       v = resource_variable_ops.ResourceVariable(4.0)
       ph = array_ops.placeholder_with_default(v, shape=[])
       out = ph * 2
diff --git a/tensorflow/compiler/tests/pooling_ops_3d_test.py b/tensorflow/compiler/tests/pooling_ops_3d_test.py
index 17f860db61..b6cdd38345 100644
--- a/tensorflow/compiler/tests/pooling_ops_3d_test.py
+++ b/tensorflow/compiler/tests/pooling_ops_3d_test.py
@@ -62,7 +62,7 @@ class Pooling3DTest(xla_test.XLATestCase):
     # numbers from 1.
     x = np.arange(1.0, total_size + 1, dtype=np.float32)
     x = x.reshape(input_sizes)
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       inputs = array_ops.placeholder(dtypes.float32)
       t = pool_func(
           inputs,
@@ -210,7 +210,7 @@ class Pooling3DTest(xla_test.XLATestCase):
     strides = [1] + strides + [1]
     total_size = np.prod(input_sizes)
     x = np.arange(1, total_size + 1, dtype=np.float32).reshape(input_sizes)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Use the forward pool function to compute some corresponding outputs
       # (needed for the CPU device, and we need the shape in both cases).
       with ops.device("CPU"):
diff --git a/tensorflow/compiler/tests/pooling_ops_test.py b/tensorflow/compiler/tests/pooling_ops_test.py
index 9fc94752ea..d03bd4fdbb 100644
--- a/tensorflow/compiler/tests/pooling_ops_test.py
+++ b/tensorflow/compiler/tests/pooling_ops_test.py
@@ -89,7 +89,7 @@ class PoolingTest(xla_test.XLATestCase):
     # numbers from 1.
     x = np.array([f * 1.0 for f in range(1, total_size + 1)], dtype=np.float32)
     x = x.reshape(input_sizes)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         inputs = array_ops.placeholder(dtypes.float32)
         t = inputs
@@ -324,7 +324,7 @@ class PoolGradTest(xla_test.XLATestCase):
     # TODO(b/74222344): Fix nan handling for max pool grad.
     # x[np.random.choice(total_size)] = np.nan
     x = x.reshape(input_sizes)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Use the forward pool function to compute some corresponding outputs
       # (needed for the CPU device, and we need the shape in both cases).
       with ops.device(self.CPU_DEVICE):
diff --git a/tensorflow/compiler/tests/powersign_test.py b/tensorflow/compiler/tests/powersign_test.py
index 5fa7706d72..86536da7fe 100644
--- a/tensorflow/compiler/tests/powersign_test.py
+++ b/tensorflow/compiler/tests/powersign_test.py
@@ -64,7 +64,7 @@ class PowerSignTest(xla_test.XLATestCase):
                  base=math.e,
                  beta=0.9):
     for dtype in self.float_types:
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         # Initialize variables for numpy implementation.
         m0, m1 = 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype)
diff --git a/tensorflow/compiler/tests/proximal_adagrad_test.py b/tensorflow/compiler/tests/proximal_adagrad_test.py
index cde87db63d..c41b4171e2 100644
--- a/tensorflow/compiler/tests/proximal_adagrad_test.py
+++ b/tensorflow/compiler/tests/proximal_adagrad_test.py
@@ -32,7 +32,7 @@ from tensorflow.python.training import proximal_adagrad
 class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
 
   def testResourceProximalAdagradwithoutRegularization(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       var0 = resource_variable_ops.ResourceVariable([0.0, 0.0])
       var1 = resource_variable_ops.ResourceVariable([0.0, 0.0])
       grads0 = constant_op.constant([0.1, 0.2])
@@ -60,7 +60,7 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       self.assertEqual(2, len(opt_vars))
 
   def testProximalAdagradwithoutRegularization2(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
       var1 = resource_variable_ops.ResourceVariable([4.0, 3.0])
       grads0 = constant_op.constant([0.1, 0.2])
@@ -84,7 +84,7 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       self.assertAllClose(np.array([3.715679, 2.433051]), var1.eval())
 
   def testProximalAdagradWithL1(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
       var1 = resource_variable_ops.ResourceVariable([4.0, 3.0])
       grads0 = constant_op.constant([0.1, 0.2])
@@ -108,7 +108,7 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       self.assertAllClose(np.array([2.959304, 1.029232]), var1.eval())
 
   def testProximalAdagradWithL1_L2(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
       var1 = resource_variable_ops.ResourceVariable([4.0, 3.0])
       grads0 = constant_op.constant([0.1, 0.2])
@@ -151,7 +151,7 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
     return var0.eval(), var1.eval()
 
   def testEquivAdagradwithoutRegularization(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       val0, val1 = self.applyOptimizer(
           proximal_adagrad.ProximalAdagradOptimizer(
               3.0,
@@ -159,7 +159,7 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
               l1_regularization_strength=0.0,
               l2_regularization_strength=0.0))
 
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       val2, val3 = self.applyOptimizer(
           adagrad.AdagradOptimizer(
               3.0, initial_accumulator_value=0.1))
diff --git a/tensorflow/compiler/tests/proximal_gradient_descent_test.py b/tensorflow/compiler/tests/proximal_gradient_descent_test.py
index 11eb768711..3d808e6b8a 100644
--- a/tensorflow/compiler/tests/proximal_gradient_descent_test.py
+++ b/tensorflow/compiler/tests/proximal_gradient_descent_test.py
@@ -32,7 +32,7 @@ from tensorflow.python.training import proximal_gradient_descent
 class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
 
   def testResourceProximalGradientDescentwithoutRegularization(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       var0 = resource_variable_ops.ResourceVariable([0.0, 0.0])
       var1 = resource_variable_ops.ResourceVariable([0.0, 0.0])
       grads0 = constant_op.constant([0.1, 0.2])
@@ -53,7 +53,7 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       self.assertAllClose(np.array([-0.09, -0.18]), var1.eval())
 
   def testProximalGradientDescentwithoutRegularization2(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
       var1 = resource_variable_ops.ResourceVariable([4.0, 3.0])
       grads0 = constant_op.constant([0.1, 0.2])
@@ -75,7 +75,7 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       self.assertAllClose(np.array([3.91, 2.82]), var1.eval())
 
   def testProximalGradientDescentWithL1(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
       var1 = resource_variable_ops.ResourceVariable([4.0, 3.0])
       grads0 = constant_op.constant([0.1, 0.2])
@@ -97,7 +97,7 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       self.assertAllClose(np.array([3.67, 2.37]), var1.eval())
 
   def testProximalGradientDescentWithL1_L2(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
       var1 = resource_variable_ops.ResourceVariable([4.0, 3.0])
       grads0 = constant_op.constant([0.1, 0.2])
@@ -137,14 +137,14 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
     return var0.eval(), var1.eval()
 
   def testEquivGradientDescentwithoutRegularization(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       val0, val1 = self.applyOptimizer(
           proximal_gradient_descent.ProximalGradientDescentOptimizer(
               3.0,
               l1_regularization_strength=0.0,
               l2_regularization_strength=0.0))
 
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       val2, val3 = self.applyOptimizer(
           gradient_descent.GradientDescentOptimizer(3.0))
 
diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py
index 1b969ee2b3..3a268978bf 100644
--- a/tensorflow/compiler/tests/qr_op_test.py
+++ b/tensorflow/compiler/tests/qr_op_test.py
@@ -71,7 +71,7 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase):
     x_np = np.random.uniform(
         low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x_tf = array_ops.placeholder(dtype)
       with self.test_scope():
         q_tf, r_tf = linalg_ops.qr(x_tf, full_matrices=full_matrices)
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 8c4e16e4e0..6e18344117 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -39,7 +39,7 @@ class RandomOpsTest(xla_test.XLATestCase):
 
   def _testRngIsNotConstant(self, rng, dtype):
     # Tests that 'rng' does not always return the same value.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         x = rng(dtype)
 
@@ -79,7 +79,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       if (self.device in ["XLA_GPU", "XLA_CPU"
                          ]) and (dtype in [dtypes.bfloat16, dtypes.half]):
         continue
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.test_scope():
           x = random_ops.random_uniform(
               shape=[1000], dtype=dtype, minval=-2, maxval=33)
@@ -99,7 +99,7 @@ class RandomOpsTest(xla_test.XLATestCase):
     count = 10000000
     # TODO(b/34339814): implement inverse erf support for non-F32 types.
     for dtype in [dtypes.float32]:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.test_scope():
           x = random_ops.truncated_normal(shape=[count], dtype=dtype)
         y = sess.run(x)
@@ -147,7 +147,7 @@ class RandomOpsTest(xla_test.XLATestCase):
     # TODO(b/26783907): this test requires the CPU backend to implement sort.
     if self.device in ["XLA_CPU"]:
       return
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         x = math_ops.range(1 << 16)
         shuffle = random_ops.random_shuffle(x)
@@ -158,7 +158,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       self.assertAllEqual(set(result), set(expected))
 
   def testShuffle2d(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         x = array_ops.diag(math_ops.range(20))
         shuffle = random_ops.random_shuffle(x)
diff --git a/tensorflow/compiler/tests/reduce_ops_test.py b/tensorflow/compiler/tests/reduce_ops_test.py
index cea2ec816f..5ae5b1bc1d 100644
--- a/tensorflow/compiler/tests/reduce_ops_test.py
+++ b/tensorflow/compiler/tests/reduce_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import functools
 import itertools
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
@@ -30,22 +31,24 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
 
-class ReduceOpsTest(xla_test.XLATestCase):
-
+@parameterized.named_parameters(('32_bit_index', dtypes.int32),
+                                ('64_bit_index', dtypes.int64))
+class ReduceOpsTest(xla_test.XLATestCase, parameterized.TestCase):
   def _testReduction(self,
                      tf_reduce_fn,
                      np_reduce_fn,
                      dtype,
                      test_inputs,
+                     index_dtype,
                      rtol=1e-4,
                      atol=1e-4):
     """Tests that the output of 'tf_reduce_fn' matches numpy's output."""
 
     for test_input in test_inputs:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.test_scope():
           a = array_ops.placeholder(dtype)
-          index = array_ops.placeholder(dtypes.int32)
+          index = array_ops.placeholder(index_dtype)
           out = tf_reduce_fn(a, index)
         result = sess.run(out, {a: test_input, index: [0]})
         self.assertAllClose(
@@ -89,22 +92,23 @@ class ReduceOpsTest(xla_test.XLATestCase):
       np.array([[False, True, False], [True, True, False]]),
   ]
 
-  def testReduceSumF32(self):
-    self._testReduction(math_ops.reduce_sum, np.sum, np.float32, self.REAL_DATA)
+  def testReduceSumF32(self, index_dtype):
+    self._testReduction(math_ops.reduce_sum, np.sum, np.float32, self.REAL_DATA,
+                        index_dtype)
 
-  def testReduceSumC64(self):
+  def testReduceSumC64(self, index_dtype):
     self._testReduction(math_ops.reduce_sum, np.sum, np.complex64,
-                        self.COMPLEX_DATA)
+                        self.COMPLEX_DATA, index_dtype)
 
-  def testReduceProdF32(self):
+  def testReduceProdF32(self, index_dtype):
     self._testReduction(math_ops.reduce_prod, np.prod, np.float32,
-                        self.REAL_DATA)
+                        self.REAL_DATA, index_dtype)
 
-  def testReduceProdC64(self):
+  def testReduceProdC64(self, index_dtype):
     self._testReduction(math_ops.reduce_prod, np.prod, np.complex64,
-                        self.COMPLEX_DATA)
+                        self.COMPLEX_DATA, index_dtype)
 
-  def testReduceMin(self):
+  def testReduceMin(self, index_dtype):
 
     def reference_min(dtype, inp, axis):
       """Wrapper around np.amin that returns +infinity for an empty input."""
@@ -119,9 +123,9 @@ class ReduceOpsTest(xla_test.XLATestCase):
         [np.float32, np.int32, np.int64]):
       self._testReduction(math_ops.reduce_min,
                           functools.partial(reference_min, dtype), dtype,
-                          self.REAL_DATA)
+                          self.REAL_DATA, index_dtype)
 
-  def testReduceMax(self):
+  def testReduceMax(self, index_dtype):
 
     def reference_max(dtype, inp, axis):
       """Wrapper around np.amax that returns -infinity for an empty input."""
@@ -137,23 +141,25 @@ class ReduceOpsTest(xla_test.XLATestCase):
         [np.float32, np.int32, np.int64]):
       self._testReduction(math_ops.reduce_max,
                           functools.partial(reference_max, dtype), dtype,
-                          self.REAL_DATA)
+                          self.REAL_DATA, index_dtype)
 
-  def testReduceMeanF32(self):
+  def testReduceMeanF32(self, index_dtype):
     # TODO(phawkins): mean on XLA currently returns 0 instead of NaN when
     # reducing across zero inputs.
     self._testReduction(math_ops.reduce_mean, np.mean, np.float32,
-                        self.NONEMPTY_REAL_DATA)
+                        self.NONEMPTY_REAL_DATA, index_dtype)
 
-  def testReduceMeanC64(self):
+  def testReduceMeanC64(self, index_dtype):
     self._testReduction(math_ops.reduce_mean, np.mean, np.complex64,
-                        self.NONEMPTY_COMPLEX_DATA)
+                        self.NONEMPTY_COMPLEX_DATA, index_dtype)
 
-  def testReduceAll(self):
-    self._testReduction(math_ops.reduce_all, np.all, np.bool, self.BOOL_DATA)
+  def testReduceAll(self, index_dtype):
+    self._testReduction(math_ops.reduce_all, np.all, np.bool, self.BOOL_DATA,
+                        index_dtype)
 
-  def testReduceAny(self):
-    self._testReduction(math_ops.reduce_any, np.any, np.bool, self.BOOL_DATA)
+  def testReduceAny(self, index_dtype):
+    self._testReduction(math_ops.reduce_any, np.any, np.bool, self.BOOL_DATA,
+                        index_dtype)
 
 
 class ReduceOpPrecisionTest(xla_test.XLATestCase):
@@ -178,7 +184,7 @@ class ReduceOpPrecisionTest(xla_test.XLATestCase):
     """
 
     for test_input in test_inputs:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.test_scope():
           a = array_ops.placeholder(dtype)
           index = array_ops.placeholder(dtypes.int32)
diff --git a/tensorflow/compiler/tests/reduce_window_test.py b/tensorflow/compiler/tests/reduce_window_test.py
index c69b6837b0..ff20ea3f42 100644
--- a/tensorflow/compiler/tests/reduce_window_test.py
+++ b/tensorflow/compiler/tests/reduce_window_test.py
@@ -32,7 +32,7 @@ class ReduceWindowTest(xla_test.XLATestCase):
   """Test cases for xla.reduce_window."""
 
   def _reduce_window(self, operand, init, reducer, **kwargs):
-    with self.test_session():
+    with self.cached_session():
       placeholder = array_ops.placeholder(operand.dtype)
       with self.test_scope():
         output = xla.reduce_window(placeholder, init, reducer, **kwargs)
diff --git a/tensorflow/compiler/tests/reshape_op_test.py b/tensorflow/compiler/tests/reshape_op_test.py
new file mode 100644
index 0000000000..84c6777940
--- /dev/null
+++ b/tensorflow/compiler/tests/reshape_op_test.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for slicing."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class ReshapeTest(xla_test.XLATestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(('32_bit_index', dtypes.int32),
+                                  ('64_bit_index', dtypes.int64))
+  def testBasic(self, index_dtype):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[2, 3])
+        with self.test_scope():
+          shape = constant_op.constant([3, 2], dtype=index_dtype)
+          o = array_ops.reshape(i, shape)
+        params = {
+            i: [[1, 2, 3], [4, 5, 6]],
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([[1, 2], [3, 4], [5, 6]], result)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tests/reverse_ops_test.py b/tensorflow/compiler/tests/reverse_ops_test.py
index d01c676e7c..392290fd92 100644
--- a/tensorflow/compiler/tests/reverse_ops_test.py
+++ b/tensorflow/compiler/tests/reverse_ops_test.py
@@ -32,33 +32,40 @@ class ReverseOpsTest(xla_test.XLATestCase):
 
   def testReverseOneDim(self):
     shape = (7, 5, 9, 11)
-    for revdim in range(len(shape)):
+    for revdim in range(-len(shape), len(shape)):
       self._AssertReverseEqual([revdim], shape)
 
   def testReverseMoreThanOneDim(self):
     shape = (7, 5, 9, 11)
+    # The offset is used to test various (but not all) combinations of negative
+    # and positive axis indices that are guaranteed to not collide at the same
+    # index.
     for revdims in itertools.chain.from_iterable(
-        itertools.combinations(range(len(shape)), k)
-        for k in range(2, len(shape)+1)):
+        itertools.combinations(range(-offset,
+                                     len(shape) - offset), k)
+        for k in range(2,
+                       len(shape) + 1)
+        for offset in range(0, len(shape))):
       self._AssertReverseEqual(revdims, shape)
 
   def _AssertReverseEqual(self, revdims, shape):
     np.random.seed(120)
     pval = np.random.randint(0, 100, size=shape).astype(float)
-    with self.test_session():
+    with self.cached_session():
       with self.test_scope():
         p = array_ops.placeholder(dtypes.int32, shape=shape)
         axis = constant_op.constant(
             np.array(revdims, dtype=np.int32),
-            shape=(len(revdims),), dtype=dtypes.int32)
+            shape=(len(revdims),),
+            dtype=dtypes.int32)
         rval = array_ops.reverse(p, axis).eval({p: pval})
 
         slices = [
-            slice(-1, None, -1) if d in revdims else slice(None)
-            for d in range(len(shape))]
-      self.assertEqual(
-          pval[slices].flatten().tolist(),
-          rval.flatten().tolist())
+            slice(-1, None, -1)
+            if d in revdims or d - len(shape) in revdims else slice(None)
+            for d in range(len(shape))
+        ]
+      self.assertEqual(pval[slices].flatten().tolist(), rval.flatten().tolist())
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/tests/reverse_sequence_op_test.py b/tensorflow/compiler/tests/reverse_sequence_op_test.py
index ccfa630016..60c2337743 100644
--- a/tensorflow/compiler/tests/reverse_sequence_op_test.py
+++ b/tensorflow/compiler/tests/reverse_sequence_op_test.py
@@ -35,7 +35,7 @@ class ReverseSequenceTest(xla_test.XLATestCase):
                            seq_lengths,
                            truth,
                            expected_err_re=None):
-    with self.test_session():
+    with self.cached_session():
       p = array_ops.placeholder(dtypes.as_dtype(x.dtype))
       lengths = array_ops.placeholder(dtypes.as_dtype(seq_lengths.dtype))
       with self.test_scope():
diff --git a/tensorflow/compiler/tests/rmsprop_test.py b/tensorflow/compiler/tests/rmsprop_test.py
index ff8bbac911..8840a1329a 100644
--- a/tensorflow/compiler/tests/rmsprop_test.py
+++ b/tensorflow/compiler/tests/rmsprop_test.py
@@ -55,7 +55,7 @@ class RmspropTest(xla_test.XLATestCase):
   def testBasic(self):
     for dtype in self.float_types:
       for centered in [False, True]:
-        with self.test_session(), self.test_scope():
+        with self.cached_session(), self.test_scope():
           # Initialize variables for numpy implementation.
           var0_np = np.array([1.0, 2.0], dtype=dtype)
           grads0_np = np.array([0.1, 0.1], dtype=dtype)
diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py
index 4292352e76..897db384b7 100644
--- a/tensorflow/compiler/tests/scan_ops_test.py
+++ b/tensorflow/compiler/tests/scan_ops_test.py
@@ -78,7 +78,7 @@ class CumsumTest(xla_test.XLATestCase):
 
   def _compare(self, x, axis, exclusive, reverse):
     np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       p = array_ops.placeholder(x.dtype)
       tf_out = math_ops.cumsum(p, axis, exclusive, reverse).eval(
           feed_dict={p: x})
@@ -100,7 +100,7 @@ class CumsumTest(xla_test.XLATestCase):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
       for axis_dtype in self.axis_dtypes():
-        with self.test_session(), self.test_scope():
+        with self.cached_session(), self.test_scope():
           p = array_ops.placeholder(x.dtype)
           axis = constant_op.constant(0, axis_dtype)
           math_ops.cumsum(p, axis).eval(feed_dict={p: x})
@@ -131,7 +131,7 @@ class CumsumTest(xla_test.XLATestCase):
 
   def testInvalidAxis(self):
     x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       input_tensor = ops.convert_to_tensor(x)
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
@@ -156,7 +156,7 @@ class CumprodTest(xla_test.XLATestCase):
 
   def _compare(self, x, axis, exclusive, reverse):
     np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       p = array_ops.placeholder(x.dtype)
       prod = math_ops.cumprod(p, axis, exclusive, reverse)
       tf_out = prod.eval(feed_dict={p: x})
@@ -178,7 +178,7 @@ class CumprodTest(xla_test.XLATestCase):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
       for axis_dtype in self.axis_dtypes():
-        with self.test_session(), self.test_scope():
+        with self.cached_session(), self.test_scope():
           p = array_ops.placeholder(x.dtype)
           axis = constant_op.constant(0, axis_dtype)
           math_ops.cumprod(x, axis).eval(feed_dict={p: x})
@@ -209,7 +209,7 @@ class CumprodTest(xla_test.XLATestCase):
 
   def testInvalidAxis(self):
     x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       input_tensor = ops.convert_to_tensor(x)
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
diff --git a/tensorflow/compiler/tests/scatter_nd_op_test.py b/tensorflow/compiler/tests/scatter_nd_op_test.py
index f606f88545..693f8513bc 100644
--- a/tensorflow/compiler/tests/scatter_nd_op_test.py
+++ b/tensorflow/compiler/tests/scatter_nd_op_test.py
@@ -119,7 +119,7 @@ class ScatterNdTest(xla_test.XLATestCase):
         self._VariableRankTest(np_scatter, tf_scatter, vtype, itype)
 
   def _runScatterNd(self, indices, updates, shape):
-    with self.test_session():
+    with self.cached_session():
       updates_placeholder = array_ops.placeholder(updates.dtype)
       indices_placeholder = array_ops.placeholder(indices.dtype)
       with self.test_scope():
diff --git a/tensorflow/compiler/tests/segment_reduction_ops_test.py b/tensorflow/compiler/tests/segment_reduction_ops_test.py
index 772c20fd42..287bb0d84e 100644
--- a/tensorflow/compiler/tests/segment_reduction_ops_test.py
+++ b/tensorflow/compiler/tests/segment_reduction_ops_test.py
@@ -32,7 +32,7 @@ class SegmentReductionOpsTest(xla_test.XLATestCase):
   """Test cases for segment reduction ops."""
 
   def _segmentReduction(self, op, data, indices, num_segments):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       d = array_ops.placeholder(data.dtype, shape=data.shape)
       if isinstance(indices, int):
         i = array_ops.placeholder(np.int32, shape=[])
diff --git a/tensorflow/compiler/tests/slice_ops_test.py b/tensorflow/compiler/tests/slice_ops_test.py
index 6c4890565d..8f10c2fe86 100644
--- a/tensorflow/compiler/tests/slice_ops_test.py
+++ b/tensorflow/compiler/tests/slice_ops_test.py
@@ -29,7 +29,7 @@ class SliceTest(xla_test.XLATestCase):
 
   def test1D(self):
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[10])
         with self.test_scope():
           o = array_ops.slice(i, [2], [4])
@@ -42,7 +42,7 @@ class SliceTest(xla_test.XLATestCase):
 
   def test3D(self):
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[3, 3, 10])
         with self.test_scope():
           o = array_ops.slice(i, [1, 2, 2], [1, 1, 4])
@@ -64,7 +64,7 @@ class SliceTest(xla_test.XLATestCase):
   def test3DWithDynamicBegin(self):
     """Tests a slice where the start offset is not known at compile time."""
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[3, 3, 10])
         begin = array_ops.placeholder(dtypes.int32, shape=[3])
         with self.test_scope():
@@ -88,7 +88,7 @@ class SliceTest(xla_test.XLATestCase):
   def test3DWithDynamicBeginAndNegativeSize(self):
     """Tests a slice where `begin` is fed dynamically and `size` contains -1."""
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[3, 3, 10])
         begin = array_ops.placeholder(dtypes.int32, shape=[3])
         with self.test_scope():
@@ -114,7 +114,7 @@ class StridedSliceTest(xla_test.XLATestCase):
 
   def test1D(self):
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[10])
         with self.test_scope():
           o = array_ops.strided_slice(i, [2], [6], [2])
@@ -127,7 +127,7 @@ class StridedSliceTest(xla_test.XLATestCase):
 
   def test1DNegativeStride(self):
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[10])
         with self.test_scope():
           o = array_ops.strided_slice(i, [6], [2], [-2])
@@ -140,7 +140,7 @@ class StridedSliceTest(xla_test.XLATestCase):
 
   def test2DDegenerate(self):
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[2, 3])
         with self.test_scope():
           o = array_ops.strided_slice(i, [-1, 0], [0, 3])
@@ -154,7 +154,7 @@ class StridedSliceTest(xla_test.XLATestCase):
 
   def test2DDegenerateNegativeStride(self):
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[2, 3])
         with self.test_scope():
           o = array_ops.strided_slice(i, [0, 0], [-1, 3], [-1, 1])
@@ -168,7 +168,7 @@ class StridedSliceTest(xla_test.XLATestCase):
 
   def test3D(self):
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[3, 3, 10])
         with self.test_scope():
           o = array_ops.strided_slice(i, [0, 2, 2], [2, 3, 6], [1, 1, 2])
@@ -189,7 +189,7 @@ class StridedSliceTest(xla_test.XLATestCase):
 
   def test3DNegativeStride(self):
     for dtype in self.numeric_types:
-      with self.test_session():
+      with self.cached_session():
         i = array_ops.placeholder(dtype, shape=[3, 4, 10])
         with self.test_scope():
           o = array_ops.strided_slice(i, [2, 2, 6], [0, 0, 2], [-1, -1, -2])
diff --git a/tensorflow/compiler/tests/sort_ops_test.py b/tensorflow/compiler/tests/sort_ops_test.py
index 7ff01be3cb..51c04b5c47 100644
--- a/tensorflow/compiler/tests/sort_ops_test.py
+++ b/tensorflow/compiler/tests/sort_ops_test.py
@@ -32,7 +32,7 @@ from tensorflow.python.platform import test
 class XlaSortOpTest(xla_test.XLATestCase):
 
   def _assertOpOutputMatchesExpected(self, op, args, expected):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         placeholders = [
             array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape)
@@ -131,7 +131,7 @@ class XlaSortOpTest(xla_test.XLATestCase):
     if bfloat16 not in self.numeric_types:
       return
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       p = array_ops.placeholder(dtypes.bfloat16)
       with self.test_scope():
         topk = nn_ops.top_k(p, k=4)
@@ -153,7 +153,7 @@ class XlaSortOpTest(xla_test.XLATestCase):
     if bfloat16 not in self.numeric_types:
       return
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       p = array_ops.placeholder(dtypes.bfloat16)
       with self.test_scope():
         topk = nn_ops.top_k(p, k=6)
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index c685bc548f..33b84cec71 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -72,7 +72,7 @@ class SpaceToBatchTest(xla_test.XLATestCase):
   """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops."""
 
   def _testPad(self, inputs, paddings, block_size, outputs):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       for dtype in self.float_types:
         # outputs = space_to_batch(inputs)
         placeholder = array_ops.placeholder(dtype)
@@ -155,7 +155,7 @@ class SpaceToBatchNDTest(xla_test.XLATestCase):
   def _testPad(self, inputs, block_shape, paddings, outputs):
     block_shape = np.array(block_shape)
     paddings = np.array(paddings).reshape((len(block_shape), 2))
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       for dtype in self.float_types:
         # TODO(b/68813416): Skip bfloat16's as the input type for direct is
         # float32 and results in a mismatch, while making testDirect provide the
diff --git a/tensorflow/compiler/tests/sparse_to_dense_op_test.py b/tensorflow/compiler/tests/sparse_to_dense_op_test.py
index 3db8101c4b..07afd1ab3f 100644
--- a/tensorflow/compiler/tests/sparse_to_dense_op_test.py
+++ b/tensorflow/compiler/tests/sparse_to_dense_op_test.py
@@ -45,32 +45,32 @@ def _SparseToDense(sparse_indices,
 class SparseToDenseTest(xla_test.XLATestCase):
 
   def testInt(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       tf_ans = _SparseToDense([1, 3], [5], 1, 0)
     np_ans = np.array([0, 1, 0, 1, 0]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
   def testFloat(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       tf_ans = _SparseToDense([1, 3], [5], 1.0, 0.0)
     np_ans = np.array([0, 1, 0, 1, 0]).astype(np.float32)
     self.assertAllClose(np_ans, tf_ans)
 
   def testSetValue(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       tf_ans = _SparseToDense([1, 3], [5], [1, 2], -1)
     np_ans = np.array([-1, 1, -1, 2, -1]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
   def testSetSingleValue(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       tf_ans = _SparseToDense([1, 3], [5], 1, -1)
     np_ans = np.array([-1, 1, -1, 1, -1]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
   def test2d(self):
     # pylint: disable=bad-whitespace
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       tf_ans = _SparseToDense([[1, 3], [2, 0]], [3, 4], 1, -1)
     np_ans = np.array([[-1, -1, -1, -1],
                        [-1, -1, -1,  1],
@@ -78,12 +78,12 @@ class SparseToDenseTest(xla_test.XLATestCase):
     self.assertAllClose(np_ans, tf_ans)
 
   def testZeroDefault(self):
-    with self.test_session():
+    with self.cached_session():
       x = sparse_ops.sparse_to_dense(2, [4], 7).eval()
       self.assertAllEqual(x, [0, 0, 7, 0])
 
   def test3d(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       tf_ans = _SparseToDense([[1, 3, 0], [2, 0, 1]], [3, 4, 2], 1, -1)
     np_ans = np.ones((3, 4, 2), dtype=np.int32) * -1
     np_ans[1, 3, 0] = 1
@@ -91,25 +91,25 @@ class SparseToDenseTest(xla_test.XLATestCase):
     self.assertAllClose(np_ans, tf_ans)
 
   def testBadShape(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       with self.assertRaisesWithPredicateMatch(ValueError, "must be rank 1"):
         _SparseToDense([1, 3], [[5], [3]], 1, -1)
 
   def testBadValue(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       with self.assertRaisesOpError(
           r"sparse_values has incorrect shape \[2,1\], "
           r"should be \[\] or \[2\]"):
         _SparseToDense([1, 3], [5], [[5], [3]], -1)
 
   def testBadNumValues(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       with self.assertRaisesOpError(
           r"sparse_values has incorrect shape \[3\], should be \[\] or \[2\]"):
         _SparseToDense([1, 3], [5], [1, 2, 3], -1)
 
   def testBadDefault(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       with self.assertRaisesOpError("default_value should be a scalar"):
         _SparseToDense([1, 3], [5], [1, 2], [0])
 
diff --git a/tensorflow/compiler/tests/stack_ops_test.py b/tensorflow/compiler/tests/stack_ops_test.py
index b7dd787fef..720595a159 100644
--- a/tensorflow/compiler/tests/stack_ops_test.py
+++ b/tensorflow/compiler/tests/stack_ops_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.platform import test
 class StackOpTest(xla_test.XLATestCase):
 
   def testStackPushPop(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       size = array_ops.placeholder(dtypes.int32)
       v = array_ops.placeholder(dtypes.float32)
       h = gen_data_flow_ops.stack_v2(size, dtypes.float32, stack_name="foo")
@@ -41,7 +41,7 @@ class StackOpTest(xla_test.XLATestCase):
       self.assertAllClose([[4.0, 5.0]], c1.eval({size: 5, v: [[4.0, 5.0]]}))
 
   def testStackPushPopSwap(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       a = np.arange(2000)
       x = array_ops.placeholder(dtypes.float32)
       h = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
@@ -51,7 +51,7 @@ class StackOpTest(xla_test.XLATestCase):
       self.assertAllClose(a, c1.eval({x: a}))
 
   def testMultiStack(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       v = array_ops.placeholder(dtypes.float32)
       h1 = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_push_v2(h1, v)
@@ -66,7 +66,7 @@ class StackOpTest(xla_test.XLATestCase):
 
   def testSameNameStacks(self):
     """Different stacks with the same name do not interfere."""
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       v1 = array_ops.placeholder(dtypes.float32)
       v2 = array_ops.placeholder(dtypes.float32)
       h1 = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
@@ -84,14 +84,14 @@ class StackOpTest(xla_test.XLATestCase):
       self.assertAllClose(out2, 5.0)
 
   def testCloseStack(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       size = array_ops.placeholder(dtypes.int32)
       h = gen_data_flow_ops.stack_v2(size, dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_close_v2(h)
       sess.run(c1, {size: 5})
 
   def testPushCloseStack(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       v = array_ops.placeholder(dtypes.float32)
       h = gen_data_flow_ops.stack_v2(5, dtypes.float32, stack_name="foo")
       c = gen_data_flow_ops.stack_push_v2(h, v)
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
index d162675ef8..1bea7d9355 100644
--- a/tensorflow/compiler/tests/stateless_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -38,7 +38,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
 
   def testDeterminism(self):
     # Stateless values should be equal iff the seeds are equal (roughly)
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
       seeds = [(x, y) for x in range(5) for y in range(5)] * 3
       for stateless_op in [
@@ -55,7 +55,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
                 self.assertEqual(s0 == s1, np.all(v0 == v1))
 
   def testRandomUniformIsInRange(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       for dtype in self._random_types():
         seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
         x = stateless.stateless_random_uniform(
@@ -74,7 +74,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
 
   def testDistributionOfStatelessRandomUniform(self):
     """Use Pearson's Chi-squared test to test for uniformity."""
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       for dtype in self._random_types():
         seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
         n = 1000
@@ -88,7 +88,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
         self.assertTrue(self._chi_squared(y, 10) < 16.92)
 
   def testRandomNormalIsFinite(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       for dtype in self._random_types():
         seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
         x = stateless.stateless_random_uniform(
@@ -111,7 +111,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
 
   def testDistributionOfStatelessRandomNormal(self):
     """Use Anderson-Darling test to test distribution appears normal."""
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       for dtype in self._random_types():
         seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
         n = 1000
@@ -126,7 +126,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
   def testTruncatedNormalIsInRange(self):
     # TODO(b/34339814): implement inverse erf support for non-F32 types.
     for dtype in [dtypes.float32]:
-      with self.test_session() as sess, self.test_scope():
+      with self.cached_session() as sess, self.test_scope():
         seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
         n = 10000000
         x = stateless.stateless_truncated_normal(
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index f332aa2e9b..78244d0b36 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -44,7 +44,7 @@ def _make_converter(dtype):
 class TensorArrayTest(xla_test.XLATestCase):
 
   def testTensorArrayWriteRead(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -66,7 +66,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([], flow_val.shape)
 
   def _testTensorArrayWritePack(self, tf_dtype):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
 
@@ -86,7 +86,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self._testTensorArrayWritePack(dtype)
 
   def testEmptyTensorArrayPack(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
 
@@ -100,7 +100,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([3, 0, 1], c0.eval().shape)
 
   def _testTensorArrayWriteConcat(self, tf_dtype):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
 
@@ -121,7 +121,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self._testTensorArrayWriteConcat(dtype)
 
   def _testTensorArrayUnpackRead(self, tf_dtype):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
 
@@ -176,7 +176,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     self._testTensorArrayUnpackReadMaybeLegacy()
 
   def _testTensorArraySplitRead(self, tf_dtype):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
 
@@ -228,7 +228,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self._testTensorArraySplitRead(dtype)
 
   def testTensorGradArrayWriteRead(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -261,7 +261,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([[-2.0]], g_d2)
 
   def testTensorGradArrayDynamicWriteRead(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -300,7 +300,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(3, g_vs)
 
   def testTensorGradAccessTwiceReceiveSameObject(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3,
           element_shape=[1, 2])
@@ -317,7 +317,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([[4.0, 5.0]], d_r1_0)
 
   def testTensorArrayWriteWrongIndexOrDataTypeFails(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
 
@@ -331,7 +331,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     # the first type, but try to read the other type.
     if len(self.float_types) > 1:
       dtype1, dtype2 = list(self.float_types)[:2]
-      with self.test_session(), self.test_scope():
+      with self.cached_session(), self.test_scope():
         ta = tensor_array_ops.TensorArray(
             dtype=dtype1, tensor_array_name="foo", size=3)
 
@@ -347,7 +347,7 @@ class TensorArrayTest(xla_test.XLATestCase):
         w0.read(1)
 
   def testTensorArraySplitIncompatibleShapesFails(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -379,7 +379,7 @@ class TensorArrayTest(xla_test.XLATestCase):
         ta.split([1.0], [1]).flow.eval()
 
   def _testTensorArrayWriteGradientAddMultipleAdds(self, dtype):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtype, tensor_array_name="foo", size=3, infer_shape=False)
 
@@ -410,7 +410,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self._testTensorArrayWriteGradientAddMultipleAdds(dtype)
 
   def testMultiTensorArray(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       h1 = tensor_array_ops.TensorArray(
           size=1, dtype=dtypes.float32, tensor_array_name="foo")
       w1 = h1.write(0, 4.0)
@@ -425,7 +425,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllClose(9.0, r.eval())
 
   def _testTensorArrayGradientWriteReadType(self, dtype):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.as_dtype(dtype),
           tensor_array_name="foo",
@@ -478,7 +478,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self._testTensorArrayGradientWriteReadType(dtype)
 
   def _testTensorArrayGradientWritePackConcatAndRead(self):
-    with self.test_session() as sess, self.test_scope():
+    with self.cached_session() as sess, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -513,7 +513,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     self._testTensorArrayGradientWritePackConcatAndRead()
 
   def testTensorArrayReadTwice(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
 
       ta_readtwice = tensor_array_ops.TensorArray(
@@ -529,7 +529,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([1.0, -1.0], r1_readtwice.eval())
 
   def _testTensorArrayGradientUnpackRead(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -557,7 +557,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     self._testTensorArrayGradientUnpackRead()
 
   def testTensorArrayGradientSplitConcat(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=2)
 
@@ -581,21 +581,21 @@ class TensorArrayTest(xla_test.XLATestCase):
                           grad_vals[0])
 
   def testCloseTensorArray(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       c1 = ta.close()
       session.run(c1)
 
   def testSizeTensorArray(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       s = ta.size()
       self.assertAllEqual(3, s.eval())
 
   def testWriteCloseTensorArray(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -608,7 +608,7 @@ class TensorArrayTest(xla_test.XLATestCase):
   # TODO(phawkins): implement while loops.
   # def _testWhileLoopWritePackGradients(self, dynamic_size, dtype):
   #   np_dtype = dtype.as_numpy_dtype
-  #   with self.test_session() as session, self.test_scope():
+  #   with self.cached_session() as session, self.test_scope():
   #     v0 = array_ops.identity(np.arange(3 * 5, dtype=np_dtype).reshape(3, 5))
   #     var = variables.Variable(np.arange(100, 105, dtype=np_dtype))
   #     state0 = array_ops.identity(np.array([1] * 5, dtype=np_dtype))
@@ -692,7 +692,7 @@ class TensorArrayTest(xla_test.XLATestCase):
   #       dynamic_size=True, dtype=dtypes.float32)
 
   # def testGradSerialTwoLoops(self):
-  #   with self.test_session(), self.test_scope():
+  #   with self.cached_session(), self.test_scope():
   #     num_steps = 100
   #     acc = tensor_array_ops.TensorArray(
   #         dtype=dtypes.float32,
@@ -725,7 +725,7 @@ class TensorArrayTest(xla_test.XLATestCase):
   #     self.assertAllClose(31.0, grad.eval())
 
   def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       a = array_ops.identity(
           np.arange(
               3 * 5, dtype=np.float32).reshape(3, 5) + 1)
@@ -757,7 +757,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(joint_grad_b_t, g0)
 
   def testWriteShape(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       c0 = constant_op.constant([4.0, 5.0])
@@ -781,7 +781,7 @@ class TensorArrayTest(xla_test.XLATestCase):
         w0.write(0, c2)
 
   def testPartlyUnknownShape(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=6)
 
@@ -821,7 +821,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([5, 4, 2, 3], r5.get_shape().as_list())
 
   def _testUnpackShape(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -846,7 +846,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     self._testUnpackShape()
 
   def testSplitShape(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -867,7 +867,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(r0.get_shape(), tensor_shape.unknown_shape())
 
   def testWriteUnknownShape(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -879,7 +879,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(r0.get_shape(), tensor_shape.unknown_shape())
 
   def _testGradientWhenNotAllComponentsRead(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
       x = constant_op.constant([2.0, 3.0])
       w = ta.unstack(x)
@@ -893,7 +893,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     self._testGradientWhenNotAllComponentsRead()
 
   def _testTensorArrayEvalEmpty(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, infer_shape=False)
       with self.assertRaisesOpError(
@@ -906,7 +906,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     self._testTensorArrayEvalEmpty()
 
   def _testTensorArrayEvalEmptyWithDefault(self):
-    with self.test_session(), self.test_scope():
+    with self.cached_session(), self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, infer_shape=True)
       self.assertEqual(0, ta.size().eval())
@@ -921,7 +921,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     self._testTensorArrayEvalEmptyWithDefault()
 
   def testTensorArrayScatterReadAndGradients(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -946,7 +946,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
   def testTensorArrayWriteGatherAndGradients(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -974,7 +974,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertAllEqual(expected_grad, grad_vals[0])
 
   def testTensorArrayIdentity(self):
-    with self.test_session() as session, self.test_scope():
+    with self.cached_session() as session, self.test_scope():
       ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2,
                                          infer_shape=False)
       ta1 = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=4,
diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py
index effa5a59fe..55a992195f 100644
--- a/tensorflow/compiler/tests/ternary_ops_test.py
+++ b/tensorflow/compiler/tests/ternary_ops_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.platform import googletest
 class TernaryOpsTest(xla_test.XLATestCase):
 
   def _testTernary(self, op, a, b, c, expected):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         pa = array_ops.placeholder(dtypes.as_dtype(a.dtype), a.shape, name="a")
         pb = array_ops.placeholder(dtypes.as_dtype(b.dtype), b.shape, name="b")
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 73adb0d243..5b0e57f83f 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -65,7 +65,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
       rtol: relative tolerance for equality test.
       atol: absolute tolerance for equality test.
     """
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with self.test_scope():
         pinp = array_ops.placeholder(
             dtypes.as_dtype(inp.dtype), inp.shape, name="a")
@@ -202,7 +202,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
       # Disable float16 testing for now
       if dtype != np.float16:
         x = np.arange(-10, 10, 1).astype(dtype)
-        with self.test_session() as session:
+        with self.cached_session() as session:
           erf_x = session.run(math_ops.erf(x))
           erfc_x = session.run(math_ops.erfc(x))
 
@@ -398,6 +398,11 @@ class UnaryOpsTest(xla_test.XLATestCase):
 
       self._assertOpOutputMatchesExpected(
           math_ops.lgamma,
+          np.array(0.5, dtype=dtype),
+          expected=np.array(np.log(np.pi) / 2, dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          math_ops.lgamma,
           np.array(
               [[1, 2, 3], [4, 5, 6], [1 / 2, 3 / 2, 5 / 2],
                [-3 / 2, -7 / 2, -11 / 2]],
@@ -420,6 +425,19 @@ class UnaryOpsTest(xla_test.XLATestCase):
               ],
               dtype=dtype))
 
+      # The actual result is complex. Take the real part.
+      self._assertOpOutputMatchesExpected(
+          math_ops.lgamma,
+          np.array([-1 / 2, -5 / 2, -9 / 2], dtype=dtype),
+          expected=np.array(
+              [
+                  np.log(np.pi) / 2 + np.log(2),
+                  np.log(np.pi) / 2 - np.log(15) + np.log(8),
+                  np.log(np.pi) / 2 - np.log(945) + np.log(32),
+              ],
+              dtype=dtype),
+          atol=1e-4)
+
       self._assertOpOutputMatchesExpected(
           math_ops.digamma,
           np.array(
diff --git a/tensorflow/compiler/tests/while_test.py b/tensorflow/compiler/tests/while_test.py
index b637cf31cf..4ee144beb7 100644
--- a/tensorflow/compiler/tests/while_test.py
+++ b/tensorflow/compiler/tests/while_test.py
@@ -43,7 +43,7 @@ class WhileTest(xla_test.XLATestCase):
     def loop_cond(step):
       return step < 10
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       init_index = array_ops.placeholder(dtypes.int32, [])
       with self.test_scope():
         loop_outputs = xla.while_loop([init_index], loop_cond, loop_body)
@@ -65,7 +65,7 @@ class WhileTest(xla_test.XLATestCase):
       del rsum
       return step < 10
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       init_index = array_ops.placeholder(dtypes.int32, [])
       init_sum = array_ops.placeholder(dtypes.float32, [])
       with self.test_scope():
@@ -91,7 +91,7 @@ class WhileTest(xla_test.XLATestCase):
       del rsum
       return step < 10
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       init_index = array_ops.placeholder(dtypes.int32, [])
       init_sum = array_ops.placeholder(dtypes.complex64, [])
       with self.test_scope():
@@ -117,7 +117,7 @@ class WhileTest(xla_test.XLATestCase):
       del x
       return step < 10
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       init_index = array_ops.placeholder(dtypes.int32, [])
       with self.test_scope():
         loop_outputs = xla.while_loop([init_index, 42], loop_cond, loop_body)
diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py
index 85084bb124..28d61fb07d 100644
--- a/tensorflow/compiler/tests/xla_device_test.py
+++ b/tensorflow/compiler/tests/xla_device_test.py
@@ -37,7 +37,7 @@ class XlaDeviceTest(xla_test.XLATestCase):
               [16384, 1], [1, 16384], [1, 20000, 1, 1]]
     for dtype in self.numeric_types:
       for shape in shapes:
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           with ops.device("CPU"):
             x = array_ops.placeholder(dtype, shape)
           with self.test_scope():
@@ -58,7 +58,7 @@ class XlaDeviceTest(xla_test.XLATestCase):
     ])
     shape = (10, 10)
     for unsupported_dtype in test_types - self.all_types:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with ops.device("CPU"):
           x = array_ops.placeholder(unsupported_dtype, shape)
         with self.test_scope():
@@ -78,7 +78,7 @@ class XlaDeviceTest(xla_test.XLATestCase):
             pass
 
   def testControlTrigger(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.test_scope():
         x = gen_control_flow_ops.control_trigger()
       sess.run(x)
diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py
new file mode 100644
index 0000000000..b2f026df6c
--- /dev/null
+++ b/tensorflow/compiler/tests/xla_ops_test.py
@@ -0,0 +1,301 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for XLA op wrappers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.compiler.xla import xla_data_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class XlaOpsTest(xla_test.XLATestCase, parameterized.TestCase):
+
+  def _assertOpOutputMatchesExpected(self, op, args, expected,
+                                     equality_fn=None):
+    with self.test_session() as session:
+      with self.test_scope():
+        placeholders = [
+            array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape)
+            for arg in args
+        ]
+        feeds = {placeholders[i]: args[i] for i in range(0, len(args))}
+        output = op(*placeholders)
+      result = session.run(output, feeds)
+      if not equality_fn:
+        equality_fn = self.assertAllClose
+      equality_fn(result, expected, rtol=1e-3)
+
+  def testAdd(self):
+    for dtype in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          xla.add,
+          args=(np.array([1, 2, 3], dtype=dtype),
+                np.array([4, 5, 6], dtype=dtype)),
+          expected=np.array([5, 7, 9], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          lambda x, y: xla.add(x, y, broadcast_dims=(0,)),
+          args=(np.array([[1, 2], [3, 4]], dtype=dtype),
+                np.array([7, 11], dtype=dtype)),
+          expected=np.array([[8, 9], [14, 15]], dtype=dtype))
+
+      self._assertOpOutputMatchesExpected(
+          lambda x, y: xla.add(x, y, broadcast_dims=(1,)),
+          args=(np.array([[1, 2], [3, 4]], dtype=dtype),
+                np.array([7, 11], dtype=dtype)),
+          expected=np.array([[8, 13], [10, 15]], dtype=dtype))
+
+  def testBroadcast(self):
+    for dtype in self.numeric_types:
+      v = np.arange(4, dtype=np.int32).astype(dtype).reshape([2, 2])
+      self._assertOpOutputMatchesExpected(
+          lambda x: xla.broadcast(x, (7, 42)),
+          args=(v,),
+          expected=np.tile(v, (7, 42, 1, 1)))
+
+  def testShiftRightLogical(self):
+    self._assertOpOutputMatchesExpected(
+        xla.shift_right_logical,
+        args=(np.array([-1, 16], dtype=np.int32), np.int32(4)),
+        expected=np.array([0x0FFFFFFF, 1], dtype=np.int32))
+
+    self._assertOpOutputMatchesExpected(
+        xla.shift_right_logical,
+        args=(np.array([0xFFFFFFFF, 16], dtype=np.uint32), np.uint32(4)),
+        expected=np.array([0x0FFFFFFF, 1], dtype=np.uint32))
+
+  def testShiftRightArithmetic(self):
+    self._assertOpOutputMatchesExpected(
+        xla.shift_right_arithmetic,
+        args=(np.array([-1, 16], dtype=np.int32), np.int32(4)),
+        expected=np.array([-1, 1], dtype=np.int32))
+
+    self._assertOpOutputMatchesExpected(
+        xla.shift_right_arithmetic,
+        args=(np.array([0xFFFFFFFF, 16], dtype=np.uint32), np.uint32(4)),
+        expected=np.array([0xFFFFFFFF, 1], dtype=np.uint32))
+
+  PRECISION_VALUES = (None, xla_data_pb2.PrecisionConfigProto.DEFAULT,
+                      xla_data_pb2.PrecisionConfigProto.HIGH,
+                      xla_data_pb2.PrecisionConfigProto.HIGHEST)
+
+  @parameterized.parameters(*PRECISION_VALUES)
+  def testConv(self, precision):
+    for dtype in set(self.float_types).intersection(
+        set([dtypes.bfloat16.as_numpy_dtype, np.float32])):
+
+      def conv_1d_fn(lhs, rhs):
+        dnums = xla_data_pb2.ConvolutionDimensionNumbers()
+        num_spatial_dims = 1
+        dnums.input_batch_dimension = 0
+        dnums.input_feature_dimension = 1
+        dnums.output_batch_dimension = 0
+        dnums.output_feature_dimension = 1
+        dnums.kernel_output_feature_dimension = 0
+        dnums.kernel_input_feature_dimension = 1
+        dnums.input_spatial_dimensions.extend(range(2, 2 + num_spatial_dims))
+        dnums.kernel_spatial_dimensions.extend(range(2, 2 + num_spatial_dims))
+        dnums.output_spatial_dimensions.extend(range(2, 2 + num_spatial_dims))
+        precision_config = None
+        if precision:
+          precision_config = xla_data_pb2.PrecisionConfigProto()
+          precision_config.operand_precision.extend([precision, precision])
+        return xla.conv(
+            lhs,
+            rhs,
+            window_strides=(1,),
+            padding=((2, 1),),
+            lhs_dilation=(1,),
+            rhs_dilation=(2,),
+            dimension_numbers=dnums)
+
+      self._assertOpOutputMatchesExpected(
+          conv_1d_fn,
+          args=(
+              np.array([[[3, 4, 5, 6]]], dtype=dtype),
+              np.array([[[-2, -3]]], dtype=dtype),
+          ),
+          expected=np.array([[[-9, -12, -21, -26, -10]]], dtype=dtype))
+
+  @parameterized.parameters(*PRECISION_VALUES)
+  def testDotGeneral(self, precision):
+    for dtype in self.float_types:
+
+      def dot_fn(lhs, rhs):
+        dnums = xla_data_pb2.DotDimensionNumbers()
+        dnums.lhs_contracting_dimensions.append(2)
+        dnums.rhs_contracting_dimensions.append(1)
+        dnums.lhs_batch_dimensions.append(0)
+        dnums.rhs_batch_dimensions.append(0)
+        precision_config = None
+        if precision:
+          precision_config = xla_data_pb2.PrecisionConfigProto()
+          precision_config.operand_precision.extend([precision, precision])
+        return xla.dot_general(
+            lhs,
+            rhs,
+            dimension_numbers=dnums,
+            precision_config=precision_config)
+
+      lhs = np.array(
+          [
+              [[1, 2], [3, 4]],
+              [[5, 6], [7, 8]],
+          ], dtype=dtype)
+      rhs = np.array(
+          [
+              [[1, 2, 3], [4, 5, 6]],
+              [[7, 8, 9], [10, 11, 12]],
+          ], dtype=dtype)
+      self._assertOpOutputMatchesExpected(
+          dot_fn,
+          args=(lhs, rhs),
+          expected=np.array(
+              [
+                  [[9, 12, 15], [19, 26, 33]],
+                  [[95, 106, 117], [129, 144, 159]],
+              ],
+              dtype=dtype))
+
+  def testNeg(self):
+    for dtype in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          xla.neg,
+          args=(np.array([1, 2, 3], dtype=dtype),),
+          expected=np.array([-1, -2, -3], dtype=dtype))
+
+  def testPad(self):
+    for dtype in self.numeric_types:
+
+      def pad_fn(x):
+        return xla.pad(
+            x,
+            padding_value=7,
+            padding_low=[2, 1],
+            padding_high=[1, 2],
+            padding_interior=[1, 0])
+
+      self._assertOpOutputMatchesExpected(
+          pad_fn,
+          args=(np.arange(4, dtype=np.int32).astype(dtype).reshape([2, 2]),),
+          expected=np.array(
+              [[7, 7, 7, 7, 7], [7, 7, 7, 7, 7], [7, 0, 1, 7, 7],
+               [7, 7, 7, 7, 7], [7, 2, 3, 7, 7], [7, 7, 7, 7, 7]],
+              dtype=dtype))
+
+  def testReduce(self):
+    for dtype in set(self.numeric_types).intersection(
+        set([dtypes.bfloat16.as_numpy_dtype, np.float32])):
+
+      @function.Defun(dtype, dtype)
+      def sum_reducer(x, y):
+        return x + y
+
+      def sum_reduction(dims):
+
+        def fn(x):
+          return xla.reduce(
+              x, init_value=0, dimensions_to_reduce=dims, reducer=sum_reducer)
+
+        return fn
+
+      self._assertOpOutputMatchesExpected(
+          sum_reduction(dims=[]),
+          args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),),
+          expected=np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]))
+      self._assertOpOutputMatchesExpected(
+          sum_reduction(dims=[0]),
+          args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),),
+          expected=np.array([12, 15, 18, 21], dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          sum_reduction(dims=[1]),
+          args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),),
+          expected=np.array([6, 22, 38], dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          sum_reduction(dims=[0, 1]),
+          args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),),
+          expected=dtype(66))
+
+      @function.Defun(dtype, dtype)
+      def mul_reducer(x, y):
+        return x * y
+
+      def mul_reduction(dims):
+
+        def fn(x):
+          return xla.reduce(
+              x, init_value=1, dimensions_to_reduce=dims, reducer=mul_reducer)
+
+        return fn
+
+      self._assertOpOutputMatchesExpected(
+          mul_reduction(dims=[0]),
+          args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),),
+          expected=np.array([0, 45, 120, 231], dtype=dtype))
+
+  def testSelectAndScatter(self):
+    for dtype in set(self.numeric_types).intersection(
+        set([dtypes.bfloat16.as_numpy_dtype, np.float32])):
+
+      @function.Defun(dtype, dtype)
+      def add_scatter(x, y):
+        return x + y
+
+      @function.Defun(dtype, dtype)
+      def ge_select(x, y):
+        return x >= y
+
+      def test_fn(operand, source):
+        return xla.select_and_scatter(
+            operand,
+            window_dimensions=[2, 3, 1, 1],
+            window_strides=[2, 2, 1, 1],
+            padding=[[0, 0]] * 4,
+            source=source,
+            init_value=0,
+            select=ge_select,
+            scatter=add_scatter)
+
+      self._assertOpOutputMatchesExpected(
+          test_fn,
+          args=(np.array(
+              [[7, 2, 5, 3, 8], [3, 8, 9, 3, 4], [1, 5, 7, 5, 6],
+               [0, 6, 2, 10, 2]],
+              dtype=dtype).reshape((4, 5, 1, 1)),
+                np.array([[2, 6], [3, 1]], dtype=dtype).reshape((2, 2, 1, 1))),
+          expected=np.array(
+              [[0, 0, 0, 0, 0], [0, 0, 8, 0, 0], [0, 0, 3, 0, 0],
+               [0, 0, 0, 1, 0]],
+              dtype=dtype).reshape((4, 5, 1, 1)))
+
+  def testTranspose(self):
+    for dtype in self.numeric_types:
+      v = np.arange(4, dtype=np.int32).astype(dtype).reshape([2, 2])
+      self._assertOpOutputMatchesExpected(
+          lambda x: xla.transpose(x, [1, 0]), args=(v,), expected=v.T)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index fda32c8a1c..92e577bb7b 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -39,6 +39,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -88,6 +89,7 @@ cc_library(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -211,6 +213,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
     alwayslink = 1,
 )
@@ -220,13 +223,11 @@ cc_library(
     srcs = [
         "literal_util.cc",
         "shape_util.cc",
-        "str_util.cc",
         "type_util.cc",
     ],
     hdrs = [
         "literal_util.h",
         "shape_util.h",
-        "str_util.h",
         "type_util.h",
     ],
     visibility = [":friends"],
@@ -255,6 +256,7 @@ cc_library(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -287,6 +289,7 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -305,6 +308,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -372,19 +376,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-    ],
-)
-
-tf_cc_test(
-    name = "str_util_test",
-    srcs = [
-        "str_util_test.cc",
-    ],
-    deps = [
-        ":common",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -443,21 +435,96 @@ cc_library(
 )
 
 cc_library(
+    name = "functionalize_control_flow_util",
+    srcs = [
+        "functionalize_control_flow_util.cc",
+    ],
+    hdrs = [
+        "functionalize_control_flow_util.h",
+    ],
+    deps = [
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "functionalize_cond",
+    srcs = [
+        "functionalize_cond.cc",
+    ],
+    hdrs = [
+        "functionalize_cond.h",
+    ],
+    deps = [
+        ":functionalize_control_flow_util",
+        ":tf2xla_util",
+        "//tensorflow/compiler/jit:union_find",
+        "//tensorflow/compiler/tf2xla:dump_graph",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+cc_library(
     name = "functionalize_control_flow",
-    srcs = ["functionalize_control_flow.cc"],
-    hdrs = ["functionalize_control_flow.h"],
+    srcs = [
+        "functionalize_control_flow.cc",
+    ],
+    hdrs = [
+        "functionalize_control_flow.h",
+    ],
     deps = [
+        ":functionalize_cond",
+        ":functionalize_control_flow_util",
+        ":functionalize_while",
         ":tf2xla_util",
         "//tensorflow/compiler/jit:union_find",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+cc_library(
+    name = "functionalize_while",
+    srcs = [
+        "functionalize_while.cc",
+    ],
+    hdrs = [
+        "functionalize_while.h",
+    ],
+    deps = [
+        ":functionalize_control_flow_util",
+        ":tf2xla_util",
+        "//tensorflow/compiler/jit:union_find",
+        "//tensorflow/compiler/tf2xla:dump_graph",
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -485,6 +552,32 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "functionalize_cond_test",
+    srcs = ["functionalize_cond_test.cc"],
+    deps = [
+        ":functionalize_cond",
+        ":functionalize_control_flow",
+        ":test_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:resource_variable_ops",
+        "//tensorflow/compiler/tf2xla/cc:xla_ops",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:resource_variable_ops_op_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 cc_library(
     name = "test_util",
     testonly = 1,
@@ -508,3 +601,30 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+cc_library(
+    name = "resource_operation_table",
+    srcs = ["resource_operation_table.cc"],
+    hdrs = ["resource_operation_table.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
+tf_cc_test(
+    name = "resource_operation_table_test",
+    srcs = ["resource_operation_table_test.cc"],
+    deps = [
+        ":resource_operation_table",
+        ":xla_compiler",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index de1008803d..e8673d7790 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -23,11 +23,11 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 
 namespace tensorflow {
-
 // Backwards dataflow analysis that finds arguments to a graph that must be
 // compile-time constants.
 Status BackwardsConstAnalysis(const Graph& g,
-                              std::vector<bool>* compile_time_const_args) {
+                              std::vector<bool>* compile_time_const_args,
+                              std::vector<bool>* compile_time_const_nodes) {
   // Operators that don't look at the data of their inputs, just the shapes.
   const std::unordered_set<string> metadata_ops = {
       "Rank",
@@ -36,9 +36,16 @@ Status BackwardsConstAnalysis(const Graph& g,
       "Size",
   };
 
+  std::vector<bool> compile_time_const_nodes_impl;
+  if (compile_time_const_nodes) {
+    CHECK_EQ(compile_time_const_nodes->size(), g.num_node_ids());
+  } else {
+    compile_time_const_nodes_impl.resize(g.num_node_ids());
+    compile_time_const_nodes = &compile_time_const_nodes_impl;
+  }
+
   Status status;
-  std::unordered_set<const Node*> must_be_const;
-  auto visit = [&status, &metadata_ops, &must_be_const,
+  auto visit = [&status, &metadata_ops, compile_time_const_nodes,
                 compile_time_const_args](Node* node) {
     if (!status.ok()) return;
 
@@ -47,17 +54,19 @@ Status BackwardsConstAnalysis(const Graph& g,
 
     // If this node must be const, and it isn't a metadata op, then all of its
     // parents must be const.
-    if (must_be_const.find(node) != must_be_const.end()) {
+    if ((*compile_time_const_nodes)[node->id()]) {
       if (node->type_string() == "_Arg") {
         int index;
         status = GetNodeAttr(node->attrs(), "index", &index);
         if (!status.ok()) return;
-        compile_time_const_args->at(index) = true;
+        if (compile_time_const_args) {
+          (*compile_time_const_args)[index] = true;
+        }
         return;
       }
       for (const Edge* pred : node->in_edges()) {
         if (!pred->IsControlEdge()) {
-          must_be_const.insert(pred->src());
+          (*compile_time_const_nodes)[pred->src()->id()] = true;
         }
       }
       return;
@@ -80,7 +89,7 @@ Status BackwardsConstAnalysis(const Graph& g,
       for (Edge const* edge : node->in_edges()) {
         if (edge->dst_input() >= name_range->second.first &&
             edge->dst_input() < name_range->second.second) {
-          must_be_const.insert(edge->src());
+          (*compile_time_const_nodes)[edge->src()->id()] = true;
         }
       }
     }
diff --git a/tensorflow/compiler/tf2xla/const_analysis.h b/tensorflow/compiler/tf2xla/const_analysis.h
index 634b97d7e3..af57e5a403 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.h
+++ b/tensorflow/compiler/tf2xla/const_analysis.h
@@ -23,10 +23,18 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Backwards dataflow analysis that finds arguments (_Arg nodes) to a graph that
-// must be compile-time constants.
+// Backwards dataflow analysis that finds nodes in a graph that must be
+// compile-time constants for us to be able to lower the graph to XLA.
+//
+// The indices of the arguments to `graph` that must be constant are returned in
+// `compile_time_const_arg_indices`, if `compile_time_const_arg_indices` is not
+// null.
+//
+// The ids of the nodes in `graph` that must be constant are returned in
+// `compile_time_const_nodes`, if `compile_time_const_nodes` is not null.
 Status BackwardsConstAnalysis(const Graph& graph,
-                              std::vector<bool>* compile_time_const_args);
+                              std::vector<bool>* compile_time_const_arg_indices,
+                              std::vector<bool>* compile_time_const_nodes);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc
index 992b12c06d..56065be894 100644
--- a/tensorflow/compiler/tf2xla/const_analysis_test.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -38,17 +39,23 @@ TEST(ConstAnalysisTest, Basics) {
   auto c = ops::Reshape(root, arg2, b);
   auto d = ops::Mul(root, c, ops::Sum(root, arg3, arg3));
 
-  Graph graph(OpRegistry::Global());
-  TF_ASSERT_OK(root.ToGraph(&graph));
+  FixupSourceAndSinkEdges(root.graph());
 
   std::vector<bool> const_args(4, false);
-  TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args));
+  std::vector<bool> const_nodes(root.graph()->num_node_ids(), false);
+  TF_ASSERT_OK(
+      BackwardsConstAnalysis(*root.graph(), &const_args, &const_nodes));
 
   // Arg 0 doesn't need to be constant since the graph only uses its shape.
   // Arg 1 must be constant because it flows to the shape argument of a Reshape.
   // Arg 2 is used only as the value input to a Reshape and need not be const.
   // Arg 3 is used as the reduction-indices argument to Sum and must be const.
   EXPECT_EQ(const_args, std::vector<bool>({false, true, false, true}));
+
+  EXPECT_FALSE(const_nodes[arg0.node()->id()]);
+  EXPECT_TRUE(const_nodes[arg1.node()->id()]);
+  EXPECT_FALSE(const_nodes[arg2.node()->id()]);
+  EXPECT_TRUE(const_nodes[arg3.node()->id()]);
 }
 
 // Regression test for a case where the backward const analysis did
@@ -73,7 +80,8 @@ TEST(ConstAnalysisTest, TopologicalOrder) {
     TF_ASSERT_OK(root.ToGraph(&graph));
 
     std::vector<bool> const_args(3, false);
-    TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args));
+    TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args,
+                                        /*compile_time_const_nodes=*/nullptr));
 
     EXPECT_EQ(const_args, std::vector<bool>({true, true, false}));
   }
@@ -93,7 +101,8 @@ TEST(ConstAnalysisTest, DontFollowControlDependencies) {
   TF_ASSERT_OK(root.ToGraph(&graph));
 
   std::vector<bool> const_args(2, false);
-  TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args));
+  TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args,
+                                      /*compile_time_const_nodes=*/nullptr));
 
   EXPECT_EQ(const_args, std::vector<bool>({false, true}));
 }
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc
new file mode 100644
index 0000000000..b5667ca0d3
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc
@@ -0,0 +1,1385 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/functionalize_cond.h"
+
+#include <algorithm>
+#include <deque>
+#include <stack>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/strings/str_join.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/jit/union_find.h"
+#include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/graph/node_builder.h"
+
+using xla::StatusOr;
+
+namespace tensorflow {
+namespace functionalize_cond {
+
+string DebugString(const CondStateMap::CondNode& node) {
+  return node.ToString();
+}
+
+// TODO(jpienaar): Move to OutputTensor.
+string DebugString(const OutputTensor& tensor) {
+  return strings::StrCat(tensor.node->name(), ":", tensor.index);
+}
+
+string DebugString(CondStateMap::CondId cond_state) {
+  if (cond_state == nullptr || cond_state->empty()) return "[]";
+  return strings::StrCat(
+      "[",
+      absl::StrJoin(*cond_state, ", ",
+                    [](string* output, const CondStateMap::CondNode& node) {
+                      strings::StrAppend(output, node.ToString());
+                    }),
+      "]");
+}
+
+string Branch_Name(BranchType b) {
+  switch (b) {
+    case BranchType::kElseBranch:
+      return "else";
+    case BranchType::kThenBranch:
+      return "then";
+    case BranchType::kBoth:
+      return "both";
+    case BranchType::kNeither:
+      return "neither";
+  }
+}
+
+// Returns the predicate of a switch.
+Status GetSwitchPredicate(const Node& switch_node, OutputTensor* pred) {
+  const Edge* pred_edge;
+  TF_RETURN_IF_ERROR(switch_node.input_edge(1, &pred_edge));
+  // The predicate can be preceded by a identity node. Look through
+  // identity nodes to predicate.
+  while (pred_edge->src()->IsIdentity()) {
+    TF_RETURN_IF_ERROR(pred_edge->src()->input_edge(0, &pred_edge));
+  }
+  *pred = OutputTensor(pred_edge->src(), pred_edge->src_output());
+  return Status::OK();
+}
+
+CondStateMap::CondNode::CondNode(Type type, Node* switch_node,
+                                 BranchType branch)
+    : type(type), branch(branch) {
+  if (type == Type::kSwitch) {
+    TF_CHECK_OK(GetSwitchPredicate(*switch_node, &predicate));
+  }
+}
+
+string CondStateMap::CondNode::ToString() const {
+  switch (type) {
+    case Type::kSwitch:
+      return strings::StrCat("s(", DebugString(predicate), ",",
+                             Branch_Name(branch), ")");
+    case Type::kMerge:
+      return "m";
+    case Type::kDead:
+      return "d";
+  }
+}
+
+bool CondStateMap::CondNode::operator==(const CondNode& other) const {
+  if (type != Type::kSwitch) return type == other.type;
+  return type == other.type && predicate == other.predicate &&
+         branch == other.branch;
+}
+
+bool CondStateMap::CondNode::operator!=(const CondNode& other) const {
+  return !(*this == other);
+}
+
+CondStateMap::CondStateMap(Graph* graph) {
+  node_to_condid_map_.resize(graph->num_node_ids());
+  // Initialize the dead state (empty state is designated with a nullptr).
+  dead_id_ = GetUniqueId({CondNode(CondStateMap::CondNode::Type::kDead)});
+}
+
+bool CondStateMap::IsDead(CondStateMap::CondId id) const {
+  return id == dead_id_;
+}
+
+bool CondStateMap::IsEmpty(CondStateMap::CondId id) const {
+  return id == nullptr;
+}
+
+size_t CondStateMap::CondHash::operator()(
+    const CondStateMap::CondNode& item) const {
+  return Hash64Combine(Hash64Combine(OutputTensor::Hash()(item.predicate),
+                                     hash<BranchType>()(item.branch)),
+                       hash<CondStateMap::CondNode::Type>()(item.type));
+}
+
+size_t CondStateMap::CondHash::operator()(
+    const CondStateMap::CondState& vec) const {
+  if (vec.empty()) return 0;
+  size_t h = (*this)(vec.front());
+  auto it = vec.begin();
+  for (++it; it != vec.end(); ++it) {
+    h = Hash64Combine(h, (*this)(*it));
+  }
+  return h;
+}
+
+// CondArgNode represents a input to the conditional and its corresponding
+// switch nodes.
+struct CondArgNode {
+  explicit CondArgNode(Node* src, int src_output)
+      : src(src), src_output(src_output) {}
+
+  string ToString() const {
+    return strings::StrCat("src=", src->name(), ":", src_output,
+                           " switches=", NodesToString(switches));
+  }
+
+  Node* src;
+  int src_output;
+  std::array<Node*, 2> branch_copy;
+  std::vector<Node*> switches;
+};
+using CondArgNodes = std::vector<CondArgNode>;
+
+string DebugString(const CondArgNodes& nodes) {
+  return strings::StrCat(
+      "[",
+      absl::StrJoin(nodes, ", ",
+                    [](string* output, const CondArgNode& node) {
+                      strings::StrAppend(output, node.ToString());
+                    }),
+      "]");
+}
+
+CondStateMap::CondId CondStateMap::LookupId(const Node* node) const {
+  if (node->id() < node_to_condid_map_.size())
+    return node_to_condid_map_[node->id()];
+  return added_node_mapping_.at(node->id());
+}
+
+CondStateMap::CondId CondStateMap::GetUniqueId(
+    const CondStateMap::CondState& state) {
+  if (state.empty()) return nullptr;
+  return &*condstate_set_.insert(state).first;
+}
+
+const CondStateMap::CondState& CondStateMap::LookupState(
+    const Node* node) const {
+  return *LookupId(node);
+}
+
+void CondStateMap::ResetId(const Node* node, CondStateMap::CondId id) {
+  if (node->id() < node_to_condid_map_.size())
+    node_to_condid_map_[node->id()] = id;
+  else
+    added_node_mapping_[node->id()] = id;
+}
+
+void CondStateMap::MarkDead(const Node* node) { ResetId(node, dead_id_); }
+
+string CondStateMap::CondStateToString(const Node* node) const {
+  return CondStateToString(LookupId(node));
+}
+
+string CondStateMap::CondStateToString(CondStateMap::CondId id) const {
+  return DebugString(id);
+}
+
+FunctionalizeCond::FunctionalizeCond(Graph* graph,
+                                     FunctionLibraryDefinition* library)
+    : cond_state_map_(graph), library_(library), graph_(graph) {}
+
+// Class representing the merge/switch nodes that will become a conditional.
+class Conditional {
+ public:
+  Conditional(OutputTensor predicate, FunctionalizeCond* parent,
+              CondStateMap* cond_state_map);
+
+  // Adds merge node that is part of this conditional.
+  Status AddMerge(Node* m);
+
+  // Constructs an If node from the merge nodes.
+  Status BuildAndReplace(Graph* graph, FunctionLibraryDefinition* library);
+
+ private:
+  // Extracts the then/else bodies: creates new graphs with the nodes
+  // corresponding to the nodes in the then/else branches as of this conditional
+  // as function bodies.
+  Status ExtractBodies(Graph* graph);
+
+  // Builds the arguments that are the input to the If.
+  Status BuildArgumentNodes();
+
+  // Builds the If node for the extracted bodies with the given predicate.
+  Status BuildIfNode(Graph* graph, FunctionLibraryDefinition* library);
+
+  // Adds input edges to If node.
+  Status AddInputEdges(Graph* graph);
+
+  // Adds output edges from If node.
+  Status AddOutputEdges(Graph* graph);
+
+  // Adds switch node that is part of this conditional.
+  Status AddSwitch(Node* s);
+
+  // Internal name of conditional. The name is based on the first merge node
+  // added.
+  string name() const;
+
+  // The FunctionalizeCond instance that created this.
+  FunctionalizeCond* parent_;
+
+  // Mapping between nodes and their cond state.
+  CondStateMap* cond_state_map_;
+
+  // The predicate of the conditional.
+  OutputTensor predicate_;
+
+  // The predicate of the switches of the conditional. This may be different
+  // than predicate (which is initialized from the original graph) as the
+  // predicate could be the output of a newly created If node.
+  OutputTensor switch_predicate_;
+
+  // Switch nodes in graph that are part of this conditional.
+  std::set<Node*, NodeCmpByNameResourcesLast> switches_;
+
+  // Merge nodes in graph that are part of this conditional.
+  std::set<Node*, NodeCmpByNameResourcesLast> merges_;
+
+  // Vector of control inputs from outside the conditional to a node inside.
+  std::vector<Node*> external_control_inputs_;
+  std::vector<Node*> external_control_outputs_;
+
+  // Graphs corresponding to the then and else branch.
+  std::array<std::unique_ptr<Graph>, 2> bodies_;
+
+  // Maps from graph_ to the branch body's graph.
+  std::array<std::vector<Node*>, 2> node_maps_;
+
+  // The argument nodes created for the switches.
+  CondArgNodes cond_arg_nodes_;
+
+  // The constructed If node.
+  Node* if_node_ = nullptr;
+
+  // Whether the merge nodes of this conditional have been replaced.
+  bool replaced_ = false;
+};
+
+Conditional::Conditional(OutputTensor predicate, FunctionalizeCond* parent,
+                         CondStateMap* cond_state_map)
+    : parent_(parent), cond_state_map_(cond_state_map), predicate_(predicate) {}
+
+Status Conditional::AddMerge(Node* m) {
+  merges_.insert(m);
+  return Status::OK();
+}
+
+Status Conditional::AddSwitch(Node* s) {
+  VLOG(5) << "Adding switch " << s->DebugString();
+  OutputTensor predicate;
+  TF_RETURN_IF_ERROR(GetSwitchPredicate(*s, &predicate));
+  if (switch_predicate_.node == nullptr) switch_predicate_ = predicate;
+  if (!(switch_predicate_ == predicate)) {
+    return errors::InvalidArgument(
+        "Merge nodes ", NodesToString(merges_),
+        " directly dominated by switch nodes with different predicates (",
+        DebugString(switch_predicate_), " vs ", DebugString(predicate), ").");
+  }
+  switches_.insert(s);
+  return Status::OK();
+}
+
+Status Conditional::BuildArgumentNodes() {
+  VLOG(1) << "Build function arguments";
+  struct Hash {
+    size_t operator()(const std::pair<Node*, int>& item) const {
+      return Hash64Combine(hash<Node*>()(item.first),
+                           std::hash<int>()(item.second));
+    }
+  };
+
+  std::unordered_map<std::pair<Node*, int>, int, Hash> input_index;
+  for (Node* switch_node : switches_) {
+    const Edge* e;
+    TF_RETURN_IF_ERROR(switch_node->input_edge(0, &e));
+    std::pair<Node*, int> key = std::make_pair(e->src(), e->src_output());
+    if (input_index.find(key) == input_index.end()) {
+      input_index[key] = cond_arg_nodes_.size();
+      cond_arg_nodes_.emplace_back(key.first, key.second);
+    }
+    cond_arg_nodes_.at(input_index.at(key)).switches.push_back(switch_node);
+  }
+  VLOG(5) << "CondArg nodes created: " << DebugString(cond_arg_nodes_);
+
+  int arg_count = 0;
+  for (CondArgNode& cond_arg_node : cond_arg_nodes_) {
+    DataType dtype = cond_arg_node.src->output_type(cond_arg_node.src_output);
+    for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) {
+      int branch_index = static_cast<int>(branch);
+      TF_RETURN_IF_ERROR(
+          NodeBuilder(strings::StrCat("_Arg", arg_count),
+                      FunctionLibraryDefinition::kArgOp)
+              .Attr("T", dtype)
+              .Attr("index", arg_count)
+              .Finalize(bodies_[branch_index].get(),
+                        &cond_arg_node.branch_copy[branch_index]));
+    }
+    for (Node* node : cond_arg_node.switches) {
+      for (const Edge* e : node->out_edges()) {
+        if (e->IsControlEdge()) continue;
+        int branch_index = e->src_output();
+        Node* src_copy = cond_arg_node.branch_copy[branch_index];
+        Node* dst_copy = node_maps_[branch_index][e->dst()->id()];
+
+        // The graph may contain dead switch nodes,
+        if (dst_copy == nullptr) continue;
+
+        TF_RET_CHECK(dst_copy != nullptr)
+            << "Unable to find copied node for " << e->dst()->DebugString()
+            << " on branch " << Branch_Name(BranchType(branch_index));
+        // If the input goes directly to a merge then the merge has
+        // been replaced by a retval so the dst input is 0 instead of
+        // dst_input.
+        int dst_input = IsMerge(e->dst()) ? 0 : e->dst_input();
+        bodies_[branch_index]->AddEdge(src_copy, 0, dst_copy, dst_input);
+      }
+    }
+    ++arg_count;
+  }
+
+  // Verify that all retvals have an input.
+  // TODO(jpienaar): One could add a ZerosLike in the branch that doesn't have
+  // input.
+  for (Node* m : merges_) {
+    for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) {
+      bool has_input = false;
+      for (auto e : node_maps_[static_cast<int>(branch)][m->id()]->in_edges()) {
+        if (!e->IsControlEdge()) {
+          has_input = true;
+          break;
+        }
+      }
+      if (!has_input) {
+        return errors::Internal(
+            "Failed to functionalize control flow with merge ",
+            FormatNodeForError(*m), " that doesn't have input on ",
+            Branch_Name(branch), " branch.");
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Conditional::ExtractBodies(Graph* graph) {
+  VLOG(2) << "Extracting bodies for " << name();
+  for (auto b : {BranchType::kElseBranch, BranchType::kThenBranch}) {
+    bodies_[static_cast<int>(b)] =
+        absl::make_unique<Graph>(graph->op_registry());
+  }
+
+  auto find_branch = [&](const Edge* e) {
+    const auto& id = cond_state_map_->LookupId(e->src());
+    return IsSwitch(e->src()) ? BranchType(e->src_output())
+                              : cond_state_map_->FindBranchOf(id, predicate_);
+  };
+
+  std::array<std::vector<Node*>, 2> stacks;
+  VLOG(5) << "Merges: " << NodesToString(merges_);
+  for (Node* m : merges_) {
+    VLOG(5) << "For merge: " << m->DebugString() << " "
+            << cond_state_map_->CondStateToString(m);
+    for (auto e : m->in_edges()) {
+      if (e->IsControlEdge()) continue;
+      BranchType branch = find_branch(e);
+      TF_RET_CHECK(branch == BranchType::kThenBranch ||
+                   branch == BranchType::kElseBranch)
+          << "Error: " << e->src()->name()
+          << " is not on either then or else branch (" << Branch_Name(branch)
+          << ").";
+      Node* src = e->src();
+      if (IsSwitch(src)) {
+        // Switch node outputs and dependencies are handled separately.
+        TF_RETURN_IF_ERROR(AddSwitch(src));
+      } else {
+        stacks[static_cast<int>(branch)].push_back(src);
+      }
+    }
+  }
+
+  for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) {
+    int branch_index = static_cast<int>(branch);
+    auto output = bodies_[branch_index].get();
+    auto& stack = stacks[branch_index];
+    VLOG(5) << "In branch: " << Branch_Name(branch) << " "
+            << NodesToString(stack);
+    std::vector<bool> visited(graph->num_node_ids(), false);
+    node_maps_[branch_index].resize(graph->num_node_ids(), nullptr);
+    auto& node_map = node_maps_[branch_index];
+
+    while (!stack.empty()) {
+      Node* n = stack.back();
+      stack.pop_back();
+
+      if (visited.at(n->id())) continue;
+      visited[n->id()] = true;
+
+      // Verify output edges and record control edges exitting scope.
+      for (const Edge* e : n->out_edges()) {
+        Node* dst = e->dst();
+        if (IsMerge(dst)) continue;
+        Node* src = e->src();
+
+        auto dst_id = cond_state_map_->LookupId(dst);
+        auto src_id = cond_state_map_->LookupId(src);
+        if (dst_id != src_id) {
+          if (e->IsControlEdge()) {
+            external_control_outputs_.push_back(e->src());
+          } else {
+            // Constants are treated specially to workaround the case of
+            // non-dominated constant nodes.
+            if (!IsConstant(src)) {
+              // TODO(b/78882471): A node that feeds into two different
+              // CondState is not necessarily an error so log a warning for now
+              // but revisit to improve the testing to enable making this an
+              // error.
+              LOG(WARNING) << errors::InvalidArgument(
+                  "Graph contains node ", FormatNodeForError(*src),
+                  " that feeds into node ", FormatNodeForError(*dst),
+                  " but these nodes are in different control contexts (",
+                  DebugString(src_id), " vs ", DebugString(dst_id),
+                  " (detected during out edge testing)");
+            }
+          }
+        }
+      }
+
+      // Copying incomming edges to dst node.
+      for (const Edge* e : n->in_edges()) {
+        Node* src = e->src();
+        // Skip src/dst node.
+        if (!src->IsOp()) continue;
+
+        Node* dst = e->dst();
+        if (IsSwitch(src)) {
+          // Switch node outputs and dependencies are handled separately.
+          TF_RETURN_IF_ERROR(AddSwitch(src));
+          continue;
+        }
+
+        // Verify input is from the same context.
+        auto src_id = cond_state_map_->LookupId(src);
+        auto dst_id = cond_state_map_->LookupId(dst);
+        if (IsMerge(dst) || src_id == dst_id) {
+          // TODO(jpienaar): The merge case can be more strict.
+          if (node_map.at(src->id()) == nullptr) {
+            node_map.at(src->id()) = output->CopyNode(src);
+            stack.push_back(src);
+          }
+        } else if (e->IsControlEdge()) {
+          external_control_inputs_.push_back(src);
+        } else {
+          // This shouldn't happen, this means we have an external data input
+          // not entering via a switch node. Work around this for constant
+          // nodes as some constant nodes are inserted without the required
+          // control context dominance.
+          if (IsConstant(src)) {
+            node_map.at(src->id()) = output->CopyNode(src);
+          } else {
+            return errors::InvalidArgument(
+                "Graph contains node ", FormatNodeForError(*src),
+                " that feeds into node ", FormatNodeForError(*dst),
+                " but these nodes are in different control contexts (",
+                DebugString(src_id), " vs ", DebugString(dst_id),
+                " (detected during in edge testing)");
+          }
+        }
+
+        Node* src_copy = node_map.at(e->src()->id());
+        int src_output = e->src_output();
+        if (node_map.at(dst->id()) == nullptr) {
+          node_map.at(dst->id()) = output->CopyNode(dst);
+        }
+        Node* dst_copy = node_map.at(e->dst()->id());
+        if (e->IsControlEdge()) {
+          // Skip control inputs from external context.
+          if (src_copy != nullptr) output->AddControlEdge(src_copy, dst_copy);
+        } else {
+          output->AddEdge(src_copy, src_output, dst_copy, e->dst_input());
+        }
+      }
+    }
+  }
+
+  // Build return values from the merge nodes.
+  int index = 0;
+  for (Node* m : merges_) {
+    for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) {
+      int branch_index = static_cast<int>(branch);
+      auto& node_map = node_maps_[branch_index];
+      auto output = bodies_[branch_index].get();
+      TF_ASSIGN_OR_RETURN(node_map[m->id()],
+                          BuildRetvalNode(output, m->output_type(0), index));
+    }
+    ++index;
+
+    // Connect the input to the merge_ with the retval, except if it is a
+    // Swich node, which is handled separately.
+    for (auto e : m->in_edges()) {
+      if (e->IsControlEdge()) continue;
+      int branch_index = static_cast<int>(find_branch(e));
+      auto& node_map = node_maps_[branch_index];
+      auto output = bodies_[branch_index].get();
+      Node* in = e->src();
+      if (!IsSwitch(in)) {
+        if (node_map.at(in->id()) == nullptr) {
+          node_map[in->id()] = output->CopyNode(in);
+        }
+        output->AddEdge(node_map[in->id()], e->src_output(),
+                        node_map.at(m->id()), 0);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status Conditional::BuildIfNode(Graph* graph,
+                                FunctionLibraryDefinition* library) {
+  VLOG(2) << "Build cond function for " << name();
+  NodeDefBuilder builder(name(), "If");
+  const string branch_name[] = {"else_branch", "then_branch"};
+  for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) {
+    int branch_index = static_cast<int>(branch);
+    static std::atomic<int64> sequence_num(0LL);
+    int64 id = ++sequence_num;
+
+    NameAttrList body_name;
+    body_name.set_name(strings::StrCat("_functionalize_if_",
+                                       branch_name[branch_index], "_", id));
+
+    VLOG(3) << "FunctionalizeControlFlow (" << branch_name[branch_index]
+            << "): "
+            << dump_graph::DumpGraphToFile(
+                   "functionalize_cond_body_" + branch_name[branch_index],
+                   *bodies_[branch_index], nullptr);
+
+    FunctionDef body_fdef;
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(*bodies_[branch_index],
+                                          body_name.name(), &body_fdef));
+    TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef));
+    builder.Attr(branch_name[branch_index], body_name);
+  }
+
+  VLOG(3) << "Build input type";
+  std::vector<NodeDefBuilder::NodeOut> inputs;
+  DataTypeVector in_arg_types;
+  for (auto& kv : cond_arg_nodes_) {
+    bool inserted = false;
+    for (const Node* arg : kv.switches) {
+      const Edge* in_edge;
+      TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
+      if (in_edge->IsControlEdge()) {
+        builder.ControlInput(in_edge->src()->name());
+      } else {
+        if (!inserted) {
+          DataType dtype = arg->input_type(0);
+          inputs.emplace_back(NodeDefBuilder::NodeOut(
+              in_edge->src()->name(), in_edge->src_output(), dtype));
+          in_arg_types.push_back(dtype);
+          inserted = true;
+        }
+      }
+    }
+  }
+  builder.Attr("Tin", in_arg_types);
+
+  DataTypeVector out_type;
+  for (const Node* merge : merges_) {
+    DataType dtype = merge->output_type(0);
+    out_type.push_back(dtype);
+  }
+  builder.Attr("Tout", out_type);
+  VLOG(3) << "Build output type: " << DataTypeVectorString(out_type);
+
+  builder.Attr("Tcond", DT_BOOL);
+  builder.Device(predicate_.node->assigned_device_name());
+  // Conditional should be the first input ...
+  builder.Input(NodeDefBuilder::NodeOut(predicate_.node->name(),
+                                        predicate_.index,
+                                        predicate_.node->output_type(0)));
+  // ... followed by the other inputs.
+  builder.Input(inputs);
+
+  VLOG(3) << "Build If node";
+  NodeDef if_def;
+  TF_RETURN_IF_ERROR(builder.Finalize(&if_def));
+  TF_ASSIGN_OR_RETURN(if_node_, parent_->AddIfNode(if_def, *merges_.begin()));
+
+  return Status::OK();
+}
+
+Status Conditional::AddInputEdges(Graph* graph) {
+  VLOG(2) << "AddInputEdges for " << if_node_->name();
+  int index = 0;
+  // Add predicate input.
+  graph->AddEdge(const_cast<Node*>(predicate_.node), predicate_.index, if_node_,
+                 index++);
+  // Add function body inputs.
+  for (auto& arg : cond_arg_nodes_) {
+    if (arg.src_output == Graph::kControlSlot) {
+      graph->AddControlEdge(arg.src, if_node_);
+    } else {
+      graph->AddEdge(arg.src, arg.src_output, if_node_, index++);
+    }
+  }
+  for (Node* n : external_control_inputs_) {
+    graph->AddControlEdge(n, if_node_);
+  }
+  return Status::OK();
+}
+
+Status Conditional::AddOutputEdges(Graph* graph) {
+  VLOG(2) << "AddOutputEdges for " << if_node_->name();
+  int i = 0;
+  for (Node* node : merges_) {
+    TF_RETURN_IF_ERROR(parent_->AddIdentityNode(node, if_node_, i));
+    std::vector<const Edge*> edges(node->out_edges().begin(),
+                                   node->out_edges().end());
+    for (const Edge* edge : edges) {
+      Node* dst = edge->dst();
+      int dst_input = edge->dst_input();
+      if (edge->src_output() > 0) {
+        return errors::Unimplemented("Output of index (", edge->src_output(),
+                                     ") of merge node ",
+                                     FormatNodeForError(*node));
+      }
+
+      bool control_edge = edge->IsControlEdge();
+      graph->RemoveEdge(edge);
+      if (control_edge) {
+        graph->AddControlEdge(if_node_, dst);
+      } else {
+        graph->AddEdge(if_node_, i, dst, dst_input);
+      }
+    }
+    ++i;
+  }
+  for (Node* n : external_control_outputs_) {
+    graph->AddControlEdge(if_node_, n);
+  }
+
+  return Status::OK();
+}
+
+Status Conditional::BuildAndReplace(Graph* graph,
+                                    FunctionLibraryDefinition* library) {
+  VLOG(1) << "Build If and replace merge nodes " << name();
+  if (replaced_) return Status::OK();
+
+  TF_RETURN_IF_ERROR(ExtractBodies(graph));
+  TF_RETURN_IF_ERROR(BuildArgumentNodes());
+
+  if (VLOG_IS_ON(3)) {
+    LOG(INFO) << "Extracted bodies:";
+    for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) {
+      int branch_index = static_cast<int>(branch);
+      auto output = bodies_[branch_index].get();
+      LOG(INFO) << Branch_Name(branch) << ": "
+                << DebugString(output->ToGraphDefDebug());
+    }
+  }
+
+  TF_RETURN_IF_ERROR(BuildIfNode(graph, library));
+  TF_RETURN_IF_ERROR(AddInputEdges(graph));
+  TF_RETURN_IF_ERROR(AddOutputEdges(graph));
+  TF_RETURN_IF_ERROR(parent_->PropagateUpdatedState(if_node_));
+  for (Node* m : merges_) cond_state_map_->MarkDead(m);
+
+  // Check that the if_node doesn't feed into itself.
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      CheckNodeNotInCycle(if_node_, graph->num_node_ids()),
+      "Converting to If failed.");
+
+  replaced_ = true;
+  return Status::OK();
+}
+
+string Conditional::name() const {
+  CHECK(!merges_.empty());
+  return strings::StrCat((*merges_.begin())->name(), "_if");
+}
+
+bool CondStateMap::ScopeIn(CondStateMap::CondId id,
+                           CondStateMap::CondId* scope) {
+  if (id == nullptr) {
+    *scope = nullptr;
+    return true;
+  }
+  CondState state;
+  for (const CondNode& node : *id) {
+    if (node.type == CondNode::Type::kSwitch) {
+      state.push_back(node);
+    }
+    if (node.type == CondNode::Type::kMerge) {
+      if (state.empty()) {
+        return false;
+      }
+      DCHECK(state.back().type == CondNode::Type::kSwitch &&
+             state.back().branch == BranchType::kBoth);
+      state.pop_back();
+    }
+  }
+  *scope = GetUniqueId(state);
+  return true;
+}
+
+Status FunctionalizeCond::AddIdentityNode(const Node* replacee, Node* if_node,
+                                          int port) {
+  Node* id;
+  TF_RETURN_IF_ERROR(NodeBuilder(replacee->name(), "Identity")
+                         .Input(if_node, port)
+                         .Finalize(graph_, &id));
+  cond_state_map_.ResetId(id, cond_state_map_.LookupId(if_node));
+  return Status::OK();
+}
+
+StatusOr<Node*> FunctionalizeCond::AddIfNode(const NodeDef& def,
+                                             const Node* replacee) {
+  Status status;
+  Node* ret = graph_->AddNode(def, &status);
+  TF_RETURN_IF_ERROR(status);
+  CondStateMap::CondState state = cond_state_map_.LookupState(replacee);
+  state.pop_back();
+  VLOG(1) << "Adding If for " << replacee->name();
+  cond_state_map_.ResetId(ret, cond_state_map_.GetUniqueId(state));
+  return ret;
+}
+
+Status FunctionalizeCond::PropagateUpdatedState(const Node* replacee) {
+  VLOG(2) << "Propagating update state for " << replacee->name() << " "
+          << cond_state_map_.CondStateToString(replacee);
+  // Redo topological sort as the order could have changed.
+  // TODO(jpienaar): The original topological order could also be updated
+  // dynamically if needed.
+  std::vector<Node*> rev_topo_order;
+  GetPostOrder(*graph_, &rev_topo_order);
+
+  // All the outputs of the new node could potentially be updated.
+  std::unordered_set<Node*> changed;
+  for (auto n : replacee->out_nodes())
+    if (n->IsOp()) changed.insert(n);
+
+  // Iterate through the changed/possible changed nodes in topological order.
+  for (auto it = rev_topo_order.rbegin();
+       it != rev_topo_order.rend() && !changed.empty(); ++it) {
+    if (changed.find(*it) != changed.end()) {
+      // Update the node state.
+      Node* n = *it;
+      CondStateMap::CondId old_state = cond_state_map_.LookupId(n);
+      cond_state_map_.ResetId(n, nullptr);
+      TF_RETURN_IF_ERROR(DetermineCondState(n));
+      if (cond_state_map_.LookupId(n) != old_state) {
+        for (auto out : n->out_nodes())
+          if (out->IsOp()) changed.insert(out);
+      }
+      changed.erase(n);
+    }
+  }
+  return Status::OK();
+}
+
+// Returns the most restrictive branch of two branches or neither. This is the
+// meet operator of the BranchType lattice.
+BranchType MeetBranch(const BranchType& lhs, const BranchType& rhs) {
+  if (lhs == rhs) return lhs;
+  if (lhs == BranchType::kNeither) return rhs;
+  if (rhs == BranchType::kNeither) return lhs;
+  if (lhs == BranchType::kBoth) return rhs;
+  if (rhs == BranchType::kBoth) return lhs;
+  return BranchType::kNeither;
+}
+
+CondStateMap::ContainsResult CondStateMap::LhsHoldsWhereverRhsHolds(
+    CondStateMap::CondId lhs, CondStateMap::CondId rhs) {
+  CondId lhs_scope;
+  CondId rhs_scope;
+  bool could_determine_scope = ScopeIn(lhs, &lhs_scope);
+  could_determine_scope = could_determine_scope && ScopeIn(rhs, &rhs_scope);
+  if (!could_determine_scope) return kIncomparable;
+
+  // Returns whether a contains b.
+  auto contains = [&](CondId a, CondId b) {
+    // Handle empty states.
+    if (a == nullptr && b != nullptr) return true;
+    if (a == nullptr && b == nullptr) return true;
+    if (a != nullptr && b == nullptr) return false;
+
+    if (a->size() > b->size()) return false;
+    auto a_it = a->begin();
+    auto b_it = b->begin();
+    while (a_it != a->end()) {
+      if (*a_it != *b_it) {
+        if (!(a_it->predicate == b_it->predicate)) return false;
+        BranchType mb = MeetBranch(a_it->branch, b_it->branch);
+        if (mb != b_it->branch) return false;
+      }
+      ++a_it;
+      ++b_it;
+    }
+    return true;
+  };
+
+  bool lhs_contains_rhs = contains(lhs_scope, rhs_scope);
+  bool rhs_contains_lhs = contains(rhs_scope, lhs_scope);
+  if (lhs_contains_rhs && rhs_contains_lhs) return kEqual;
+  if (lhs_contains_rhs) return kLhsContainsRhs;
+  if (rhs_contains_lhs) return kRhsContainsLhs;
+  return kIncomparable;
+}
+
+BranchType CondStateMap::FindBranchOf(CondId id, OutputTensor predicate) const {
+  if (IsEmpty(id)) return BranchType::kNeither;
+  absl::optional<BranchType> b;
+  const CondState& nodes = *id;
+  for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) {
+    if (it->type == CondStateMap::CondNode::Type::kSwitch &&
+        it->predicate == predicate) {
+      if (b.has_value()) {
+        b = MeetBranch(*b, it->branch);
+      } else {
+        b = it->branch;
+      }
+      if (*b == BranchType::kNeither) {
+        LOG(FATAL) << "Inconsistent state for node: " << DebugString(id);
+      }
+    }
+  }
+  return b.has_value() ? *b : BranchType::kNeither;
+}
+
+StatusOr<CondStateMap::CondId> FunctionalizeCond::JoinCondStatesNonMerge(
+    CondStateMap::CondId src, CondStateMap::CondId dst) {
+  VLOG(4) << "Joining src=" << DebugString(src) << " [" << src
+          << "] and dst=" << DebugString(dst) << " [" << dst << "]";
+
+  if (cond_state_map_.IsEmpty(dst) || cond_state_map_.IsDead(src)) return src;
+  if (cond_state_map_.IsDead(dst)) return dst;
+
+  // Nothing to do if the CondState is the same.
+  if (src == dst) return src;
+
+  CondStateMap::CondId src_scope;
+  CondStateMap::CondId dst_scope;
+  if (!cond_state_map_.ScopeIn(src, &src_scope))
+    return errors::Unimplemented(
+        "Predicates that must hold for node to execute are invalid! ",
+        DebugString(src));
+  if (!cond_state_map_.ScopeIn(dst, &dst_scope))
+    return errors::Unimplemented(
+        "Predicates that must hold for node to execute are invalid! ",
+        DebugString(dst));
+
+  auto result = cond_state_map_.LhsHoldsWhereverRhsHolds(src_scope, dst_scope);
+  switch (result) {
+    case CondStateMap::kIncomparable:
+      return errors::InvalidArgument(
+          "Graph contains node with inputs predicated on incompatible "
+          "predicates: ",
+          DebugString(src), " and ", DebugString(dst));
+    case CondStateMap::kEqual:
+      // If both respect the same predicates, propagate the longer constraint.
+      if ((src != nullptr && dst == nullptr) ||
+          (src != nullptr && dst != nullptr && src->size() > dst->size()))
+        return src;
+      else
+        return dst;
+    case CondStateMap::kLhsContainsRhs:
+      // src contains dst, so dst is already more restrictive.
+      return dst;
+    case CondStateMap::kRhsContainsLhs:
+      // dst contains src, so src is more restrictive.
+      return src;
+  }
+}
+
+StatusOr<CondStateMap::CondState::const_iterator>
+FindThenElseSwitchForPredicate(const OutputTensor& pred,
+                               CondStateMap::CondId id) {
+  for (auto it = id->begin(); it != id->end(); ++it) {
+    // Along every path one there can be only one instance of a then or else
+    // switch for a given predicate, so return once found.
+    if (it->type == CondStateMap::CondNode::Type::kSwitch &&
+        it->predicate == pred &&
+        (it->branch == BranchType::kThenBranch ||
+         it->branch == BranchType::kElseBranch))
+      return it;
+  }
+  return errors::Internal("Unable to find then/else branch with predicate ",
+                          DebugString(pred), " for ", DebugString(id));
+}
+
+StatusOr<CondStateMap::CondId> FunctionalizeCond::JoinCondStatesMerge(
+    CondStateMap::CondId src, CondStateMap::CondId dst) {
+  // Determine the flow state when joining two states for a merge
+  // node. Combining the two states for a merge node is effectively performing a
+  // disjunction of the states along the different input edges. For a merge that
+  // can be transformed into a If the two inputs paths have to have a predicate
+  // on which they differ (e.g., along one edge predicate `p` has to hold while
+  // on another it should not). This function first determines this predicate
+  // and then the resultant state is the common path between the two inputs
+  // followed by s(p, both).
+  VLOG(4) << "Joining (for merge) " << DebugString(src) << " and "
+          << DebugString(dst);
+  if (cond_state_map_.IsEmpty(dst)) return src;
+
+  if (cond_state_map_.IsDead(src)) return src;
+  if (cond_state_map_.IsDead(dst)) return dst;
+
+  CondStateMap::CondId src_scope;
+  CondStateMap::CondId dst_scope;
+  if (!cond_state_map_.ScopeIn(src, &src_scope))
+    return errors::Unimplemented(
+        "Predicates that must hold for node to execute are invalid! ",
+        DebugString(src));
+  if (!cond_state_map_.ScopeIn(dst, &dst_scope))
+    return errors::Unimplemented(
+        "Predicates that must hold for node to execute are invalid! ",
+        DebugString(dst));
+
+  TF_RET_CHECK(src_scope != nullptr && dst_scope != nullptr)
+      << "Illegal merge inputs from outer scope: src=" << DebugString(src)
+      << " dst=" << DebugString(dst);
+  auto src_it = src_scope->begin();
+  auto dst_it = dst_scope->begin();
+
+  // Find branch divergent condition.
+  OutputTensor pred;
+  while (src_it != src_scope->end() && dst_it != dst_scope->end()) {
+    if (*src_it != *dst_it) {
+      VLOG(5) << "Diverges with: " << DebugString(*src_it) << " and "
+              << DebugString(*dst_it);
+      if (!(src_it->predicate == dst_it->predicate)) {
+        return errors::InvalidArgument(
+            "Unable to find common predicate which holds for one input "
+            "but not the other of the merge node.");
+      }
+      pred = src_it->predicate;
+      break;
+    }
+    ++src_it;
+    ++dst_it;
+  }
+
+  if (pred.node == nullptr)
+    return errors::InvalidArgument("Unable to determine predicate for merge.");
+
+  TF_ASSIGN_OR_RETURN(auto div_src_it,
+                      FindThenElseSwitchForPredicate(pred, src));
+  TF_ASSIGN_OR_RETURN(auto div_dst_it,
+                      FindThenElseSwitchForPredicate(pred, dst));
+  TF_RET_CHECK(*div_src_it != *div_dst_it);
+
+  CondStateMap::CondState result;
+  // Populate result with the longest/most restrictive path up to the divergent
+  // node. For example, if the one input is `[switch(pred:0, then)]` and the
+  // other is `[switch(pred:0, both), merge, switch(pred:0, else)]` (as created
+  // in gradient of cond test), then the resultant state here should be
+  // `[switch(pred:0, both), merge, switch(pred:0, both)]`.
+  if (std::distance(src->begin(), div_src_it) >
+      std::distance(dst->begin(), div_dst_it)) {
+    result.assign(src->begin(), std::next(div_src_it));
+  } else {
+    result.assign(dst->begin(), std::next(div_dst_it));
+  }
+  result.back().branch = BranchType::kBoth;
+  return cond_state_map_.GetUniqueId(result);
+}
+
+CondStateMap::CondId FunctionalizeCond::StateAlongEdge(const Edge* e) {
+  Node* src = e->src();
+  CondStateMap::CondId id = cond_state_map_.LookupId(e->src());
+  if (IsMerge(src)) {
+    CondStateMap::CondState state;
+    if (id != nullptr) state = *id;
+    state.emplace_back(CondStateMap::CondNode::Type::kMerge);
+    return cond_state_map_.GetUniqueId(state);
+  }
+  if (IsSwitch(src)) {
+    CondStateMap::CondState state;
+    if (id != nullptr) state = *id;
+    if (e->IsControlEdge()) {
+      state.emplace_back(CondStateMap::CondNode::Type::kSwitch, src,
+                         BranchType::kBoth);
+    } else {
+      state.emplace_back(CondStateMap::CondNode::Type::kSwitch, src,
+                         BranchType(e->src_output()));
+    }
+    return cond_state_map_.GetUniqueId(state);
+  }
+  return id;
+}
+
+Status FunctionalizeCond::DetermineCondStateMerge(Node* dst) {
+  // Only Merge nodes with two inputs are supported, but if this is a redundant
+  // merge, then the dead edge may already have been removed (if due to a
+  // switch) and so the input count would be incorrect.
+  if (cond_state_map_.IsDead(cond_state_map_.LookupId(dst)))
+    return Status::OK();
+
+  int data_inputs = 0;
+  for (auto e : dst->in_edges()) {
+    Node* src = e->src();
+    VLOG(5) << "Processing forward flow for merge: " << e->DebugString() << " "
+            << cond_state_map_.CondStateToString(src);
+    if (!src->IsOp()) continue;
+    if (!e->IsControlEdge()) ++data_inputs;
+
+    CondStateMap::CondId prop = StateAlongEdge(e);
+    auto id_or = JoinCondStatesMerge(prop, cond_state_map_.LookupId(dst));
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(id_or.status(), "for node ",
+                                    FormatNodeForError(*dst));
+    cond_state_map_.ResetId(dst, id_or.ValueOrDie());
+  }
+
+  // Incomplete Merge nodes are not supported.
+  if (data_inputs != 2) {
+    return errors::Unimplemented(
+        dst->name(), " only has ", data_inputs,
+        " inputs, while only merge nodes with two inputs supported.");
+  }
+  return Status::OK();
+}
+
+Status FunctionalizeCond::DetermineCondState(Node* dst) {
+  // The logic for the merge and non-merge case differ: for non-merge it is
+  // the most restrictive CondState, while for merge nodes the
+  // resultant state is less restrictive than either.
+  if (IsMerge(dst)) {
+    TF_RETURN_IF_ERROR(DetermineCondStateMerge(dst));
+  } else {
+    // Handle non-merge join.
+    for (auto e : dst->in_edges()) {
+      VLOG(5) << "Processing forward flow for: " << e->DebugString() << " "
+              << cond_state_map_.CondStateToString(dst);
+      Node* src = e->src();
+      if (!src->IsOp()) continue;
+
+      // Joining the state between the current and propagated state.
+      CondStateMap::CondId prop = StateAlongEdge(e);
+      auto id_or = JoinCondStatesNonMerge(prop, cond_state_map_.LookupId(dst));
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(id_or.status(), "for node ",
+                                      FormatNodeForError(*dst));
+      cond_state_map_.ResetId(dst, id_or.ValueOrDie());
+    }
+  }
+  return Status::OK();
+}
+
+Status FunctionalizeCond::RemoveRedundantMerge(Node* node) {
+  // Handle redundant merge nodes. A merge node is considered redundant if
+  // one input edge is dead while the other has a value.
+  if (!cond_state_map_.IsDead(cond_state_map_.LookupId(node)))
+    return Status::OK();
+
+  const Edge* non_dead_edge = nullptr;
+  for (auto e : node->in_edges()) {
+    if (e->IsControlEdge()) continue;
+    Node* src = e->src();
+
+    // Handle merge with dead state.
+    const auto& src_id = cond_state_map_.LookupId(src);
+    if (!cond_state_map_.IsDead(src_id)) {
+      non_dead_edge = e;
+      break;
+    }
+  }
+
+  if (non_dead_edge == nullptr) {
+    return errors::InvalidArgument("Merge node ", FormatNodeForError(*node),
+                                   " has no non-dead inputs.");
+  }
+  cond_state_map_.MarkDead(node);
+  delete_nodes_.push_back(node->id());
+  VLOG(5) << "removing redundant merge: " << node->name();
+  while (!node->out_edges().empty()) {
+    const Edge* oe = *node->out_edges().begin();
+    Node* dst_node = oe->dst();
+    int dst_port = oe->dst_input();
+    graph_->RemoveEdge(oe);
+    graph_->AddEdge(non_dead_edge->src(),
+                    dst_port == Graph::kControlSlot
+                        ? Graph::kControlSlot
+                        : non_dead_edge->src_output(),
+                    dst_node, dst_port);
+  }
+  return Status::OK();
+}
+
+Status FunctionalizeCond::RemoveRedundantSwitch(Node* node) {
+  // Handle redundant switch nodes. A switch node is considered redundant if
+  // the predicate of the switch already holds on the current branch. E.g., if
+  // p is the predicate of the switch but p is already known to hold on this
+  // branch, then the switch can be removed and the dead state propagated
+  // along one. The checking of predicate is based on the exact predicate
+  // (rather than boolean equivalence) and aimed at redundant switches as
+  // currently generated by gradient code.
+  OutputTensor pred;
+  TF_RETURN_IF_ERROR(GetSwitchPredicate(*node, &pred));
+  auto dst_id = cond_state_map_.LookupId(node);
+  BranchType b = cond_state_map_.FindBranchOf(dst_id, pred);
+  // Determine if we are already on a branch where the switch predicate is
+  // true/false.
+  if (b != BranchType::kThenBranch && b != BranchType::kElseBranch)
+    return Status::OK();
+
+  VLOG(5) << "Redundant switch " << node->name();
+  const Edge* value_edge;
+  TF_RETURN_IF_ERROR(node->input_edge(0, &value_edge));
+  Node* val_node = value_edge->src();
+  int val_port = value_edge->src_output();
+  while (!node->out_edges().empty()) {
+    auto e = *node->out_edges().begin();
+    Node* dst_node = e->dst();
+    int dst_input = e->dst_input();
+    int switch_branch = e->src_output();
+    graph_->RemoveEdge(e);
+    if (switch_branch == Graph::kControlSlot) {
+      if (IsMerge(dst_node)) {
+        auto id_or =
+            JoinCondStatesMerge(dst_id, cond_state_map_.LookupId(dst_node));
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(id_or.status(), "for node ",
+                                        FormatNodeForError(*dst_node));
+        cond_state_map_.ResetId(dst_node, id_or.ValueOrDie());
+      } else {
+        auto id_or =
+            JoinCondStatesNonMerge(dst_id, cond_state_map_.LookupId(dst_node));
+        TF_RETURN_IF_ERROR(id_or.status());
+        cond_state_map_.ResetId(dst_node, id_or.ValueOrDie());
+      }
+    } else if (BranchType(switch_branch) != b) {
+      cond_state_map_.MarkDead(dst_node);
+      delete_nodes_.push_back(dst_node->id());
+      continue;
+    }
+    graph_->AddEdge(
+        val_node,
+        switch_branch == Graph::kControlSlot ? Graph::kControlSlot : val_port,
+        dst_node, dst_input);
+  }
+  return Status::OK();
+}
+
+Status FunctionalizeCond::DetermineCondStates(
+    std::vector<Node*> rev_topo_order) {
+  // The state that is propagated along the given edge.
+  for (auto it = rev_topo_order.rbegin(); it != rev_topo_order.rend(); ++it) {
+    Node* dst = *it;
+    TF_RETURN_IF_ERROR(DetermineCondState(dst));
+    if (IsSwitch(dst)) TF_RETURN_IF_ERROR(RemoveRedundantSwitch(dst));
+    if (IsMerge(dst)) TF_RETURN_IF_ERROR(RemoveRedundantMerge(dst));
+
+    VLOG(5) << dst->name() << " :: " << cond_state_map_.CondStateToString(dst);
+  }
+  return Status::OK();
+}
+
+void FunctionalizeCond::DeleteReachableNodes() {
+  // Delete all nodes that have been extracted or are reachable from
+  // deleted/dead nodes. The input and outgoing edges should have already been
+  // removed.
+  std::vector<bool> deleted(graph_->num_node_ids(), false);
+  // Don't try to delete source or sink nodes.
+  deleted[graph_->kSourceId] = true;
+  deleted[graph_->kSinkId] = true;
+  while (!delete_nodes_.empty()) {
+    int d_id = delete_nodes_.front();
+    delete_nodes_.pop_front();
+    if (deleted[d_id]) continue;
+    Node* d = graph_->FindNodeId(d_id);
+    // Switch and Merge nodes could have been deleted already.
+    if (d == nullptr) continue;
+    for (const Edge* e : d->out_edges()) {
+      delete_nodes_.push_back(e->dst()->id());
+    }
+    deleted[d_id] = true;
+    graph_->RemoveNode(d);
+  }
+}
+
+void FunctionalizeCond::SortMergeNodes(std::vector<Node*>* merge_order) {
+  // Sort merge nodes by nesting depth.
+  using sort_pair = std::pair<int, Node*>;
+  std::vector<sort_pair> inner_to_outer_merge_order;
+  inner_to_outer_merge_order.reserve(merge_order->size());
+  for (auto it = merge_order->rbegin(); it != merge_order->rend(); ++it) {
+    Node* merge = *it;
+    CondStateMap::CondId id = cond_state_map_.LookupId(merge);
+    int depth = 0;
+    for (auto cond_node_it = id->begin(); cond_node_it != id->end();
+         ++cond_node_it) {
+      if (cond_node_it->type == CondStateMap::CondNode::Type::kSwitch &&
+          (cond_node_it->branch == BranchType::kThenBranch ||
+           cond_node_it->branch == BranchType::kElseBranch)) {
+        ++depth;
+      }
+    }
+    inner_to_outer_merge_order.emplace_back(depth, merge);
+  }
+  std::stable_sort(
+      inner_to_outer_merge_order.begin(), inner_to_outer_merge_order.end(),
+      [](sort_pair lhs, sort_pair rhs) { return lhs.first > rhs.first; });
+  merge_order->clear();
+  for (sort_pair t : inner_to_outer_merge_order) {
+    merge_order->push_back(t.second);
+  }
+}
+
+Status FunctionalizeCond::FunctionalizeInternal() {
+  // The general approach for converting a tf.cond (as lowered via switch/merge
+  // nodes) to a functional if is as follows:
+  // 1. Determine the topological order and collect all the switch and merge
+  // nodes in the graph;
+  // 2. Compute the predicates and dominance structure for all the nodes in the
+  // graph - this includes which predicate must be true for a op to execute
+  // (predicate values are considered directly rather than attempting to
+  // determine deeper equivalence). We shall refer to this structure as the
+  // CondState;
+  // 3. Sort the merge nodes by nesting depth;
+  // 4. Extract merge nodes together that have the same CondState and whose
+  // input nodes have the same state from the innermost to the outermost into
+  // IfOps; Note: In the above only nodes paths that converge to a merge node
+  // will be considered for removal.
+
+  // Perform a DFS over the graph and
+  // * Determine the reverse topological order of the nodes (there should be no
+  //   cycles at this point so the post-order numbering corresponds to the
+  //   reverse topological sorting);
+  // * Record reverse topological for merge and switch nodes;
+  std::vector<Node*> rev_topo_order;
+  std::vector<int> switch_ids;
+  std::vector<Node*> merge_order;
+  DFS(*graph_, nullptr, [&](Node* n) {
+    if (IsSwitch(n)) {
+      switch_ids.push_back(n->id());
+    }
+    if (IsMerge(n)) {
+      merge_order.push_back(n);
+    }
+    if (n->IsOp()) {
+      rev_topo_order.push_back(n);
+    }
+  });
+
+  // No merges to functionalize.
+  if (merge_order.empty()) {
+    // No merges mean no switch values consumed (as only considering values
+    // fetchable as output of merge);
+    for (auto it = switch_ids.begin(); it != switch_ids.end(); ++it) {
+      graph_->RemoveNode(graph_->FindNodeId(*it));
+    }
+    return Status::OK();
+  }
+
+  TF_RETURN_IF_ERROR(DetermineCondStates(std::move(rev_topo_order)));
+
+  if (VLOG_IS_ON(4)) DumpGraphWithCondState("cond_id");
+
+  // Sort the merge nodes from innermost outwards.
+  SortMergeNodes(&merge_order);
+
+  // Extract from innermost out.
+  for (auto it = merge_order.begin(); it != merge_order.end(); ++it) {
+    Node* merge = *it;
+    auto id = cond_state_map_.LookupId(merge);
+    if (cond_state_map_.IsDead(id)) continue;
+
+    // Construct a Conditional with the predicate of the merge (which is the
+    // last entry of the CondState for the merge) and this as parent.
+    DCHECK(id->back().predicate.node != nullptr);
+    Conditional cond(id->back().predicate, this, &cond_state_map_);
+    TF_RETURN_IF_ERROR(cond.AddMerge(merge));
+
+    // Find all merge nodes with the same CondId. This is done repeatedly as
+    // the CondId can change due replaced conditionals. E.g., the one branch
+    // could previously have had a conditional nested in it, and so would have
+    // had CondState with sub-state [switch(p,b),m] (where p is some predicate),
+    // post removing the nested conditional that sub-state would no longer be
+    // path of the propagated state along that path.
+    auto end = merge_order.end();
+    for (auto merge_candidate_it = std::next(it); merge_candidate_it != end;
+         ++merge_candidate_it) {
+      auto merge_candidate_it_id =
+          cond_state_map_.LookupId(*merge_candidate_it);
+      if (merge_candidate_it_id != id) continue;
+      TF_RETURN_IF_ERROR(cond.AddMerge(*merge_candidate_it));
+    }
+
+    TF_RETURN_IF_ERROR(cond.BuildAndReplace(graph_, library_));
+
+    if (VLOG_IS_ON(4)) DumpGraphWithCondState("after_extract");
+  }
+
+  // All remaining Switch nodes are not reachable from a Merge node and
+  // removed. This is to account for dead Switch nodes.
+  for (int s_id : switch_ids) delete_nodes_.push_back(s_id);
+  for (Node* m : merge_order) delete_nodes_.push_back(m->id());
+  DeleteReachableNodes();
+
+  return Status::OK();
+}
+
+void FunctionalizeCond::DumpGraphWithCondState(const string& name) {
+  const char* const kCondGroupDebugAttr = "_XlaFunctionalizeCondGroup";
+
+  for (Node* n : graph_->nodes()) {
+    n->ClearAttr(kCondGroupDebugAttr);
+    n->AddAttr(kCondGroupDebugAttr, cond_state_map_.CondStateToString(n));
+  }
+  LOG(INFO) << "FunctionalizeControlFlow (" << name << "): "
+            << dump_graph::DumpGraphToFile(
+                   strings::StrCat("functionalize_", name), *graph_, library_);
+}
+
+Status FunctionalizeCond::Functionalize(Graph* graph,
+                                        FunctionLibraryDefinition* library) {
+  VLOG(1) << "FunctionalizeCond::Functionalize";
+  FunctionalizeCond fc(graph, library);
+  return fc.FunctionalizeInternal();
+}
+
+}  // namespace functionalize_cond
+
+Status FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library) {
+  // FunctionalizeControlFlow is invoked for every function, so the loops's
+  // bodies and conditionals that were extracted into functions will be handled
+  // in successive invocations.
+  return functionalize_cond::FunctionalizeCond::Functionalize(graph, library);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.h b/tensorflow/compiler/tf2xla/functionalize_cond.h
new file mode 100644
index 0000000000..86436011c6
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.h
@@ -0,0 +1,248 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_
+#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_
+
+#include <deque>
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Functionalize all the switch-merge nodes of a loop-free graph into If
+// nodes. That is, attempt to transform every remaining switch and merge nodes
+// in the graph into If nodes.
+// Precondition: All while loops have been removed from graph.
+Status FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library);
+
+// Internal functions/classes exposed for testing purposes.
+namespace functionalize_cond {
+
+// All nodes are assumed to be either in no branch, then branch, else branch,
+// or both branches (such as merge nodes).
+// The code below relies on Else and Then being 0 and 1 (corresponding to the
+// switch outputs). Both and Neither are arbitrary.
+enum class BranchType {
+  kElseBranch = 0,
+  kThenBranch = 1,
+  kBoth = 2,
+  kNeither = 3,
+};
+
+// CondStateMap is responsible for mapping from each graph Node to a CondState,
+// where each CondState is the array of CondNodes (corresponding to switch,
+// merge or dead states) as described below.  For efficiency, this class interns
+// the CondState, so that CondState equality comparisons are simply pointer
+// comparisons.
+class CondStateMap {
+ public:
+  explicit CondStateMap(Graph* graph);
+
+  // Represents an entry in the CondState. An entry can either be the
+  // switch (along with predicate), merge, or dead:
+  // * switch node indicates a node that is executed along a branch with the
+  //   given predicate - a branch can be then, else or both;
+  // * merge node indicates that the node is executed as output of a merge;
+  // * dead indicates that this node can never be executed;
+  struct CondNode {
+    enum class Type { kSwitch = 1, kMerge = 2, kDead = 3 };
+
+    CondNode(Type type, Node* switch_node = nullptr,
+             BranchType branch = BranchType::kNeither);
+
+    string ToString() const;
+    bool operator==(const CondNode& other) const;
+    bool operator!=(const CondNode& other) const;
+
+    // Type of node.
+    Type type;
+
+    // Predicate and branch, only used when type is kSwitch.
+    OutputTensor predicate;
+    BranchType branch;
+  };
+
+  // A node in the graph is executed when multiple conditions hold. The order
+  // represents the nesting of the predicates that hold and is used when
+  // extracting the nested conditionals.
+  using CondState = std::vector<CondNode>;
+
+  // Every unique ID is mapped to a CondState.
+  using CondId = const CondState*;
+
+  // Returns the CondId for a given node.
+  CondId LookupId(const Node* node) const;
+
+  // Returns the unique CondId for CondState.
+  CondId GetUniqueId(const CondState& state);
+
+  // Returns the CondState for a Node.
+  // REQUIRES: node has a non-empty CondState.
+  const CondState& LookupState(const Node* node) const;
+
+  // Resets the CondId for a given node.
+  void ResetId(const Node* node, CondId id);
+
+  // Marks `node` as dead.
+  void MarkDead(const Node* node);
+
+  // Determine branch execution of CondState.
+  BranchType FindBranchOf(CondId id, OutputTensor predicate) const;
+
+  // Enum to represent whether one cond flow state contains another.
+  enum ContainsResult {
+    kIncomparable,
+    kEqual,
+    kLhsContainsRhs,
+    kRhsContainsLhs
+  };
+
+  // Returns whether the lhs CondState holds wherever rhs CondState hols. I.e.,
+  // [(p,t)] contains [(p,t), (r,t)].
+  ContainsResult LhsHoldsWhereverRhsHolds(CondId lhs, CondId rhs);
+
+  // Returns textual representation of node's CondState.
+  string CondStateToString(const Node* node) const;
+  string CondStateToString(CondId id) const;
+
+  // Returns whether the cond state is the dead state.
+  bool IsDead(CondId id) const;
+
+  // Returns whether the cond state is the empty state.
+  bool IsEmpty(CondId id) const;
+
+  // Computes the predicates that have to hold for a node to execute and returns
+  // whether it was possible to determine the predicates that must hold. `scope`
+  // is populated with these predicates. Scope differs from state in that it
+  // does not include merge and both nodes.
+  bool ScopeIn(CondId id, CondId* scope);
+
+ private:
+  // Hash for CondNode and CondState.
+  struct CondHash {
+    size_t operator()(const CondNode& item) const;
+    size_t operator()(const CondState& vec) const;
+  };
+
+  // Set to keep track of unique CondStates.
+  // Pointers to the entries in the unordered set are used as identifiers:
+  // unordered_set guarantees that the pointers remain the same.
+  std::unordered_set<CondState, CondHash> condstate_set_;
+
+  // Mapping from Node id to CondId.
+  std::vector<CondId> node_to_condid_map_;
+
+  // Track the CondId for newly inserted nodes. We use a vector to quickly map
+  // from Node id in the original graph to the CondId, but there will be nodes
+  // added to the original graph (such as If nodes) whose CondState needs to be
+  // tracked too.
+  std::unordered_map<int, CondId> added_node_mapping_;
+
+  // Identifier of the dead flow state. The empty flow state is represented with
+  // a nullptr.
+  CondId dead_id_;
+};
+
+// FunctionalizeCond groups all the state used by functionalizing conditionals
+// of the given graph together.
+class FunctionalizeCond {
+ public:
+  // Functionalize all the switch-merge nodes of a loop-free graph into If
+  // nodes. That is, attempt to transform every remaining switch and merge nodes
+  // in the graph into If nodes.
+  // Precondition: All while loops have been removed from graph.
+  static Status Functionalize(Graph* graph, FunctionLibraryDefinition* library);
+
+  // Build identity node with the same name as the merge that will be replaced
+  // in case the output is fetched/colocated.
+  Status AddIdentityNode(const Node* replacee, Node* if_node, int port);
+
+  // Add a If node to the graph defined by def that will, amongst other, replace
+  // replacee in the graph.
+  xla::StatusOr<Node*> AddIfNode(const NodeDef& def, const Node* replacee);
+
+  // Propagates the state of a newly inserted node.
+  Status PropagateUpdatedState(const Node* replacee);
+
+  // Dump graph with the CondState annotated.
+  void DumpGraphWithCondState(const string& name);
+
+ private:
+  FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library);
+
+  // Performs the actual cond functionalization. Iterate over groups of merge
+  // nodes (linked by common predicate & CondIds of the incomming edges),
+  // from innermost to outermost, and extract into If nodes.
+  Status FunctionalizeInternal();
+
+  // Returns the forward flow state propagated along edge `e`.
+  // This may modify cond_state_map_.
+  CondStateMap::CondId StateAlongEdge(const Edge* e);
+
+  // Determines the CondState of all the nodes in the given vector where
+  // the input is expected in reverse topological order.
+  // This populates the cond_state_map_.
+  Status DetermineCondStates(std::vector<Node*> rev_topo_order);
+
+  // Determine the CondState for a given node using the incomming edges
+  // to the node. Note: it is expected that this node's CondState is only
+  // determined once its input's CondState is.
+  Status DetermineCondState(Node* dst);
+
+  // Helper functions for DetermineCondState.
+  Status DetermineCondStateMerge(Node* dst);
+
+  // Helper functions for DetermineCondStates. Determines the dst node's
+  // CondState by joining the src and dst's CondState where either
+  // the dst node is a merge or not.
+  // These may modify cond_state_map_.
+  xla::StatusOr<CondStateMap::CondId> JoinCondStatesMerge(
+      CondStateMap::CondId src, CondStateMap::CondId dst);
+  xla::StatusOr<CondStateMap::CondId> JoinCondStatesNonMerge(
+      CondStateMap::CondId src, CondStateMap::CondId dst);
+
+  // Checks if a merge node is redundant and if so removes it from the graph.
+  Status RemoveRedundantMerge(Node* node);
+
+  // Checks if a switch node is redundant and if so removes it from the graph.
+  Status RemoveRedundantSwitch(Node* node);
+
+  // Sorts merge nodes (in reverse topological order) in order of increasing
+  // nesting depth.
+  void SortMergeNodes(std::vector<Node*>* merge_order);
+
+  // Deletes all nodes in/consumers of `delete_nodes_`.
+  void DeleteReachableNodes();
+
+  // Member used to unique the CondState to a unique CondId and keep track of
+  // CondState/CondId per Node.
+  CondStateMap cond_state_map_;
+
+  // Nodes to be deleted.
+  std::deque<int> delete_nodes_;
+
+  FunctionLibraryDefinition* library_;
+  Graph* graph_;
+
+  friend class FunctionalizeCondTest;
+};
+
+}  // namespace functionalize_cond
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_COND_H_
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc
new file mode 100644
index 0000000000..a27f889392
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc
@@ -0,0 +1,184 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tests for the backward const analysis.
+
+#include "tensorflow/compiler/tf2xla/functionalize_cond.h"
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace functionalize_cond {
+
+class FunctionalizeCondTest : public ::testing::Test {
+ protected:
+  FunctionalizeCondTest() {
+    graph_.reset(new Graph(OpRegistry::Global()));
+    flib_def_.reset(
+        new FunctionLibraryDefinition(OpRegistry::Global(), fdef_lib_));
+    fc_.reset(new functionalize_cond::FunctionalizeCond(graph_.get(),
+                                                        flib_def_.get()));
+  }
+
+  CondStateMap::CondId GetUniqueId(
+      const CondStateMap::CondStateMap::CondState& state) {
+    return fc_->cond_state_map_.GetUniqueId(state);
+  }
+
+  xla::StatusOr<CondStateMap::CondId> JoinCondStatesNonMerge(
+      CondStateMap::CondId src, CondStateMap::CondId dst) {
+    return fc_->JoinCondStatesNonMerge(src, dst);
+  }
+
+  xla::StatusOr<CondStateMap::CondId> JoinCondStatesMerge(
+      CondStateMap::CondId src, CondStateMap::CondId dst) {
+    return fc_->JoinCondStatesMerge(src, dst);
+  }
+
+  bool ScopeIn(CondStateMap::CondId ff, CondStateMap::CondId* scope) {
+    return fc_->cond_state_map_.ScopeIn(ff, scope);
+  }
+
+  CondStateMap::ContainsResult LhsHoldsWhereverRhsHolds(
+      CondStateMap::CondId lhs, CondStateMap::CondId rhs) {
+    return fc_->cond_state_map_.LhsHoldsWhereverRhsHolds(lhs, rhs);
+  }
+
+  FunctionDefLibrary fdef_lib_;
+  std::unique_ptr<functionalize_cond::FunctionalizeCond> fc_;
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  std::unique_ptr<Graph> graph_;
+};
+
+namespace {
+
+TEST_F(FunctionalizeCondTest, ScopeIn) {
+  Tensor pred_tensor(DT_BOOL, TensorShape());
+  pred_tensor.flat<bool>().setZero();
+  Node* pred = test::graph::Constant(graph_.get(), pred_tensor, "pred");
+  Tensor val_tensor(DT_INT32, TensorShape());
+  val_tensor.flat<int>().setZero();
+  Node* val = test::graph::Constant(graph_.get(), val_tensor, "val");
+  Node* s = test::graph::Switch(graph_.get(), val, pred);
+
+  {
+    CondStateMap::CondStateMap::CondState ss;
+    ss.emplace_back(CondStateMap::CondNode(
+        CondStateMap::CondNode::Type::kSwitch, s, BranchType::kThenBranch));
+    CondStateMap::CondId id = GetUniqueId(ss);
+    CondStateMap::CondId scope;
+    ASSERT_TRUE(ScopeIn(id, &scope));
+    ASSERT_TRUE(id == scope);
+  }
+
+  CondStateMap::CondState empty;
+  {
+    CondStateMap::CondState ss;
+    ss.emplace_back(CondStateMap::CondNode(
+        CondStateMap::CondNode::Type::kSwitch, s, BranchType::kBoth));
+    ss.emplace_back(
+        CondStateMap::CondNode(CondStateMap::CondNode::Type::kMerge));
+    CondStateMap::CondId id = GetUniqueId(ss);
+    CondStateMap::CondId scope_1;
+    ASSERT_TRUE(ScopeIn(id, &scope_1));
+    ASSERT_TRUE(scope_1 == GetUniqueId(empty));
+    ASSERT_TRUE(id != scope_1);
+
+    ss.clear();
+    ss.emplace_back(CondStateMap::CondNode(
+        CondStateMap::CondNode::Type::kSwitch, s, BranchType::kBoth));
+    id = GetUniqueId(ss);
+    CondStateMap::CondId scope_2;
+    ASSERT_TRUE(ScopeIn(id, &scope_2));
+
+    ASSERT_TRUE(LhsHoldsWhereverRhsHolds(scope_1, scope_2) ==
+                CondStateMap::ContainsResult::kLhsContainsRhs);
+  }
+}
+
+TEST_F(FunctionalizeCondTest, JoinCondStates) {
+  Tensor pred_tensor(DT_BOOL, TensorShape());
+  pred_tensor.flat<bool>().setZero();
+  Node* pred = test::graph::Constant(graph_.get(), pred_tensor, "pred");
+  Tensor val_tensor(DT_INT32, TensorShape());
+  val_tensor.flat<int>().setZero();
+  Node* val = test::graph::Constant(graph_.get(), val_tensor, "val");
+  Node* s = test::graph::Switch(graph_.get(), val, pred);
+
+  CondStateMap::CondId empty = GetUniqueId({});
+
+  CondStateMap::CondId then_branch;
+  {
+    CondStateMap::CondState ss;
+    ss.emplace_back(CondStateMap::CondNode(
+        CondStateMap::CondNode::Type::kSwitch, s, BranchType::kThenBranch));
+    then_branch = GetUniqueId(ss);
+  }
+  CondStateMap::CondId else_branch;
+  {
+    CondStateMap::CondState ss;
+    ss.emplace_back(CondStateMap::CondNode(
+        CondStateMap::CondNode::Type::kSwitch, s, BranchType::kElseBranch));
+    else_branch = GetUniqueId(ss);
+  }
+
+  // An non-merge op with inputs from then and else branch.
+  Status status = JoinCondStatesNonMerge(then_branch, else_branch).status();
+  EXPECT_TRUE(errors::IsInvalidArgument(status));
+
+  // Merge between then and else branch.
+  auto joined_or = JoinCondStatesMerge(then_branch, else_branch);
+  TF_EXPECT_OK(joined_or.status());
+  CondStateMap::CondId joined = joined_or.ValueOrDie();
+
+  // Merge between then branch and both branch.
+  auto t = JoinCondStatesNonMerge(then_branch, joined);
+  // Note: this is OK in terms of constraint predication, but
+  TF_EXPECT_OK(t.status());
+
+  // Post merge the propagated forward flow state has an additional merge.
+  CondStateMap::CondId post_merge;
+  {
+    CondStateMap::CondState ss;
+    ss = *joined;
+    ss.emplace_back(
+        CondStateMap::CondNode(CondStateMap::CondNode::Type::kMerge));
+    post_merge = GetUniqueId(ss);
+  }
+
+  t = JoinCondStatesNonMerge(post_merge, joined);
+  TF_EXPECT_OK(t.status());
+  EXPECT_TRUE(joined == t.ValueOrDie());
+
+  // No predicate that results in two paths predicated on different conditions
+  // merge.
+  t = JoinCondStatesMerge(post_merge, joined);
+  EXPECT_FALSE(t.ok());
+
+  // Post the merge we are effectively in the root scope and merging should
+  // result in the more restrictive post merge state.
+  t = JoinCondStatesNonMerge(post_merge, empty);
+  TF_EXPECT_OK(t.status());
+  EXPECT_TRUE(post_merge == t.ValueOrDie());
+}
+
+}  // namespace
+}  // namespace functionalize_cond
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 0904778f97..5932be4e52 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -21,1440 +21,24 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/functionalize_cond.h"
+#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
+#include "tensorflow/compiler/tf2xla/functionalize_while.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
-#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/graph/node_builder.h"
 
 namespace tensorflow {
 
-namespace {
-
-using xla::StatusOr;
-
-const char* const kArgOp = "_Arg";
-const char* const kRetValOp = "_Retval";
-
-// Information about a loop argument.
-struct Arg {
-  // Every loop argument has an Enter node.
-  Node* enter;
-
-  // Is the loop argument a loop-invariant value? Taken from the `is_constant`
-  // attribute on the Enter node.
-  bool is_loop_invariant;
-
-  // If 'is_loop_invariant' is true, the following are all nullptr. Non-constant
-  // arguments must have all of the following nodes:
-  Node* merge = nullptr;
-  Node* switch_node = nullptr;
-  Node* next_iteration = nullptr;
-  Node* exit = nullptr;
-};
-
-// Information about a loop frame.
-struct Frame {
-  string name;
-
-  // Pointer to the parent frame. The root frame has a pointer to itself.
-  Frame* parent = nullptr;
-  int num_children = 0;
-
-  // Arguments to this loop.
-  std::vector<Arg> args;
-
-  // The loop condition of the loop. There should be exactly one loop condition
-  // in every loop.
-  Node* loop_cond = nullptr;
-
-  // Set of nodes that belong to the loop frame.
-  std::unordered_set<Node*> nodes;
-};
-
-// Comparison function used for sorting nodes consistently.
-// a) resource variables are last, and
-// b) sort lexicographically by name (for deterministic output).
-struct NodeCmp {
-  bool operator()(const Node* lhs, const Node* rhs) const {
-    bool lhs_is_resource =
-        lhs->num_inputs() > 0 ? (lhs->input_type(0) == DT_RESOURCE) : false;
-    bool rhs_is_resource =
-        rhs->num_inputs() > 0 ? (rhs->input_type(0) == DT_RESOURCE) : false;
-    return std::tie(lhs_is_resource, lhs->name()) <
-           std::tie(rhs_is_resource, rhs->name());
-  }
-};
-
-// Returns a textual representation of the names of the nodes in the input.
-template <typename T>
-string NodesToString(const T& nodes) {
-  return strings::StrCat("{",
-                         str_util::Join(nodes, ",",
-                                        [](string* output, const Node* node) {
-                                          strings::StrAppend(output,
-                                                             node->name());
-                                        }),
-                         "}");
-}
-
-// Copies a subgraph from `graph` to `output` by performing a reverse DFS
-// starting at nodes in vector `stack`.
-// `node_map` is a vector indexed by source node ID to dest nodes.
-// Does not traverse into nodes in `node_map`, so by adding nodes to `node_map`
-// before the traversal clients can cut the graph. If a frame is provided (frame
-// != nullptr), then this functions will return an error if the
-// traversal leaves 'frame'; the client must add enough nodes to `node_map` to
-// cut the graph and prevent the traversal from escaping.
-//
-// `squash_src_outputs` contains a bool for each source node ID. If true, then
-// the source output on that node will be replaced by zero when copied. This is
-// used when replacing a Switch node with an _Arg node. The output we are
-// taking from the Switch node was not necessarily the first output, but _Arg
-// nodes only have one output. By adding the Switch node to `squash_src_outputs`
-// we rewrite the src_output of the corresponding edge to be 0.
-Status CopySubgraph(const Graph& graph, const Frame* frame,
-                    std::vector<Node*> stack,
-                    const std::vector<bool>& squash_src_outputs,
-                    std::vector<Node*>* node_map, Graph* output) {
-  VLOG(3) << "Stack: " << NodesToString(stack);
-  std::vector<bool> visited(graph.num_node_ids(), false);
-  while (!stack.empty()) {
-    Node* n = stack.back();
-    stack.pop_back();
-
-    VLOG(5) << "Copying node " << n->name();
-
-    if (visited[n->id()]) continue;
-    visited[n->id()] = true;
-
-    for (const Edge* e : n->in_edges()) {
-      Node* src = e->src();
-      if (frame != nullptr && frame->nodes.find(src) == frame->nodes.end()) {
-        // We traversed out of the loop frame, without encountering a cut node.
-        return errors::Internal("Graph traversal of loop frame ", frame->name,
-                                " escaped frame at ", src->name(),
-                                " without encountering an argument node.");
-      }
-      if ((*node_map)[src->id()] == nullptr) {
-        (*node_map)[src->id()] = output->CopyNode(src);
-        stack.push_back(src);
-      }
-      Node* src_copy = (*node_map)[e->src()->id()];
-      int src_output = squash_src_outputs[e->src()->id()] && !e->IsControlEdge()
-                           ? 0
-                           : e->src_output();
-      Node* dst_copy = (*node_map)[e->dst()->id()];
-      output->AddEdge(src_copy, src_output, dst_copy, e->dst_input());
-    }
-  }
-  return Status::OK();
-}
-
-StatusOr<Node*> AddNode(const NodeDef& node_def, Graph* graph) {
-  Status status;
-  Node* inserted_node = graph->AddNode(node_def, &status);
-  if (!status.ok()) {
-    return status;
-  }
-  return inserted_node;
-}
-
-// Check that the graph has no cycle containing the given node.
-Status CheckNoCycleContains(const Node* node, const int num_nodes) {
-  std::vector<const Node*> ready;
-  ready.push_back(node);
-  std::vector<bool> visited(num_nodes);
-  while (!ready.empty()) {
-    const Node* current_node = ready.back();
-    ready.pop_back();
-    visited[current_node->id()] = true;
-    for (const Edge* out : current_node->out_edges()) {
-      if (out->dst() == node) {
-        return errors::Internal("Detected a cycle: ", FormatNodeForError(*node),
-                                "(", node->def().op(), ") feeds into itself.");
-      } else if (!visited[out->dst()->id()]) {
-        ready.push_back(out->dst());
-      }
-    }
-  }
-  return Status::OK();
-}
-
-StatusOr<Node*> BuildArgNode(Graph* graph, DataType type, int index) {
-  NodeDef arg_def;
-  NodeDefBuilder builder(strings::StrCat(kArgOp, index), kArgOp);
-  builder.Attr("T", type);
-  builder.Attr("index", index);
-  TF_RETURN_IF_ERROR(builder.Finalize(&arg_def));
-  return AddNode(arg_def, graph);
-}
-
-StatusOr<Node*> BuildRetvalNode(Graph* graph, DataType type, int index) {
-  NodeDef ret_def;
-  ret_def.set_op(kRetValOp);
-  ret_def.set_name(strings::StrCat(kRetValOp, index));
-  AddNodeAttr("T", type, &ret_def);
-  AddNodeAttr("index", index, &ret_def);
-  return AddNode(ret_def, graph);
-}
-
-// Builds a graph for the loop condition.
-Status BuildLoopCondition(const Graph& graph, Frame* frame,
-                          std::unique_ptr<Graph>* cond_output) {
-  VLOG(2) << "Building loop condition for " << frame->name;
-  *cond_output = xla::MakeUnique<Graph>(graph.op_registry());
-  Graph* output = cond_output->get();
-
-  // Map from nodes in the original graph to the condition graph.
-  std::vector<Node*> node_map(graph.num_node_ids(), nullptr);
-  std::vector<bool> squash_src_outputs(graph.num_node_ids(), false);
-
-  // Build one _Arg node for each Enter node.
-  for (int i = 0; i < frame->args.size(); ++i) {
-    const Arg& arg = frame->args[i];
-
-    TF_ASSIGN_OR_RETURN(Node * arg_node,
-                        BuildArgNode(output, arg.enter->input_type(0), i));
-    if (arg.is_loop_invariant) {
-      node_map[arg.enter->id()] = arg_node;
-    } else {
-      node_map[arg.merge->id()] = arg_node;
-    }
-  }
-
-  // Build a Retval node for the loop condition. The LoopCond nodes are always
-  // boolean because of the type constraints on the LoopCond op.
-  TF_ASSIGN_OR_RETURN(node_map[frame->loop_cond->id()],
-                      BuildRetvalNode(output, DT_BOOL, 0));
-
-  // Performs a reverse DFS, copying nodes and edges to the output graph.
-  // The _Arg and _Retval nodes were added unconditionally above, so we are
-  // guaranteed to get the correct function signature.
-  return CopySubgraph(graph, frame, {frame->loop_cond}, squash_src_outputs,
-                      &node_map, output);
-}
-
-// Builds a graph for the loop body.
-Status BuildLoopBody(const Graph& graph, Frame* frame,
-                     DataTypeVector* arg_types,
-                     std::unique_ptr<Graph>* body_output) {
-  VLOG(2) << "Building loop body for " << frame->name;
-  *body_output = xla::MakeUnique<Graph>(graph.op_registry());
-  Graph* output = body_output->get();
-
-  // Map from nodes in the original graph to the condition graph.
-  std::vector<Node*> node_map(graph.num_node_ids(), nullptr);
-  std::vector<bool> squash_src_outputs(graph.num_node_ids(), false);
-
-  // Build one _Arg node for each Enter node.
-  std::vector<Node*> next_iterations;
-  next_iterations.reserve(frame->args.size());
-  arg_types->reserve(frame->args.size());
-  for (int i = 0; i < frame->args.size(); ++i) {
-    const Arg& arg = frame->args[i];
-
-    DataType dtype = arg.enter->input_type(0);
-    arg_types->push_back(dtype);
-
-    TF_ASSIGN_OR_RETURN(Node * arg_node, BuildArgNode(output, dtype, i));
-
-    if (dtype == DT_RESOURCE) {
-      // The convention of the XLA bridge is that resource variable arguments
-      // are only inputs to the loop body and have no corresponding output.
-      // TODO(b/37741920): change the convention so that DT_RESOURCE variables
-      // are both inputs and outputs, and then remove this case.
-      TF_RET_CHECK(arg.is_loop_invariant);
-      node_map[arg.enter->id()] = arg_node;
-    } else {
-      TF_ASSIGN_OR_RETURN(Node * retval_node,
-                          BuildRetvalNode(output, dtype, i));
-
-      if (arg.is_loop_invariant) {
-        // Argument is loop-invariant. Forward it from the Arg to the Retval.
-        node_map[arg.enter->id()] = arg_node;
-        output->AddEdge(arg_node, 0, retval_node, 0);
-      } else {
-        // Argument is loop-varying.
-        node_map[arg.switch_node->id()] = arg_node;
-        // The Switch node has two outputs, but _Arg only has one. This tells
-        // the CopySubgraph function to rewrite the output number of edges from
-        // the _Arg node to be 0 rather than copying the output number from the
-        // Switch node.
-        squash_src_outputs[arg.switch_node->id()] = true;
-        node_map[arg.next_iteration->id()] = retval_node;
-        next_iterations.push_back(arg.next_iteration);
-      }
-    }
-  }
-
-  // Performs a reverse DFS, copying nodes and edges to the output graph.
-  // The _Arg and _Retval nodes were added unconditionally above, so we are
-  // guaranteed to get the correct function signature.
-  TF_RETURN_IF_ERROR(CopySubgraph(graph, frame, std::move(next_iterations),
-                                  squash_src_outputs, &node_map, output));
-
-  return Status::OK();
-}
-
-// Copy the FunctionDef of given function from lookup_library to library, if
-// it can be found in lookup_library but is missing from library.
-Status AddMissingFunctionByName(const string& function_name,
-                                const FunctionLibraryDefinition* lookup_library,
-                                FunctionLibraryDefinition* library) {
-  if (!library->Find(function_name) && lookup_library->Find(function_name)) {
-    return library->AddFunctionDef(*lookup_library->Find(function_name));
-  }
-  return Status::OK();
-}
-
-// Iterate over all functions that the given fdef refers to. Copy the missing
-// FunctionDefs from lookup_library to library.
-Status AddMissingFunctionDef(const FunctionDef& fdef,
-                             const FunctionLibraryDefinition* lookup_library,
-                             FunctionLibraryDefinition* library) {
-  TF_RET_CHECK(lookup_library);
-  for (const NodeDef& node : fdef.node_def()) {
-    if (library->Find(node.op())) {
-      continue;
-    }
-    // The function referred by 'SymbolicGradient' node is specified in its
-    // attribute 'f'.
-    if (node.op() == FunctionLibraryDefinition::kGradientOp) {
-      const AttrValue* attr =
-          AttrSlice(&node.attr()).Find(FunctionLibraryDefinition::kFuncAttr);
-      if (!attr) {
-        return errors::InvalidArgument("SymbolicGradient is missing attr: f");
-      }
-      const string& func_name = attr->func().name();
-      TF_RETURN_IF_ERROR(
-          AddMissingFunctionByName(func_name, lookup_library, library));
-      // Copy the user-defined gradient function if it exists.
-      const string grad_name = lookup_library->FindGradient(func_name);
-      if (!grad_name.empty() && library->FindGradient(func_name).empty()) {
-        TF_RETURN_IF_ERROR(
-            AddMissingFunctionByName(grad_name, lookup_library, library));
-        GradientDef grad_def;
-        grad_def.set_function_name(func_name);
-        grad_def.set_gradient_func(grad_name);
-        TF_RETURN_IF_ERROR(library->AddGradientDef(grad_def));
-      }
-    } else if (lookup_library->Find(node.op())) {
-      TF_RETURN_IF_ERROR(
-          library->AddFunctionDef(*lookup_library->Find(node.op())));
-    }
-  }
-  return Status::OK();
-}
-
-Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
-                         Graph* graph, Frame* frame,
-                         FunctionLibraryDefinition* library) {
-  VLOG(2) << "Frame " << frame->name << " before: "
-          << dump_graph::DumpGraphToFile("functionalize_before", *graph,
-                                         library);
-
-  // Split loop-varying Enter nodes with multiple successors. If the same
-  // Tensor is fed as input to multiple loop arguments, we may end up with a
-  // shared Enter node. We clone Enter nodes with multiple successors to
-  // maintain the invariant of a unique Enter node per argument of the final
-  // loop.
-  std::vector<Arg> args;
-  for (const Arg& arg : frame->args) {
-    if (arg.is_loop_invariant) {
-      args.push_back(arg);
-    } else {
-      std::vector<const Edge*> edges(arg.enter->out_edges().begin(),
-                                     arg.enter->out_edges().end());
-      for (int i = 0; i < edges.size(); ++i) {
-        if (edges[i]->IsControlEdge() && edges[i]->dst()->IsSink()) {
-          continue;
-        }
-        TF_RET_CHECK(!edges[i]->IsControlEdge()) << edges[i]->src()->name();
-        Arg new_arg;
-        new_arg.is_loop_invariant = false;
-        if (i == 0) {
-          new_arg.enter = arg.enter;
-        } else {
-          new_arg.enter = graph->CopyNode(arg.enter);
-          frame->nodes.insert(new_arg.enter);
-          for (Edge const* e : arg.enter->in_edges()) {
-            graph->AddEdge(e->src(), e->src_output(), new_arg.enter,
-                           e->IsControlEdge() ? Graph::kControlSlot : 0);
-          }
-          Node* dst = edges[i]->dst();
-          int dst_input = edges[i]->dst_input();
-          graph->RemoveEdge(edges[i]);
-          graph->AddEdge(new_arg.enter, 0, dst, dst_input);
-        }
-        args.push_back(new_arg);
-      }
-    }
-  }
-  frame->args = std::move(args);
-
-  std::sort(
-      frame->args.begin(), frame->args.end(),
-      [](const Arg& a, const Arg& b) { return NodeCmp()(a.enter, b.enter); });
-
-  if (frame->loop_cond == nullptr) {
-    return errors::InvalidArgument("Loop ", frame->name,
-                                   " has no LoopCond node");
-  }
-
-  // Find the set of Switch nodes that are successors of the LoopCond.
-  std::unordered_set<Node*> switches;
-  for (const Edge* edge : frame->loop_cond->out_edges()) {
-    if (!edge->IsControlEdge() && IsSwitch(edge->dst()) &&
-        edge->dst_input() == 1) {
-      switches.insert(edge->dst());
-    }
-  }
-
-  // For each non-constant argument, looks for the following pattern of nodes:
-  // Enter ----> Merge  -------->  Switch  --> Exit
-  //               ^                  ^
-  //               |                  |
-  //         NextIteration         LoopCond
-  //               ^                  ^
-  //               |                  |
-  //              ...                ...
-  for (Arg& arg : frame->args) {
-    if (!arg.is_loop_invariant) {
-      // Follow the edge from the Enter to Merge.
-      const Edge* enter_merge = nullptr;
-      for (const Edge* e : arg.enter->out_edges()) {
-        // Ignore control-edges to the sink node. These are allowed by the
-        // graph invariants, although probably they should have been stripped
-        // off earlier.
-        if (e->IsControlEdge() && e->dst()->IsSink()) {
-          continue;
-        }
-        if (enter_merge != nullptr) {
-          return errors::Internal("Enter node for loop-varying argument ",
-                                  FormatNodeForError(*arg.enter),
-                                  " has multiple successors: ",
-                                  FormatNodeForError(*enter_merge->dst()),
-                                  " and ", FormatNodeForError(*e->dst()));
-        }
-        enter_merge = e;
-      }
-      if (enter_merge == nullptr) {
-        return errors::Internal("Enter node for loop-varying argument ",
-                                FormatNodeForError(*arg.enter),
-                                " has zero successors");
-      }
-      arg.merge = enter_merge->dst();
-      if (!IsMerge(arg.merge)) {
-        return errors::InvalidArgument(
-            "Successor of Enter node for loop-varying argument ",
-            FormatNodeForError(*arg.merge),
-            " is not a Merge node; got: ", arg.merge->type_string());
-      }
-
-      // Find the NextIteration from the merge. There should be two inputs to
-      // the Merge and the NextIteration should be the other input.
-      if (arg.merge->input_types().size() != 2) {
-        return errors::InvalidArgument(
-            "Unexpected number of inputs to Merge node for loop-varying "
-            "argument ",
-            FormatNodeForError(*arg.merge), "; expected 2, got ",
-            arg.merge->input_types().size());
-      }
-      TF_RETURN_IF_ERROR(arg.merge->input_node(1 - enter_merge->dst_input(),
-                                               &arg.next_iteration));
-      if (!IsNextIteration(arg.next_iteration)) {
-        return errors::InvalidArgument(
-            "Expected NextIteration node as input to Merge node; got node ",
-            FormatNodeForError(*arg.next_iteration), " with kind ",
-            arg.next_iteration->type_string());
-      }
-
-      // Find the Switch successor of the Merge. There should be exactly one
-      // Switch node that is a successor of both the Merge and the LoopCond.
-      for (const Edge* edge : arg.merge->out_edges()) {
-        if (edge->dst_input() == 0 && IsSwitch(edge->dst()) &&
-            switches.find(edge->dst()) != switches.end()) {
-          if (arg.switch_node != nullptr) {
-            return errors::InvalidArgument("Duplicate Switch successors to ",
-                                           FormatNodeForError(*arg.merge));
-          }
-          arg.switch_node = edge->dst();
-        }
-      }
-      if (arg.switch_node == nullptr) {
-        return errors::InvalidArgument("Missing Switch successor to ",
-                                       FormatNodeForError(*arg.merge));
-      }
-
-      // Update the device on the Identity outputs of the switch to match their
-      // target. These Identity outputs do not
-
-      // Loop over the switch node's output to:
-      // - Find the Exit successor.
-      // - Set the sharding on all Identity outputs of the switch. These
-      //   identity nodes are values used by the loop body or condition.
-      //   The Identity node may have the wrong device so copy the device from
-      //   one of its outputs instead.
-      std::deque<const Edge*> possible_exit;
-      for (const Edge* edge : arg.switch_node->out_edges()) {
-        if (edge->src_output() == 0) {
-          possible_exit.push_back(edge);
-        }
-        if (IsIdentity(edge->dst())) {
-          TF_RETURN_IF_ERROR(
-              SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true));
-        }
-      }
-      // TODO(b/67425339): Allow general graph between switch and exit.
-      while (!possible_exit.empty()) {
-        const Edge* edge = possible_exit.front();
-        possible_exit.pop_front();
-        if (IsExit(edge->dst())) {
-          if (arg.exit != nullptr) {
-            return errors::InvalidArgument(
-                "Duplicate Exit successors to ",
-                FormatNodeForError(*arg.switch_node));
-          }
-          arg.exit = edge->dst();
-        } else {
-          if (!IsIdentity(edge->dst())) {
-            return errors::Unimplemented("General graph between switch (",
-                                         FormatNodeForError(*arg.switch_node),
-                                         ") and exit node of frame ",
-                                         frame->name, " not supported yet.");
-          }
-          for (const Edge* out : edge->dst()->out_edges()) {
-            possible_exit.push_back(out);
-          }
-        }
-      }
-    }
-  }
-
-  // Builds the condition and body functions.
-  std::unique_ptr<Graph> cond_graph;
-  TF_RETURN_IF_ERROR(BuildLoopCondition(*graph, frame, &cond_graph));
-  DataTypeVector arg_types;
-  std::unique_ptr<Graph> body_graph;
-  TF_RETURN_IF_ERROR(BuildLoopBody(*graph, frame, &arg_types, &body_graph));
-
-  VLOG(2) << "Frame " << frame->name << " condition: "
-          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph, library)
-          << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph);
-
-  static std::atomic<int64> sequence_num(0LL);
-  int64 id = ++sequence_num;
-  NameAttrList cond_name;
-  cond_name.set_name(strings::StrCat("_functionalize_cond_", id));
-  NameAttrList body_name;
-  body_name.set_name(strings::StrCat("_functionalize_body_", id));
-  FunctionDef cond_fdef;
-  TF_RETURN_IF_ERROR(
-      GraphToFunctionDef(*cond_graph, cond_name.name(), &cond_fdef));
-  FunctionDef body_fdef;
-  TF_RETURN_IF_ERROR(
-      GraphToFunctionDef(*body_graph, body_name.name(), &body_fdef));
-
-  TF_RETURN_IF_ERROR(library->AddFunctionDef(cond_fdef));
-  TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef));
-  if (lookup_library) {
-    // Copy missing FunctionDefs from lookup_library to library to make library
-    // self-contained.
-    TF_RETURN_IF_ERROR(
-        AddMissingFunctionDef(cond_fdef, lookup_library, library));
-    TF_RETURN_IF_ERROR(
-        AddMissingFunctionDef(body_fdef, lookup_library, library));
-  }
-
-  // Builds a While operator.
-  NodeDef while_def;
-  NodeDefBuilder builder(frame->loop_cond->name(), "XlaWhile");
-  builder.Attr("T", arg_types);
-  builder.Attr("cond", cond_name);
-  builder.Attr("body", body_name);
-  std::vector<NodeDefBuilder::NodeOut> inputs;
-  for (int i = 0; i < frame->args.size(); ++i) {
-    const Arg& arg = frame->args[i];
-    const Edge* in_edge;
-    TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge));
-    if (in_edge->IsControlEdge()) {
-      builder.ControlInput(in_edge->src()->name());
-    } else {
-      inputs.push_back(NodeDefBuilder::NodeOut(
-          in_edge->src()->name(), in_edge->src_output(), arg_types[i]));
-    }
-  }
-  builder.Input(inputs);
-  TF_RETURN_IF_ERROR(builder.Finalize(&while_def));
-  TF_ASSIGN_OR_RETURN(Node * while_node, AddNode(while_def, graph));
-
-  // Copies edges to the Enter nodes and from the Exit nodes onto the While.
-  for (int i = 0; i < frame->args.size(); ++i) {
-    const Arg& arg = frame->args[i];
-    const Edge* in_edge;
-    TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge));
-    if (in_edge->IsControlEdge()) {
-      graph->AddControlEdge(in_edge->src(), while_node);
-    } else {
-      graph->AddEdge(in_edge->src(), in_edge->src_output(), while_node, i);
-    }
-
-    if (!arg.is_loop_invariant) {
-      // Add output edges if the output of the loop is consumed.
-      if (arg.exit != nullptr) {
-        std::vector<const Edge*> edges(arg.exit->out_edges().begin(),
-                                       arg.exit->out_edges().end());
-        for (const Edge* edge : edges) {
-          Node* dst = edge->dst();
-          int dst_input = edge->dst_input();
-          graph->RemoveEdge(edge);
-
-          if (dst_input == Graph::kControlSlot) {
-            graph->AddControlEdge(while_node, dst);
-          } else {
-            graph->AddEdge(while_node, i, dst, dst_input);
-          }
-        }
-      }
-    }
-  }
-
-  // Remove the old nodes from the graph, and add the while node to the parent
-  // frame.
-  for (Node* node : frame->nodes) {
-    graph->RemoveNode(node);
-  }
-  frame->nodes.clear();
-  frame->parent->nodes.insert(while_node);
-
-  VLOG(2) << "Frame " << frame->name << " after: "
-          << dump_graph::DumpGraphToFile("functionalize_after", *graph,
-                                         library);
-
-  return Status::OK();
-}
-
-class FunctionalizeCond {
- public:
-  // All nodes are assumed to be either in no branch, then branch, else branch,
-  // or both branches (such as merge nodes).
-  enum Branch {
-    kElseBranch = 0,
-    kThenBranch = 1,
-    kBoth = 2,
-    kNeither = 3,
-    kNumBranchTypes = 4
-  };
-
-  // Returns a textual representation of the Branch b.
-  static string Branch_Name(FunctionalizeCond::Branch b);
-
-  // Functionalize all the switch-merge nodes of a loop-free graph into XlaIf
-  // nodes. That is, attempt to transform every remaining switch and merge nodes
-  // in the graph into XlaIf nodes.
-  // Precondition: All while loops have been removed from graph.
-  static Status Functionalize(Graph* graph, FunctionLibraryDefinition* library);
-
- private:
-  // CondArgNode represents a input to the conditional and its corresponding
-  // switch nodes.
-  struct CondArgNode {
-    explicit CondArgNode(Node* src, int src_output)
-        : src(src), src_output(src_output) {}
-    string ToString() const {
-      return strings::StrCat("src=", src->name(), ":", src_output,
-                             " switches=", NodesToString(switches));
-    }
-
-    Node* src;
-    int src_output;
-    std::vector<Node*> switches;
-  };
-  using CondArgNodes = std::vector<CondArgNode>;
-
-  struct ForwardFlowNode {
-    explicit ForwardFlowNode(Branch branch = Branch::kNeither)
-        : branch(branch), count(0) {}
-    string ToString() const {
-      return strings::StrCat("branch=", Branch_Name(branch), " count=", count);
-    }
-    Branch branch;
-    int count;
-  };
-
-  // Group of switch nodes that will be part of the same XlaIf.
-  struct SwitchCluster {
-    explicit SwitchCluster(const Edge* predicate_edge)
-        : predicate_edge(predicate_edge) {}
-    string ToString() const {
-      return strings::StrCat(name, " predicate=", predicate_edge->src()->name(),
-                             " switches=", NodesToString(switches));
-    }
-
-    string name;
-    const Edge* predicate_edge;
-    std::vector<Node*> switches;
-  };
-
-  FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library,
-                    bool dump_graphs)
-      : library_(library), graph_(graph), dump_graphs_(dump_graphs) {}
-
-  // Perform the actual cond functionalization. Iterate over groups of switch
-  // nodes (linked by common predicate), from innermost to outermost, and
-  // extract into XlaIf nodes.
-  Status FunctionalizeInternal();
-
-  // Determines the branch_map (mapping from node to branch of cond) and
-  // frontier (the nodes where the cond ends).
-  StatusOr<std::pair<std::unordered_map<Node*, ForwardFlowNode>,
-                     std::unordered_set<Node*>>>
-  DetermineBranchMapAndFrontier(const SwitchCluster& switch_cluster);
-
-  // Returns XlaIf node created from subgraph of merge and switch nodes. This
-  // encapsulates the process of extracting the bodies needed for the then and
-  // else branch, creates a XlaIf node, removing the nodes of the branches from
-  // the graph and replacing the merge node with a XlaIf.
-  StatusOr<Node*> ConvertToXlaIf(const CondArgNodes& cond_arg_nodes,
-                                 const SwitchCluster& switch_cluster,
-                                 const std::vector<Node*>& switches);
-
-  // Builds a XlaIfOp to replace the Switch-Graph-Merge cluster with.
-  StatusOr<Node*> BuildAndAddXlaIfOp(const CondArgNodes& cond_arg_nodes,
-                                     const SwitchCluster& switch_cluster,
-                                     const std::vector<Node*>& merge_nodes);
-
-  // Extracts a function body corresponding to the given input edge of the merge
-  // node.
-  Status ExtractBody(const CondArgNodes& cond_arg_nodes,
-                     const std::vector<Node*>& switches,
-                     const std::vector<Node*>& merge_nodes, int input_edge,
-                     Graph* body);
-
-  // Adds all the input edges to `if_node` corresponding to the arguments.
-  Status AddInputEdges(const CondArgNodes& cond_arg_nodes,
-                       const Edge* predicate_edge, Node* if_node);
-
-  // Adds all output edges from the `if_node`.
-  Status AddOutputEdges(const std::vector<Node*>& outputs, Node* if_node);
-
-  // Returns the switch clusters of graph_ in postorder. Dead switch nodes are
-  // skipped and removed from the graph.
-  StatusOr<std::vector<SwitchCluster>> DeterminePredicateSwitchOrder();
-
-  // Update the state for destination based on the state of source and the node
-  // being updated.
-  Status Join(const ForwardFlowNode& src_state, const Node* dst,
-              ForwardFlowNode* dst_state);
-
-  // Ensure that all nodes in the branch_map are dominated by the switch
-  // nodes. Returns nodes that are not dominated by the switches but are a
-  // control dependency of a node in the cond, and remove such control
-  // dependencies.
-  StatusOr<std::vector<Node*>> EnsureDominanceAndReturnNonDominatedControlNodes(
-      const std::unordered_map<Node*, ForwardFlowNode>& branch_map,
-      const std::vector<Node*>& switches);
-
-  // Validates that the frontier of nodes for the conditional
-  // section are as expected.
-  Status ValidateFrontier(
-      const std::unordered_map<Node*, ForwardFlowNode>& branch_map,
-      const std::unordered_set<Node*>& frontier);
-
-  FunctionLibraryDefinition* library_;
-  Graph* graph_;
-  bool dump_graphs_;
-};
-
-bool IsDeadSwitch(const Node* node) {
-  for (const Edge* e : node->out_edges()) {
-    const Node* dst = e->dst();
-    if (!dst->IsIdentity()) {
-      return false;
-    }
-    for (const Edge* ee : dst->out_edges()) {
-      if (!ee->IsControlEdge() || !ee->dst()->IsSink()) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-string FunctionalizeCond::Branch_Name(FunctionalizeCond::Branch b) {
-  const string branch_name[FunctionalizeCond::kNumBranchTypes + 1] = {
-      "else", "then", "both", "neither", "count"};
-  return branch_name[b];
-}
-
-Status FunctionalizeCond::ValidateFrontier(
-    const std::unordered_map<Node*, FunctionalizeCond::ForwardFlowNode>&
-        branch_map,
-    const std::unordered_set<Node*>& frontier) {
-  std::unordered_set<const Node*> pending[kNumBranchTypes];
-  for (Node* n : frontier) {
-    pending[branch_map.at(n).branch].insert(n);
-  }
-  TF_RET_CHECK(pending[kNeither].empty()) << NodesToString(pending[kNeither]);
-  for (const Node* n : pending[kBoth]) {
-    TF_RET_CHECK(IsMerge(n)) << n->DebugString();
-    // Merge nodes may be in then or else branch too
-  }
-  int index = (pending[kThenBranch].size() <= pending[kElseBranch].size())
-                  ? kThenBranch
-                  : kElseBranch;
-  int other = 1 - index;
-  for (const Node* n : pending[index]) {
-    if (pending[other].find(n) != pending[other].end()) {
-      return errors::Internal(
-          "Node (", n->DebugString().c_str(),
-          ") in both Else and Then branch should be in Both.");
-    }
-  }
-  // An empty frontier indicates a dead switch. Above we attempt to remove dead
-  // switch nodes, but not all are removed so don't treat it as an error yet.
-  // TODO(jpienaar): Find out why dead switch nodes remain.
-  // if (pending[kBoth].empty() && pending[kThenBranch].empty() &&
-  //     pending[kElseBranch].empty()) {
-  //   return errors::Internal("Unexpected empty frontier for switch nodes");
-  // }
-  return Status::OK();
-}
-
-Status FunctionalizeCond::Join(const ForwardFlowNode& src_state,
-                               const Node* dst, ForwardFlowNode* dst_state) {
-  TF_RET_CHECK(dst_state->branch != Branch::kBoth &&
-               dst_state->branch != Branch::kNumBranchTypes)
-      << "Unexpected/Invalid branch type: Merging "
-      << Branch_Name(src_state.branch) << " with "
-      << Branch_Name(dst_state->branch);
-  if (dst_state->branch == Branch::kNeither) {
-    dst_state->branch = src_state.branch;
-  } else if (src_state.branch != dst_state->branch &&
-             src_state.branch != Branch::kNeither) {
-    if (IsMerge(dst)) {
-      dst_state->branch = Branch::kBoth;
-    } else {
-      return errors::Internal("Illegal merge:\n", src_state.ToString(),
-                              " with ", dst_state->ToString(), " for\n",
-                              dst->DebugString());
-    }
-  }
-  ++dst_state->count;
-  return Status::OK();
-}
-
-StatusOr<std::vector<FunctionalizeCond::SwitchCluster>>
-FunctionalizeCond::DeterminePredicateSwitchOrder() {
-  struct Cluster {
-    bool operator==(const Cluster& other) const {
-      return representative == other.representative;
-    }
-    int representative = -1;
-  };
-
-  // Perform a DFS over the graph and
-  // * Determine the reverse topological order of the nodes (there should be no
-  //   cycles at this point so the post-order numbering corresponds to the
-  //   reverse topological sorting);
-  // * Identify dead switches;
-  // * Initialize the cluster's representative;
-  std::vector<UnionFind<Cluster>> clusters(graph_->num_node_ids());
-  std::vector<Node*> dead_switches;
-  std::vector<Node*> switch_order;
-  std::vector<Node*> rev_topo_sorted_nodes;
-  DFS(*graph_, nullptr, [&](Node* n) {
-    clusters[n->id()].Get().representative = n->id();
-    if (IsSwitch(n)) {
-      if (IsDeadSwitch(n)) {
-        dead_switches.push_back(n);
-      } else {
-        rev_topo_sorted_nodes.push_back(n);
-        switch_order.push_back(n);
-      }
-    } else if (n->IsOp()) {
-      // Exclude src and sink nodes from further consideration.
-      rev_topo_sorted_nodes.push_back(n);
-    }
-  });
-
-  std::vector<SwitchCluster> switch_clusters;
-  // Return early if there are no switches in the graph.
-  if (switch_order.empty()) {
-    return switch_clusters;
-  }
-
-  // Remove all dead switch nodes.
-  for (Node* n : dead_switches) {
-    VLOG(2) << "Removing dead switch: " << n->DebugString();
-    graph_->RemoveNode(n);
-  }
-
-  // Identify switch nodes that are part of the same control flow context by
-  // considering the operands of operations: an operation is part of the same
-  // control context as its operands unless the operation is a switch. Control
-  // dependencies are considered part of the same control flow context if the
-  // switch depth is the same (see comment below).
-
-  // entry_cluster records the input cluster to a switch node. This is used when
-  // merging with a merge node where the dst's cluster is merged with the entry
-  // cluster of the merge node's cluster (which corresponds to a switch cluster
-  // and so has an entry cluster).
-  std::unordered_map<int, UnionFind<Cluster>*> entry_cluster;
-
-  // Returns the output cluster of a node. Where the output cluster is cluster
-  // where the output of the node is used. For non-merge nodes this is simply
-  // the cluster they are part of, while for merge nodes it is the entry cluster
-  // of the cluster they are part of (this will correspond to the entry node of
-  // a switch node that dominates the merge).
-  auto find_output_cluster = [&](Node* n) {
-    UnionFind<Cluster>* cluster = &clusters[n->id()];
-    if (!IsMerge(n)) return cluster;
-    auto it = entry_cluster.find(clusters[n->id()].Get().representative);
-    // If the cluster is not found in the entry_cluster map then an
-    // instruction not dominated by a switch node has been merged into the
-    // cluster of the merge. This indicates a failure of the clustering.
-    CHECK(it != entry_cluster.end())
-        << "Unable to find entry for n=" << n->id() << " ("
-        << cluster->Get().representative << ")";
-    return it->second;
-  };
-
-  // TODO(jpienaar): This could be combined with DetermineBranchMapAndFrontier.
-  std::vector<int> switch_depth(graph_->num_node_ids());
-  for (auto it = rev_topo_sorted_nodes.rbegin();
-       it != rev_topo_sorted_nodes.rend(); ++it) {
-    Node* n = *it;
-
-    // Compute switch depth.
-    int new_switch_depth = 0;
-    for (const Edge* e : n->in_edges()) {
-      Node* src = e->src();
-      new_switch_depth = std::max(
-          new_switch_depth, switch_depth[src->id()] - (IsMerge(src) ? 1 : 0));
-    }
-    switch_depth[n->id()] = new_switch_depth + (IsSwitch(n) ? 1 : 0);
-
-    // Only merge the input operands of a switch. The switch's clustering itself
-    // is determined by the interaction of the switch's outputs.
-    if (IsSwitch(n)) {
-      Node* input;
-      TF_CHECK_OK(n->input_node(0, &input));
-      entry_cluster[n->id()] = find_output_cluster(input);
-      UnionFind<Cluster>* cluster = entry_cluster[n->id()];
-      int cluster_depth = switch_depth[cluster->Get().representative];
-      // Merge the inputs of the switch node with one another. This results in
-      // predicates and control input residing in the same cluster.
-      for (const Edge* e : n->in_edges()) {
-        // Only consider the data inputs to the Switch node.
-        if (e->IsControlEdge()) continue;
-
-        Node* src = e->src();
-        UnionFind<Cluster>* src_cluster = find_output_cluster(src);
-        int src_cluster_depth = switch_depth[src_cluster->Get().representative];
-        if (cluster_depth != src_cluster_depth) {
-          return errors::InvalidArgument(
-              "Unable to functionalize control flow in graph: Switch ('",
-              n->name(), "') has operands ('", input->name(), "' and '",
-              src->name(), "') that have different switch depths (",
-              cluster_depth, " != ", src_cluster_depth, ")");
-        }
-        cluster->Merge(src_cluster);
-      }
-      continue;
-    }
-
-    for (const Edge* e : n->in_edges()) {
-      Node* src = e->src();
-      if (!src->IsOp()) continue;
-      UnionFind<Cluster>* cluster = find_output_cluster(src);
-      // Merge a node with its data operands and with its control operands if
-      // the src and dst are in the same ControlContext. The ControlContext is
-      // not explicitly available here, and instead the switch depth is used as
-      // a proxy here. Due to the invariant that control edges can only be from
-      // a containing scope to an inner scope or from the inner scope to its
-      // containing scope (for exit nodes), the switch depth will only match if
-      // the src and dst are in the same ControlContext. Control edges between
-      // ControlContexts are handled during the extraction.
-      int src_id = cluster->Get().representative;
-      int src_depth = switch_depth[src_id];
-      if (!e->IsControlEdge() || new_switch_depth == src_depth) {
-        if (src_depth != new_switch_depth) {
-          // TODO(b/77601805) remove this when outside_compilation supports
-          // control flow.
-          if (str_util::StrContains(src->name(), "outside_compilation") ||
-              str_util::StrContains(n->name(), "outside_compilation")) {
-            return errors::InvalidArgument(
-                "outside_compilation is not yet supported within TensorFlow "
-                "control flow constructs b/77601805");
-          }
-          return errors::InvalidArgument(
-              "Unable to functionalize control flow in graph: Operand ('",
-              src->name(), "') and operator ('", n->name(),
-              "') have different switch depths (", src_depth,
-              " != ", new_switch_depth, ")");
-        }
-        cluster->Merge(&clusters[n->id()]);
-      }
-    }
-  }
-
-  if (dump_graphs_) {
-    // Mark the switch cluster each node is part of.
-    for (Node* n : graph_->nodes()) {
-      n->ClearAttr("_XlaFunctionalizeSwitchGroup");
-      n->AddAttr("_XlaFunctionalizeSwitchGroup",
-                 clusters[n->id()].Get().representative);
-    }
-    LOG(INFO) << "FunctionalizeControlFlow (with_clusters): "
-              << dump_graph::DumpGraphToFile("functionalize_clustered", *graph_,
-                                             library_);
-  }
-
-  // Verify all the nodes of a cluster are at the same depth.
-  std::unordered_map<int, std::pair<int, Node*>> cluster_to_depth_node;
-  for (Node* n : graph_->nodes()) {
-    int depth = switch_depth[n->id()];
-    int cluster_rep = clusters[n->id()].Get().representative;
-    auto it = cluster_to_depth_node.find(cluster_rep);
-    if (it == cluster_to_depth_node.end()) {
-      cluster_to_depth_node[cluster_rep] = std::make_pair(depth, n);
-    } else {
-      if (it->second.first != depth) {
-        return errors::Internal(
-            "Illegal clustering created, mismatch in depths:", "\n\t",
-            n->DebugString(), "(", clusters[n->id()].Get().representative,
-            ") at depth=", depth, " vs\n\t", it->second.second->DebugString(),
-            "(", clusters[n->id()].Get().representative, ") at depth ",
-            it->second.first);
-      }
-    }
-  }
-
-  struct Hash {
-    size_t operator()(const std::pair<Node*, Cluster>& item) const {
-      return Hash64Combine(hash<Node*>()(item.first),
-                           std::hash<int>()(item.second.representative));
-    }
-  };
-
-  // Merge Switch nodes with common predicate.
-  std::unordered_map<std::pair<Node*, Cluster>, int, Hash> predicate_index;
-  // The nodes in switch_order are in reverse topological order, but the
-  // clustered switches need not be (i.e., when considered as a cluster one
-  // element of a cluster may be later in the topological order than another
-  // node whose cluster is later in the topological order of clustered
-  // switches).
-  for (auto it = switch_order.rbegin(); it != switch_order.rend(); ++it) {
-    const Edge* pred_edge;
-    TF_CHECK_OK((*it)->input_edge(1, &pred_edge));
-    // The predicate can be preceded by a identity node. Look through identity
-    // nodes to predicate.
-    while (pred_edge->src()->IsIdentity()) {
-      TF_CHECK_OK(pred_edge->src()->input_edge(0, &pred_edge));
-    }
-    auto repr = std::make_pair(pred_edge->src(), clusters[(*it)->id()].Get());
-    if (predicate_index.find(repr) == predicate_index.end()) {
-      predicate_index[repr] = switch_clusters.size();
-      switch_clusters.emplace_back(pred_edge);
-      // Generate a name by concatenating with the cluster representative as
-      // there could be multiple switch clusters with the same predicate.
-      switch_clusters[predicate_index[repr]].name = strings::StrCat(
-          pred_edge->src()->name(), "_", repr.second.representative, "_If");
-    }
-    switch_clusters[predicate_index[repr]].switches.push_back(*it);
-  }
-
-  return switch_clusters;
-}
-
-StatusOr<std::vector<Node*>>
-FunctionalizeCond::EnsureDominanceAndReturnNonDominatedControlNodes(
-    const std::unordered_map<Node*, ForwardFlowNode>& branch_map,
-    const std::vector<Node*>& switches) {
-  std::vector<Node*> old_control_nodes;
-  for (const auto& kv : branch_map) {
-    if (kv.second.count != kv.first->in_edges().size()) {
-      std::vector<const Edge*> delete_edges;
-      for (const Edge* in : kv.first->in_edges()) {
-        auto it = branch_map.find(in->src());
-        if (it == branch_map.end()) {
-          if (in->IsControlEdge()) {
-            old_control_nodes.push_back(in->src());
-            delete_edges.push_back(in);
-          } else {
-            if (IsSwitch(in->src())) {
-              if (std::find(switches.begin(), switches.end(), in->src()) ==
-                  switches.end()) {
-                return errors::Internal(
-                    "Unexpected switch node found during flow forward: ",
-                    in->src()->DebugString());
-              }
-              continue;
-            }
-            return errors::InvalidArgument(
-                "Value ", kv.first->name(), "'s input, ", in->src()->name(),
-                ", is not dominated by switch nodes ", NodesToString(switches));
-          }
-        }
-      }
-      // Remove control edges from nodes that are not dominated by the switch
-      // nodes. New control dependencies will be added between these nodes and
-      // the XlaIf node inserted.
-      for (const Edge* e : delete_edges) {
-        graph_->RemoveEdge(e);
-      }
-    }
-  }
-  return old_control_nodes;
-}
-
-StatusOr<
-    std::pair<std::unordered_map<Node*, FunctionalizeCond::ForwardFlowNode>,
-              std::unordered_set<Node*>>>
-FunctionalizeCond::DetermineBranchMapAndFrontier(
-    const SwitchCluster& switch_cluster) {
-  std::unordered_map<Node*, ForwardFlowNode> branch_map;
-  std::unordered_set<Node*> frontier;
-  std::vector<Node*> stack = switch_cluster.switches;
-  std::vector<bool> visited(graph_->num_node_ids(), false);
-  while (!stack.empty()) {
-    Node* n = stack.back();
-    stack.pop_back();
-
-    if (visited[n->id()]) {
-      continue;
-    }
-    visited[n->id()] = true;
-
-    // Propagate branch state along each edge of a switch node.
-    bool sink_only = true;
-    for (const Edge* e : n->out_edges()) {
-      Node* out = e->dst();
-      if (!out->IsOp()) {
-        continue;
-      }
-      sink_only = false;
-      // Propagate branch information.
-      ForwardFlowNode& ffn = branch_map[out];
-      if (IsSwitch(n)) {
-        int index = e->IsControlEdge() ? Branch::kNeither : e->src_output();
-        TF_RETURN_WITH_CONTEXT_IF_ERROR(
-            Join(ForwardFlowNode(Branch(index)), out, &ffn), " when joining ",
-            e->DebugString());
-      } else {
-        TF_RETURN_WITH_CONTEXT_IF_ERROR(Join(branch_map[n], out, &ffn),
-                                        " when joining ", e->DebugString());
-      }
-      if (IsMerge(out)) {
-        if (out->in_edges().size() == ffn.count) {
-          frontier.insert(out);
-        }
-      } else if (!visited[out->id()]) {
-        stack.push_back(out);
-      }
-    }
-    if (sink_only) {
-      if (!IsIdentity(n)) {
-        VLOG(1) << "Feeding into sink: " << n->DebugString();
-      }
-    }
-  }
-
-  if (dump_graphs_) {
-    for (const auto& kv : branch_map) {
-      // Append attribute to the graph if running with logging to make the
-      // changes clearer in the visualization.
-      kv.first->AddAttr("_XlaFunctionalizeBranch",
-                        Branch_Name(kv.second.branch));
-    }
-  }
-  return std::make_pair(std::move(branch_map), std::move(frontier));
-}
-
-Status FunctionalizeCond::FunctionalizeInternal() {
-  TF_ASSIGN_OR_RETURN(std::vector<SwitchCluster> predicate_switch_order,
-                      DeterminePredicateSwitchOrder());
-
-  // Iterate from innermost set of clustered switches to outermost, replacing
-  // matching switch->merge subgraphs with single XlaIf nodes.
-  for (auto it = predicate_switch_order.rbegin();
-       it != predicate_switch_order.rend(); ++it) {
-    auto& ps = *it;
-    VLOG(3) << "Flow down from: " << ps.ToString();
-
-    std::unordered_map<Node*, ForwardFlowNode> branch_map;
-    std::unordered_set<Node*> frontier;
-    TF_ASSIGN_OR_RETURN(std::tie(branch_map, frontier),
-                        DetermineBranchMapAndFrontier(ps));
-
-    if (dump_graphs_)
-      LOG(INFO) << "FunctionalizeControlFlow (before XlaIf conversion): "
-                << dump_graph::DumpGraphToFile("functionalize_bc", *graph_,
-                                               library_);
-    TF_RETURN_IF_ERROR(ValidateFrontier(branch_map, frontier));
-
-    struct Hash {
-      size_t operator()(const std::pair<Node*, int>& item) const {
-        return Hash64Combine(hash<Node*>()(item.first),
-                             std::hash<int>()(item.second));
-      }
-    };
-
-    // Sort the merge and switch nodes using NodeCmp. The switch-nodes are
-    // further grouped (post sorting) by input to the switch node as in the
-    // functionalized form each input will be passed in only once. This grouping
-    // should retain the sorted order.
-    CondArgNodes cond_arg_nodes;
-    std::sort(ps.switches.begin(), ps.switches.end(), NodeCmp());
-    std::unordered_map<std::pair<Node*, int>, int, Hash> input_index;
-    for (Node* switch_node : ps.switches) {
-      const Edge* e;
-      TF_RETURN_IF_ERROR(switch_node->input_edge(0, &e));
-      std::pair<Node*, int> key = std::make_pair(e->src(), e->src_output());
-      if (input_index.find(key) == input_index.end()) {
-        input_index[key] = cond_arg_nodes.size();
-        cond_arg_nodes.emplace_back(key.first, key.second);
-      }
-      cond_arg_nodes.at(input_index.at(key)).switches.push_back(switch_node);
-    }
-    std::vector<Node*> merge_nodes(frontier.begin(), frontier.end());
-    std::sort(merge_nodes.begin(), merge_nodes.end(), NodeCmp());
-
-    TF_ASSIGN_OR_RETURN(std::vector<Node*> old_control_nodes,
-                        EnsureDominanceAndReturnNonDominatedControlNodes(
-                            branch_map, ps.switches));
-
-    TF_ASSIGN_OR_RETURN(Node * if_node,
-                        ConvertToXlaIf(cond_arg_nodes, ps, merge_nodes));
-    for (Node* old : old_control_nodes) {
-      graph_->AddControlEdge(old, if_node);
-    }
-
-    for (auto& del_kv : branch_map) {
-      graph_->RemoveNode(del_kv.first);
-    }
-    for (auto& kv : cond_arg_nodes) {
-      for (Node* node : kv.switches) {
-        graph_->RemoveNode(node);
-      }
-    }
-    if (dump_graphs_)
-      LOG(INFO) << "FunctionalizeControlFlow (after XlaIf conversion): "
-                << dump_graph::DumpGraphToFile("functionalize_ac", *graph_,
-                                               library_);
-  }
-  return Status::OK();
-}
-
-StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
-    const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster,
-    const std::vector<Node*>& merge_nodes) {
-  VLOG(2) << "Build if op for " << switch_cluster.name;
-
-  NodeDef if_def;
-  // Create a new If node using the name of the merge node.
-  NodeDefBuilder builder(switch_cluster.name, "XlaIf");
-  string branch[] = {"else_branch", "then_branch"};
-  for (int i = 0; i < 2; ++i) {
-    static std::atomic<int64> sequence_num(0LL);
-    int64 id = ++sequence_num;
-
-    NameAttrList body_name;
-    body_name.set_name(
-        strings::StrCat("_functionalize_if_", branch[i], "_", id));
-    auto body = xla::MakeUnique<Graph>(graph_->op_registry());
-    TF_RETURN_IF_ERROR(ExtractBody(cond_arg_nodes, switch_cluster.switches,
-                                   merge_nodes, i, body.get()));
-    VLOG(3) << "Body " << branch[i] << ": " << DebugString(body.get());
-    FunctionDef body_fdef;
-    TF_RETURN_IF_ERROR(GraphToFunctionDef(*body, body_name.name(), &body_fdef));
-    TF_RETURN_IF_ERROR(library_->AddFunctionDef(body_fdef));
-    builder.Attr(branch[i], body_name);
-  }
-
-  // Build input type.
-  std::vector<NodeDefBuilder::NodeOut> inputs;
-  DataTypeVector in_arg_types;
-  for (auto& kv : cond_arg_nodes) {
-    bool inserted = false;
-    for (const Node* arg : kv.switches) {
-      const Edge* in_edge;
-      TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
-      if (in_edge->IsControlEdge()) {
-        builder.ControlInput(in_edge->src()->name());
-      } else {
-        if (!inserted) {
-          DataType dtype = arg->input_type(0);
-          inputs.emplace_back(NodeDefBuilder::NodeOut(
-              in_edge->src()->name(), in_edge->src_output(), dtype));
-          in_arg_types.push_back(dtype);
-          inserted = true;
-        }
-      }
-    }
-  }
-  builder.Attr("Tin", in_arg_types);
-
-  // Build output type.
-  DataTypeVector out_type;
-  for (const Node* merge : merge_nodes) {
-    DataType dtype = merge->output_type(0);
-    out_type.push_back(dtype);
-  }
-  builder.Attr("Tout", out_type);
-
-  builder.Attr("Tcond", DT_BOOL);
-  builder.Device(switch_cluster.predicate_edge->src()->assigned_device_name());
-  // Conditional should be the first input ...
-  builder.Input(NodeDefBuilder::NodeOut(
-      switch_cluster.predicate_edge->src()->name(),
-      switch_cluster.predicate_edge->src_output(),
-      switch_cluster.predicate_edge->src()->output_type(0)));
-  // ... followed by the other inputs.
-  builder.Input(inputs);
-
-  TF_RETURN_IF_ERROR(builder.Finalize(&if_def));
-  TF_ASSIGN_OR_RETURN(Node * if_node, AddNode(if_def, graph_));
-  return if_node;
-}
-
-Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
-                                      const std::vector<Node*>& switches,
-                                      const std::vector<Node*>& merge_nodes,
-                                      int input_edge, Graph* body) {
-  VLOG(2) << "ExtractBody for " << NodesToString(merge_nodes) << " along edge "
-          << input_edge;
-  std::vector<bool> squash_src_outputs(graph_->num_node_ids(), false);
-  std::vector<Node*> node_map(graph_->num_node_ids(), nullptr);
-  int arg_count = 0;
-  for (auto& kv : cond_arg_nodes) {
-    Node* arg_node = nullptr;
-    for (const auto* arg : kv.switches) {
-      DataType dtype = arg->input_type(0);
-      if (arg_node == nullptr) {
-        TF_ASSIGN_OR_RETURN(arg_node, BuildArgNode(body, dtype, arg_count++));
-      }
-      node_map.at(arg->id()) = arg_node;
-      squash_src_outputs.at(arg->id()) = true;
-    }
-  }
-
-  std::vector<Node*> stack;
-  stack.reserve(merge_nodes.size());
-  for (int j = 0; j < merge_nodes.size(); ++j) {
-    Node* node = merge_nodes[j];
-    TF_ASSIGN_OR_RETURN(node_map.at(node->id()),
-                        BuildRetvalNode(body, node->output_type(0),
-                                        /*index=*/j));
-    const Edge* in_edge;
-    TF_RETURN_IF_ERROR(node->input_edge(input_edge, &in_edge));
-    Node* in = in_edge->src();
-    if (node_map.at(in->id()) == nullptr) {
-      node_map.at(in->id()) = body->CopyNode(in);
-    }
-
-    if (std::find(switches.begin(), switches.end(), in) == switches.end()) {
-      body->AddEdge(node_map.at(in->id()), in_edge->src_output(),
-                    node_map.at(node->id()), 0);
-    } else {
-      body->AddEdge(node_map.at(in->id()), 0, node_map.at(node->id()), 0);
-      // Don't include input nodes that are already just returned in stack.
-      continue;
-    }
-    stack.push_back(in);
-  }
-
-  return CopySubgraph(*graph_, nullptr, stack, squash_src_outputs, &node_map,
-                      body);
-}
-
-Status FunctionalizeCond::AddInputEdges(const CondArgNodes& cond_arg_nodes,
-                                        const Edge* predicate_edge,
-                                        Node* if_node) {
-  VLOG(3) << "AddInputEdges for " << if_node->name();
-  int index = 0;
-  graph_->AddEdge(predicate_edge->src(), predicate_edge->src_output(), if_node,
-                  index++);
-  for (auto& arg : cond_arg_nodes) {
-    if (arg.src_output == Graph::kControlSlot) {
-      graph_->AddControlEdge(arg.src, if_node);
-    } else {
-      graph_->AddEdge(arg.src, arg.src_output, if_node, index++);
-    }
-  }
-  return Status::OK();
-}
-
-Status FunctionalizeCond::AddOutputEdges(const std::vector<Node*>& outputs,
-                                         Node* if_node) {
-  VLOG(3) << "AddOutputEdges for " << if_node->name();
-  for (int i = 0; i < outputs.size(); ++i) {
-    Node* node = outputs[i];
-    std::vector<const Edge*> edges(node->out_edges().begin(),
-                                   node->out_edges().end());
-    for (const Edge* edge : edges) {
-      Node* dst = edge->dst();
-      int dst_input = edge->dst_input();
-
-      if (edge->src_output() > 0) {
-        return errors::Unimplemented("Output of index (", edge->src_output(),
-                                     ") of merge node ", node->name());
-      }
-
-      int src_output =
-          dst_input == Graph::kControlSlot ? Graph::kControlSlot : i;
-      graph_->RemoveEdge(edge);
-      graph_->AddEdge(if_node, src_output, dst, dst_input);
-    }
-  }
-  return Status::OK();
-}
-
-StatusOr<Node*> FunctionalizeCond::ConvertToXlaIf(
-    const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster,
-    const std::vector<Node*>& merge_nodes) {
-  VLOG(1) << "ConvertToXlaIf for " << switch_cluster.ToString() << " -> "
-          << NodesToString(merge_nodes);
-
-  // Extract bodies and builds a If operator.
-  TF_ASSIGN_OR_RETURN(
-      Node * if_node,
-      BuildAndAddXlaIfOp(cond_arg_nodes, switch_cluster, merge_nodes));
-  TF_RETURN_IF_ERROR(
-      AddInputEdges(cond_arg_nodes, switch_cluster.predicate_edge, if_node));
-  TF_RETURN_IF_ERROR(AddOutputEdges(merge_nodes, if_node));
-  // Check that the if_node doesn't feed into itself.
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      CheckNoCycleContains(if_node, graph_->num_node_ids()),
-      "ConvertToXlaIf failed.");
-
-  return if_node;
-}
-
-Status FunctionalizeCond::Functionalize(Graph* graph,
-                                        FunctionLibraryDefinition* library) {
-  VLOG(1) << "FunctionalizeCond::Functionalize";
-  FunctionalizeCond fc(graph, library, /*dump_graphs=*/VLOG_IS_ON(2));
-  return fc.FunctionalizeInternal();
-}
-
-}  // namespace
-
-// Transformation that converts TensorFlow's graph control flow constructs into
-// functional equivalents.
-Status FunctionalizeControlFlow(Graph* graph,
-                                FunctionLibraryDefinition* library) {
-  return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library);
-}
-
 Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
                                 Graph* graph,
                                 FunctionLibraryDefinition* library) {
@@ -1462,98 +46,26 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
           << dump_graph::DumpGraphToFile("functionalize_initial", *graph,
                                          library);
 
-  // Note: BuildControlFlowInfo() requires that the graph's source node is
-  // connected to all source nodes in the graph. Many graphs violate this
-  // invariant.
-  std::vector<ControlFlowInfo> cf_info;
-  std::vector<string> unreachable_nodes;
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      BuildControlFlowInfo(graph, &cf_info, &unreachable_nodes),
-      "FunctionalizeControlFlow failed");
-  if (!unreachable_nodes.empty()) {
-    return errors::InvalidArgument(
-        "The following nodes are unreachable from the source in the graph: ",
-        errors::FormatNodeNamesForError(unreachable_nodes));
-  }
-
-  // Builds Frames, indexed by name.
-  std::unordered_map<string, Frame> frames;
-  for (Node* node : graph->op_nodes()) {
-    const ControlFlowInfo& cf = cf_info[node->id()];
-
-    VLOG(2) << "node: " << node->name() << " (" << node->id()
-            << ") frame_name: " << cf.frame_name
-            << " frame: " << (cf.frame ? cf.frame->name() : "---")
-            << " parent_frame: "
-            << (cf.parent_frame ? cf.parent_frame->name() : "---");
-    TF_RET_CHECK(cf.frame != nullptr && cf.parent_frame != nullptr);
-
-    Frame& frame = frames[cf.frame_name];
-    Frame* parent = &frames[cf_info[cf.parent_frame->id()].frame_name];
-    if (frame.parent == nullptr) {
-      frame.parent = parent;
-      frame.name = cf.frame_name;
-      ++parent->num_children;
-    }
-
-    if (IsEnter(node)) {
-      Arg arg;
-      arg.enter = node;
-      TF_RETURN_IF_ERROR(GetNodeAttr(arg.enter->attrs(), "is_constant",
-                                     &arg.is_loop_invariant));
-      frame.args.push_back(arg);
-    } else if (IsLoopCond(node)) {
-      frame.loop_cond = node;
-    }
-    frame.nodes.insert(node);
-  }
-
-  // Adds frames with no children (i.e., the innermost frames) to a worklist.
-  std::deque<Frame*> worklist;
-  for (auto& frame : frames) {
-    if (frame.second.num_children == 0) {
-      worklist.push_back(&frame.second);
-    }
-  }
-
-  // Eliminate loops from innermost to outermost.
-  while (!worklist.empty()) {
-    Frame* frame = worklist.front();
-    worklist.pop_front();
-    if (frame->parent == frame) {
-      // Skip the root frame.
-      continue;
-    }
-
-    TF_RETURN_IF_ERROR(
-        FunctionalizeLoop(lookup_library, graph, frame, library));
-
-    // If the parent has no remaining children, add it to the worklist.
-    --frame->parent->num_children;
-    if (frame->parent->num_children == 0) {
-      worklist.push_back(frame->parent);
-    }
-  }
-  // There should be no cycle at this point, since while loops have been removed
-  // from graph.
-  // Check that the newly added XlaWhile nodes don't feed into themselves.
-  for (const Node* node : graph->op_nodes()) {
-    if (node->def().op() == "XlaWhile") {
-      TF_RETURN_WITH_CONTEXT_IF_ERROR(
-          CheckNoCycleContains(node, graph->num_node_ids()),
-          "FunctionalizeLoop failed.");
-    }
-  }
+  // Functionalize and remove while loops from graph.
+  TF_RETURN_IF_ERROR(FunctionalizeWhileLoop(lookup_library, graph, library));
 
   // FunctionalizeControlFlow is invoked for every function, so the loops's
   // bodies and conditionals that were extracted into functions will be handled
   // in successive invocations.
-  TF_RETURN_IF_ERROR(FunctionalizeCond::Functionalize(graph, library));
+  TF_RETURN_IF_ERROR(FunctionalizeCond(graph, library));
 
   VLOG(2) << "FunctionalizeControlFlow (final): "
           << dump_graph::DumpGraphToFile("functionalize_final", *graph,
                                          library);
+
   return Status::OK();
 }
 
+// Transformation that converts TensorFlow's graph control flow constructs into
+// functional equivalents.
+Status FunctionalizeControlFlow(Graph* graph,
+                                FunctionLibraryDefinition* library) {
+  return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
index d941041d15..55600f2a8b 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
@@ -16,14 +16,16 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_
 #define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_H_
 
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/graph/graph.h"
 
 namespace tensorflow {
 
 // Transformation that converts tf.while_loop() loops into functional While
-// operators, suitable for XLA compilation. If lookup_library is provided, use
-// it to make the library for control flow self-contained.
+// operators and tf.cond() conditionals into function If operators, suitable for
+// XLA compilation. If lookup_library is provided, use it to make the library
+// for control flow self-contained.
 Status FunctionalizeControlFlow(Graph* graph,
                                 FunctionLibraryDefinition* library);
 Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index ccf249b35d..cc52057f21 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -37,12 +37,12 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-// Returns the names of the "then" and "else" functions for the XlaIf node in a
+// Returns the names of the "then" and "else" functions for the If node in a
 // graph.
 Status FindIfThenAndElse(const GraphDef& graph, string* op_name,
                          NameAttrList* then_fn, NameAttrList* else_fn) {
   for (const NodeDef& node : graph.node()) {
-    if (node.op() == "XlaIf") {
+    if (node.op() == "If") {
       *op_name = node.name();
       const NameAttrList* result;
       TF_RETURN_IF_ERROR(GetNodeAttr(node, "then_branch", &result));
@@ -52,7 +52,7 @@ Status FindIfThenAndElse(const GraphDef& graph, string* op_name,
       return Status::OK();
     }
   }
-  return errors::NotFound("No XlaIf node found in graph");
+  return errors::NotFound("No If node found in graph");
 }
 
 // Graph:
@@ -115,8 +115,13 @@ TEST(FunctionalizeControlFlow, Conditional) {
     auto if_op = ops::XlaIf(scope.WithOpName(op_name), less,
                             std::initializer_list<Input>{less, y, x}, then_fn,
                             else_fn, {DT_INT32});
+    auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]);
     GraphDef expected;
     TF_EXPECT_OK(scope.ToGraphDef(&expected));
+    // TODO(jpienaar): Create wrapper for IfOp.
+    for (NodeDef& n : *expected.mutable_node()) {
+      if (n.op() == "XlaIf") n.set_op("If");
+    }
     TF_EXPECT_GRAPH_EQ(expected, graph_def);
   }
 
@@ -1013,63 +1018,5 @@ TEST(FunctionalizeControlFlow, Complex) {
   }
 }
 
-TEST(FunctionalizeControlFlow, Cycle) {
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  //   -----------------------------------------------------
-  //   |                                                   |
-  //   |                                                   v
-  // less -> switch_1 --> add -> merge_1 -> identity -> switch_2
-  //            |          ^                               |
-  //            |          |                               v
-  //            --------> one -------------------------> add_2 ---> merge_2
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-
-    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
-    auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
-    auto switch_1 = ops::Switch(scope.WithOpName("cond/Switch"), x, less);
-    auto two =
-        ops::Const<int32>(scope.WithOpName("cond/two")
-                              .WithControlDependencies(switch_1.output_true),
-                          2);
-    auto mul = ops::Multiply(scope.WithOpName("cond/true/mul"),
-                             switch_1.output_true, two);
-    auto one =
-        ops::Const<int32>(scope.WithOpName("cond/one")
-                              .WithControlDependencies(switch_1.output_false),
-                          1);
-    auto add = ops::Add(scope.WithOpName("cond/false/add"),
-                        switch_1.output_false, one);
-
-    auto merge_1 = ops::Merge(scope.WithOpName("cond/Merge"),
-                              std::initializer_list<Input>{add, mul});
-    auto identity =
-        ops::Identity(scope.WithOpName("cond/Merge/identity"), merge_1.output);
-    auto switch_2 =
-        ops::Switch(scope.WithOpName("grad/cond/Switch"), identity, less);
-    auto add_2 = ops::Add(scope.WithOpName("cond_2/false/add"),
-                          switch_2.output_false, one);
-    auto mul_2 = ops::Multiply(scope.WithOpName("cond_2/true/mul"),
-                               switch_2.output_true, two);
-    auto merge_2 = ops::Merge(scope.WithOpName("cond_2/Merge"),
-                              std::initializer_list<Input>{add_2, mul_2});
-    TF_ASSERT_OK(scope.ToGraph(graph.get()));
-  }
-  // No cycle before functionalize control flow.
-  TF_EXPECT_OK(graph::ValidateGraphHasNoCycle(*graph));
-  FunctionLibraryDefinition library(OpRegistry::Global(), {});
-  // switch_1 and switch_2 have the same switch depth. They are replaced by a
-  // single XlaIf node during FunctionalizeControlFlow, resulting in a cycle:
-  // less -> XlaIf <--> identity.
-  Status status = FunctionalizeControlFlow(graph.get(), &library);
-  EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Detected a cycle"))
-      << status.error_message();
-  EXPECT_TRUE(
-      str_util::StrContains(status.error_message(), "{{node cond/Less_5_If}}"))
-      << status.error_message();
-}
-
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc
new file mode 100644
index 0000000000..924fcdd9cd
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
+
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+
+bool NodeCmpByNameResourcesLast::operator()(const Node* lhs,
+                                            const Node* rhs) const {
+  bool lhs_is_resource =
+      lhs->num_inputs() > 0 ? (lhs->input_type(0) == DT_RESOURCE) : false;
+  bool rhs_is_resource =
+      rhs->num_inputs() > 0 ? (rhs->input_type(0) == DT_RESOURCE) : false;
+  return std::tie(lhs_is_resource, lhs->name()) <
+         std::tie(rhs_is_resource, rhs->name());
+}
+
+xla::StatusOr<Node*> AddNodeDefToGraph(const NodeDef& node_def, Graph* graph) {
+  Status status;
+  Node* inserted_node = graph->AddNode(node_def, &status);
+  if (!status.ok()) {
+    return status;
+  }
+  return inserted_node;
+}
+
+xla::StatusOr<Node*> BuildRetvalNode(Graph* graph, DataType type, int index) {
+  const char* const kRetValOp = "_Retval";
+  NodeDef ret_def;
+  ret_def.set_op(kRetValOp);
+  ret_def.set_name(strings::StrCat(kRetValOp, index));
+  AddNodeAttr("T", type, &ret_def);
+  AddNodeAttr("index", index, &ret_def);
+  return AddNodeDefToGraph(ret_def, graph);
+}
+
+// Check that the graph has no cycle containing the given node.
+Status CheckNodeNotInCycle(const Node* node, const int num_nodes) {
+  std::vector<const Node*> ready;
+  ready.push_back(node);
+  std::vector<bool> visited(num_nodes);
+  while (!ready.empty()) {
+    const Node* current_node = ready.back();
+    ready.pop_back();
+    visited[current_node->id()] = true;
+    for (const Edge* out : current_node->out_edges()) {
+      if (out->dst() == node) {
+        return errors::Internal("Detected a cycle: ", FormatNodeForError(*node),
+                                " (", node->def().op(), ") feeds into itself.");
+      } else if (!visited[out->dst()->id()]) {
+        ready.push_back(out->dst());
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h
new file mode 100644
index 0000000000..61940e3586
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.h
@@ -0,0 +1,57 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/graph/graph.h"
+
+// Utility functions shared between functionalize cond and while.
+
+namespace tensorflow {
+
+// Check that the graph has no cycle containing the given node.
+Status CheckNodeNotInCycle(const Node* node, const int num_nodes);
+
+// Comparison function used for sorting nodes consistently.
+// a) resource variables are last, and
+// b) sort lexicographically by name (for deterministic output).
+struct NodeCmpByNameResourcesLast {
+  bool operator()(const Node* lhs, const Node* rhs) const;
+};
+
+// Returns the Node* created from the NodeDef in the Graph.
+xla::StatusOr<Node*> AddNodeDefToGraph(const NodeDef& node_def, Graph* graph);
+
+// Build a retval node of given type and index.
+xla::StatusOr<Node*> BuildRetvalNode(Graph* graph, DataType type, int index);
+
+// Returns a textual representation of the names of the nodes in the input.
+template <typename T>
+string NodesToString(const T& nodes) {
+  return strings::StrCat("{",
+                         absl::StrJoin(nodes, ",",
+                                       [](string* output, const Node* node) {
+                                         strings::StrAppend(output,
+                                                            node->name());
+                                       }),
+                         "}");
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_CONTROL_FLOW_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc
new file mode 100644
index 0000000000..6e3c4b0e0f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_while.cc
@@ -0,0 +1,668 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/functionalize_while.h"
+
+#include <algorithm>
+#include <deque>
+#include <stack>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/jit/union_find.h"
+#include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/graph/node_builder.h"
+
+namespace tensorflow {
+namespace {
+
+using xla::StatusOr;
+
+// Information about a loop argument.
+struct Arg {
+  // Every loop argument has an Enter node.
+  Node* enter;
+
+  // Is the loop argument a loop-invariant value? Taken from the `is_constant`
+  // attribute on the Enter node.
+  bool is_loop_invariant;
+
+  // If 'is_loop_invariant' is true, the following are all nullptr. Non-constant
+  // arguments must have all of the following nodes:
+  Node* merge = nullptr;
+  Node* switch_node = nullptr;
+  Node* next_iteration = nullptr;
+  Node* exit = nullptr;
+};
+
+// Information about a loop frame.
+struct Frame {
+  string name;
+
+  // Pointer to the parent frame. The root frame has a pointer to itself.
+  Frame* parent = nullptr;
+  int num_children = 0;
+
+  // Arguments to this loop.
+  std::vector<Arg> args;
+
+  // The loop condition of the loop. There should be exactly one loop condition
+  // in every loop.
+  Node* loop_cond = nullptr;
+
+  // Set of nodes that belong to the loop frame.
+  std::unordered_set<Node*> nodes;
+};
+
+// Copies a subgraph from `graph` to `output` by performing a reverse DFS
+// starting at nodes in vector `stack`.
+// `node_map` is a vector indexed by source node ID to dest nodes.
+// Does not traverse into nodes in `node_map`, so by adding nodes to `node_map`
+// before the traversal clients can cut the graph. If a frame is provided (frame
+// != nullptr), then this functions will return an error if the
+// traversal leaves 'frame'; the client must add enough nodes to `node_map` to
+// cut the graph and prevent the traversal from escaping.
+//
+// `squash_src_outputs` contains a bool for each source node ID. If true, then
+// the source output on that node will be replaced by zero when copied. This is
+// used when replacing a Switch node with an _Arg node. The output we are
+// taking from the Switch node was not necessarily the first output, but _Arg
+// nodes only have one output. By adding the Switch node to `squash_src_outputs`
+// we rewrite the src_output of the corresponding edge to be 0.
+Status CopySubgraph(const Graph& graph, const Frame* frame,
+                    std::vector<Node*> stack,
+                    const std::vector<bool>& squash_src_outputs,
+                    std::vector<Node*>* node_map, Graph* output) {
+  VLOG(3) << "Stack: " << NodesToString(stack);
+  std::vector<bool> visited(graph.num_node_ids(), false);
+  while (!stack.empty()) {
+    Node* n = stack.back();
+    stack.pop_back();
+
+    VLOG(5) << "Copying node " << n->name();
+
+    if (visited[n->id()]) continue;
+    visited[n->id()] = true;
+
+    for (const Edge* e : n->in_edges()) {
+      Node* src = e->src();
+      if (frame != nullptr && frame->nodes.find(src) == frame->nodes.end()) {
+        // We traversed out of the loop frame, without encountering a cut node.
+        return errors::Internal("Graph traversal of loop frame ", frame->name,
+                                " escaped frame at ", src->name(),
+                                " without encountering an argument node.");
+      }
+      if ((*node_map)[src->id()] == nullptr) {
+        (*node_map)[src->id()] = output->CopyNode(src);
+        stack.push_back(src);
+      }
+      Node* src_copy = (*node_map)[e->src()->id()];
+      int src_output = squash_src_outputs[e->src()->id()] && !e->IsControlEdge()
+                           ? 0
+                           : e->src_output();
+      Node* dst_copy = (*node_map)[e->dst()->id()];
+      output->AddEdge(src_copy, src_output, dst_copy, e->dst_input());
+    }
+  }
+  return Status::OK();
+}
+
+StatusOr<Node*> BuildArgNode(Graph* graph, DataType type, int index) {
+  const char* const kArgOp = "_Arg";
+  NodeDef arg_def;
+  NodeDefBuilder builder(strings::StrCat(kArgOp, index), kArgOp);
+  builder.Attr("T", type);
+  builder.Attr("index", index);
+  TF_RETURN_IF_ERROR(builder.Finalize(&arg_def));
+  return AddNodeDefToGraph(arg_def, graph);
+}
+
+// Builds a graph for the loop condition.
+Status BuildLoopCondition(const Graph& graph, Frame* frame,
+                          std::unique_ptr<Graph>* cond_output) {
+  VLOG(2) << "Building loop condition for " << frame->name;
+  *cond_output = absl::make_unique<Graph>(graph.op_registry());
+  Graph* output = cond_output->get();
+
+  // Map from nodes in the original graph to the condition graph.
+  std::vector<Node*> node_map(graph.num_node_ids(), nullptr);
+  std::vector<bool> squash_src_outputs(graph.num_node_ids(), false);
+
+  // Build one _Arg node for each Enter node.
+  for (int i = 0; i < frame->args.size(); ++i) {
+    const Arg& arg = frame->args[i];
+
+    TF_ASSIGN_OR_RETURN(Node * arg_node,
+                        BuildArgNode(output, arg.enter->input_type(0), i));
+    if (arg.is_loop_invariant) {
+      node_map[arg.enter->id()] = arg_node;
+    } else {
+      node_map[arg.merge->id()] = arg_node;
+    }
+  }
+
+  // Build a Retval node for the loop condition. The LoopCond nodes are always
+  // boolean because of the type constraints on the LoopCond op.
+  TF_ASSIGN_OR_RETURN(node_map[frame->loop_cond->id()],
+                      BuildRetvalNode(output, DT_BOOL, 0));
+
+  // Performs a reverse DFS, copying nodes and edges to the output graph.
+  // The _Arg and _Retval nodes were added unconditionally above, so we are
+  // guaranteed to get the correct function signature.
+  return CopySubgraph(graph, frame, {frame->loop_cond}, squash_src_outputs,
+                      &node_map, output);
+}
+
+// Builds a graph for the loop body.
+Status BuildLoopBody(const Graph& graph, Frame* frame,
+                     DataTypeVector* arg_types,
+                     std::unique_ptr<Graph>* body_output) {
+  VLOG(2) << "Building loop body for " << frame->name;
+  *body_output = absl::make_unique<Graph>(graph.op_registry());
+  Graph* output = body_output->get();
+
+  // Map from nodes in the original graph to the condition graph.
+  std::vector<Node*> node_map(graph.num_node_ids(), nullptr);
+  std::vector<bool> squash_src_outputs(graph.num_node_ids(), false);
+
+  // Build one _Arg node for each Enter node.
+  std::vector<Node*> next_iterations;
+  next_iterations.reserve(frame->args.size());
+  arg_types->reserve(frame->args.size());
+  for (int i = 0; i < frame->args.size(); ++i) {
+    const Arg& arg = frame->args[i];
+
+    DataType dtype = arg.enter->input_type(0);
+    arg_types->push_back(dtype);
+
+    TF_ASSIGN_OR_RETURN(Node * arg_node, BuildArgNode(output, dtype, i));
+
+    if (dtype == DT_RESOURCE) {
+      // The convention of the XLA bridge is that resource variable arguments
+      // are only inputs to the loop body and have no corresponding output.
+      // TODO(b/37741920): change the convention so that DT_RESOURCE variables
+      // are both inputs and outputs, and then remove this case.
+      TF_RET_CHECK(arg.is_loop_invariant);
+      node_map[arg.enter->id()] = arg_node;
+    } else {
+      TF_ASSIGN_OR_RETURN(Node * retval_node,
+                          BuildRetvalNode(output, dtype, i));
+
+      if (arg.is_loop_invariant) {
+        // Argument is loop-invariant. Forward it from the Arg to the Retval.
+        node_map[arg.enter->id()] = arg_node;
+        output->AddEdge(arg_node, 0, retval_node, 0);
+      } else {
+        // Argument is loop-varying.
+        node_map[arg.switch_node->id()] = arg_node;
+        // The Switch node has two outputs, but _Arg only has one. This tells
+        // the CopySubgraph function to rewrite the output number of edges from
+        // the _Arg node to be 0 rather than copying the output number from the
+        // Switch node.
+        squash_src_outputs[arg.switch_node->id()] = true;
+        node_map[arg.next_iteration->id()] = retval_node;
+        next_iterations.push_back(arg.next_iteration);
+      }
+    }
+  }
+
+  // Performs a reverse DFS, copying nodes and edges to the output graph.
+  // The _Arg and _Retval nodes were added unconditionally above, so we are
+  // guaranteed to get the correct function signature.
+  TF_RETURN_IF_ERROR(CopySubgraph(graph, frame, std::move(next_iterations),
+                                  squash_src_outputs, &node_map, output));
+
+  return Status::OK();
+}
+
+// Copy the FunctionDef of given function from lookup_library to library, if
+// it can be found in lookup_library but is missing from library.
+Status AddMissingFunctionByName(const string& function_name,
+                                const FunctionLibraryDefinition* lookup_library,
+                                FunctionLibraryDefinition* library) {
+  if (!library->Find(function_name) && lookup_library->Find(function_name)) {
+    return library->AddFunctionDef(*lookup_library->Find(function_name));
+  }
+  return Status::OK();
+}
+
+// Iterate over all functions that the given fdef refers to. Copy the missing
+// FunctionDefs from lookup_library to library.
+Status AddMissingFunctionDef(const FunctionDef& fdef,
+                             const FunctionLibraryDefinition* lookup_library,
+                             FunctionLibraryDefinition* library) {
+  TF_RET_CHECK(lookup_library);
+  for (const NodeDef& node : fdef.node_def()) {
+    if (library->Find(node.op())) {
+      continue;
+    }
+    // The function referred by 'SymbolicGradient' node is specified in its
+    // attribute 'f'.
+    if (node.op() == FunctionLibraryDefinition::kGradientOp) {
+      const AttrValue* attr =
+          AttrSlice(&node.attr()).Find(FunctionLibraryDefinition::kFuncAttr);
+      if (!attr) {
+        return errors::InvalidArgument("SymbolicGradient is missing attr: f");
+      }
+      const string& func_name = attr->func().name();
+      TF_RETURN_IF_ERROR(
+          AddMissingFunctionByName(func_name, lookup_library, library));
+      // Copy the user-defined gradient function if it exists.
+      const string grad_name = lookup_library->FindGradient(func_name);
+      if (!grad_name.empty() && library->FindGradient(func_name).empty()) {
+        TF_RETURN_IF_ERROR(
+            AddMissingFunctionByName(grad_name, lookup_library, library));
+        GradientDef grad_def;
+        grad_def.set_function_name(func_name);
+        grad_def.set_gradient_func(grad_name);
+        TF_RETURN_IF_ERROR(library->AddGradientDef(grad_def));
+      }
+    } else if (lookup_library->Find(node.op())) {
+      TF_RETURN_IF_ERROR(
+          library->AddFunctionDef(*lookup_library->Find(node.op())));
+    }
+  }
+  return Status::OK();
+}
+
+Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
+                         Graph* graph, Frame* frame,
+                         FunctionLibraryDefinition* library) {
+  VLOG(2) << "Frame " << frame->name << " before: "
+          << dump_graph::DumpGraphToFile("functionalize_before", *graph,
+                                         library);
+
+  // Split loop-varying Enter nodes with multiple successors. If the same
+  // Tensor is fed as input to multiple loop arguments, we may end up with a
+  // shared Enter node. We clone Enter nodes with multiple successors to
+  // maintain the invariant of a unique Enter node per argument of the final
+  // loop.
+  std::vector<Arg> args;
+  for (const Arg& arg : frame->args) {
+    if (arg.is_loop_invariant) {
+      args.push_back(arg);
+    } else {
+      std::vector<const Edge*> edges(arg.enter->out_edges().begin(),
+                                     arg.enter->out_edges().end());
+      for (int i = 0; i < edges.size(); ++i) {
+        if (edges[i]->IsControlEdge() && edges[i]->dst()->IsSink()) {
+          continue;
+        }
+        TF_RET_CHECK(!edges[i]->IsControlEdge()) << edges[i]->src()->name();
+        Arg new_arg;
+        new_arg.is_loop_invariant = false;
+        if (i == 0) {
+          new_arg.enter = arg.enter;
+        } else {
+          new_arg.enter = graph->CopyNode(arg.enter);
+          frame->nodes.insert(new_arg.enter);
+          for (Edge const* e : arg.enter->in_edges()) {
+            graph->AddEdge(e->src(), e->src_output(), new_arg.enter,
+                           e->IsControlEdge() ? Graph::kControlSlot : 0);
+          }
+          Node* dst = edges[i]->dst();
+          int dst_input = edges[i]->dst_input();
+          graph->RemoveEdge(edges[i]);
+          graph->AddEdge(new_arg.enter, 0, dst, dst_input);
+        }
+        args.push_back(new_arg);
+      }
+    }
+  }
+  frame->args = std::move(args);
+
+  std::sort(frame->args.begin(), frame->args.end(),
+            [](const Arg& a, const Arg& b) {
+              return NodeCmpByNameResourcesLast()(a.enter, b.enter);
+            });
+
+  if (frame->loop_cond == nullptr) {
+    return errors::InvalidArgument("Loop ", frame->name,
+                                   " has no LoopCond node");
+  }
+
+  // Find the set of Switch nodes that are successors of the LoopCond.
+  std::unordered_set<Node*> switches;
+  for (const Edge* edge : frame->loop_cond->out_edges()) {
+    if (!edge->IsControlEdge() && IsSwitch(edge->dst()) &&
+        edge->dst_input() == 1) {
+      switches.insert(edge->dst());
+    }
+  }
+
+  // For each non-constant argument, looks for the following pattern of nodes:
+  // Enter ----> Merge  -------->  Switch  --> Exit
+  //               ^                  ^
+  //               |                  |
+  //         NextIteration         LoopCond
+  //               ^                  ^
+  //               |                  |
+  //              ...                ...
+  for (Arg& arg : frame->args) {
+    if (!arg.is_loop_invariant) {
+      // Follow the edge from the Enter to Merge.
+      const Edge* enter_merge = nullptr;
+      for (const Edge* e : arg.enter->out_edges()) {
+        // Ignore control-edges to the sink node. These are allowed by the
+        // graph invariants, although probably they should have been stripped
+        // off earlier.
+        if (e->IsControlEdge() && e->dst()->IsSink()) {
+          continue;
+        }
+        if (enter_merge != nullptr) {
+          return errors::Internal("Enter node for loop-varying argument ",
+                                  FormatNodeForError(*arg.enter),
+                                  " has multiple successors: ",
+                                  FormatNodeForError(*enter_merge->dst()),
+                                  " and ", FormatNodeForError(*e->dst()));
+        }
+        enter_merge = e;
+      }
+      if (enter_merge == nullptr) {
+        return errors::Internal("Enter node for loop-varying argument ",
+                                FormatNodeForError(*arg.enter),
+                                " has zero successors");
+      }
+      arg.merge = enter_merge->dst();
+      if (!IsMerge(arg.merge)) {
+        return errors::InvalidArgument(
+            "Successor of Enter node for loop-varying argument ",
+            FormatNodeForError(*arg.merge),
+            " is not a Merge node; got: ", arg.merge->type_string());
+      }
+
+      // Find the NextIteration from the merge. There should be two inputs to
+      // the Merge and the NextIteration should be the other input.
+      if (arg.merge->input_types().size() != 2) {
+        return errors::InvalidArgument(
+            "Unexpected number of inputs to Merge node for loop-varying "
+            "argument ",
+            FormatNodeForError(*arg.merge), "; expected 2, got ",
+            arg.merge->input_types().size());
+      }
+      TF_RETURN_IF_ERROR(arg.merge->input_node(1 - enter_merge->dst_input(),
+                                               &arg.next_iteration));
+      if (!IsNextIteration(arg.next_iteration)) {
+        return errors::InvalidArgument(
+            "Expected NextIteration node as input to Merge node; got node ",
+            FormatNodeForError(*arg.next_iteration), " with kind ",
+            arg.next_iteration->type_string());
+      }
+
+      // Find the Switch successor of the Merge. There should be exactly one
+      // Switch node that is a successor of both the Merge and the LoopCond.
+      for (const Edge* edge : arg.merge->out_edges()) {
+        if (edge->dst_input() == 0 && IsSwitch(edge->dst()) &&
+            switches.find(edge->dst()) != switches.end()) {
+          if (arg.switch_node != nullptr) {
+            return errors::InvalidArgument("Duplicate Switch successors to ",
+                                           FormatNodeForError(*arg.merge));
+          }
+          arg.switch_node = edge->dst();
+        }
+      }
+      if (arg.switch_node == nullptr) {
+        return errors::InvalidArgument("Missing Switch successor to ",
+                                       FormatNodeForError(*arg.merge));
+      }
+
+      // Update the device on the Identity outputs of the switch to match their
+      // target. These Identity outputs do not
+
+      // Loop over the switch node's output to:
+      // - Find the Exit successor.
+      // - Set the sharding on all Identity outputs of the switch. These
+      //   identity nodes are values used by the loop body or condition.
+      //   The Identity node may have the wrong device so copy the device from
+      //   one of its outputs instead.
+      std::deque<const Edge*> possible_exit;
+      for (const Edge* edge : arg.switch_node->out_edges()) {
+        if (edge->src_output() == 0) {
+          possible_exit.push_back(edge);
+        }
+        if (IsIdentity(edge->dst())) {
+          TF_RETURN_IF_ERROR(
+              SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true));
+        }
+      }
+      // TODO(b/67425339): Allow general graph between switch and exit.
+      while (!possible_exit.empty()) {
+        const Edge* edge = possible_exit.front();
+        possible_exit.pop_front();
+        if (IsExit(edge->dst())) {
+          if (arg.exit != nullptr) {
+            return errors::InvalidArgument(
+                "Duplicate Exit successors to ",
+                FormatNodeForError(*arg.switch_node));
+          }
+          arg.exit = edge->dst();
+        } else {
+          if (!IsIdentity(edge->dst())) {
+            return errors::Unimplemented("General graph between switch (",
+                                         FormatNodeForError(*arg.switch_node),
+                                         ") and exit node of frame ",
+                                         frame->name, " not supported yet.");
+          }
+          for (const Edge* out : edge->dst()->out_edges()) {
+            possible_exit.push_back(out);
+          }
+        }
+      }
+    }
+  }
+
+  // Builds the condition and body functions.
+  std::unique_ptr<Graph> cond_graph;
+  TF_RETURN_IF_ERROR(BuildLoopCondition(*graph, frame, &cond_graph));
+  DataTypeVector arg_types;
+  std::unique_ptr<Graph> body_graph;
+  TF_RETURN_IF_ERROR(BuildLoopBody(*graph, frame, &arg_types, &body_graph));
+
+  VLOG(2) << "Frame " << frame->name << " condition: "
+          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph, library)
+          << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph);
+
+  static std::atomic<int64> sequence_num(0LL);
+  int64 id = ++sequence_num;
+  NameAttrList cond_name;
+  cond_name.set_name(strings::StrCat("_functionalize_cond_", id));
+  NameAttrList body_name;
+  body_name.set_name(strings::StrCat("_functionalize_body_", id));
+  FunctionDef cond_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*cond_graph, cond_name.name(), &cond_fdef));
+  FunctionDef body_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*body_graph, body_name.name(), &body_fdef));
+
+  TF_RETURN_IF_ERROR(library->AddFunctionDef(cond_fdef));
+  TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef));
+  if (lookup_library) {
+    // Copy missing FunctionDefs from lookup_library to library to make library
+    // self-contained.
+    TF_RETURN_IF_ERROR(
+        AddMissingFunctionDef(cond_fdef, lookup_library, library));
+    TF_RETURN_IF_ERROR(
+        AddMissingFunctionDef(body_fdef, lookup_library, library));
+  }
+
+  // Builds a While operator.
+  NodeDef while_def;
+  NodeDefBuilder builder(frame->loop_cond->name(), "XlaWhile");
+  builder.Attr("T", arg_types);
+  builder.Attr("cond", cond_name);
+  builder.Attr("body", body_name);
+  std::vector<NodeDefBuilder::NodeOut> inputs;
+  for (int i = 0; i < frame->args.size(); ++i) {
+    const Arg& arg = frame->args[i];
+    const Edge* in_edge;
+    TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge));
+    if (in_edge->IsControlEdge()) {
+      builder.ControlInput(in_edge->src()->name());
+    } else {
+      inputs.push_back(NodeDefBuilder::NodeOut(
+          in_edge->src()->name(), in_edge->src_output(), arg_types[i]));
+    }
+  }
+  builder.Input(inputs);
+  TF_RETURN_IF_ERROR(builder.Finalize(&while_def));
+  TF_ASSIGN_OR_RETURN(Node * while_node, AddNodeDefToGraph(while_def, graph));
+
+  // Copies edges to the Enter nodes and from the Exit nodes onto the While.
+  for (int i = 0; i < frame->args.size(); ++i) {
+    const Arg& arg = frame->args[i];
+    const Edge* in_edge;
+    TF_RETURN_IF_ERROR(arg.enter->input_edge(0, &in_edge));
+    if (in_edge->IsControlEdge()) {
+      graph->AddControlEdge(in_edge->src(), while_node);
+    } else {
+      graph->AddEdge(in_edge->src(), in_edge->src_output(), while_node, i);
+    }
+
+    if (!arg.is_loop_invariant) {
+      // Add output edges if the output of the loop is consumed.
+      if (arg.exit != nullptr) {
+        std::vector<const Edge*> edges(arg.exit->out_edges().begin(),
+                                       arg.exit->out_edges().end());
+        for (const Edge* edge : edges) {
+          Node* dst = edge->dst();
+          int dst_input = edge->dst_input();
+          graph->RemoveEdge(edge);
+
+          if (dst_input == Graph::kControlSlot) {
+            graph->AddControlEdge(while_node, dst);
+          } else {
+            graph->AddEdge(while_node, i, dst, dst_input);
+          }
+        }
+      }
+    }
+  }
+
+  // Remove the old nodes from the graph, and add the while node to the parent
+  // frame.
+  for (Node* node : frame->nodes) {
+    graph->RemoveNode(node);
+  }
+  frame->nodes.clear();
+  frame->parent->nodes.insert(while_node);
+
+  VLOG(2) << "Frame " << frame->name << " after: "
+          << dump_graph::DumpGraphToFile("functionalize_after", *graph,
+                                         library);
+
+  return Status::OK();
+}
+}  // namespace
+
+Status FunctionalizeWhileLoop(const FunctionLibraryDefinition* lookup_library,
+                              Graph* graph,
+                              FunctionLibraryDefinition* library) {
+  // Note: BuildControlFlowInfo() requires that the graph's source node is
+  // connected to all source nodes in the graph. Many graphs violate this
+  // invariant.
+  std::vector<ControlFlowInfo> cf_info;
+  std::vector<string> unreachable_nodes;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info, &unreachable_nodes));
+  if (!unreachable_nodes.empty()) {
+    return errors::InvalidArgument(
+        "The following nodes are unreachable from the source in the graph: ",
+        errors::FormatNodeNamesForError(unreachable_nodes));
+  }
+
+  // Builds Frames, indexed by name.
+  std::unordered_map<string, Frame> frames;
+  for (Node* node : graph->op_nodes()) {
+    const ControlFlowInfo& cf = cf_info[node->id()];
+
+    VLOG(2) << "node: " << node->name() << " (" << node->id()
+            << ") frame_name: " << cf.frame_name
+            << " frame: " << (cf.frame ? cf.frame->name() : "---")
+            << " parent_frame: "
+            << (cf.parent_frame ? cf.parent_frame->name() : "---");
+    TF_RET_CHECK(cf.frame != nullptr && cf.parent_frame != nullptr);
+
+    Frame& frame = frames[cf.frame_name];
+    Frame* parent = &frames[cf_info[cf.parent_frame->id()].frame_name];
+    if (frame.parent == nullptr) {
+      frame.parent = parent;
+      frame.name = cf.frame_name;
+      ++parent->num_children;
+    }
+
+    if (IsEnter(node)) {
+      Arg arg;
+      arg.enter = node;
+      TF_RETURN_IF_ERROR(GetNodeAttr(arg.enter->attrs(), "is_constant",
+                                     &arg.is_loop_invariant));
+      frame.args.push_back(arg);
+    } else if (IsLoopCond(node)) {
+      frame.loop_cond = node;
+    }
+    frame.nodes.insert(node);
+  }
+
+  // Adds frames with no children (i.e., the innermost frames) to a worklist.
+  std::deque<Frame*> worklist;
+  for (auto& frame : frames) {
+    if (frame.second.num_children == 0) {
+      worklist.push_back(&frame.second);
+    }
+  }
+
+  // Eliminate loops from innermost to outermost.
+  while (!worklist.empty()) {
+    Frame* frame = worklist.front();
+    worklist.pop_front();
+    if (frame->parent == frame) {
+      // Skip the root frame.
+      continue;
+    }
+
+    TF_RETURN_IF_ERROR(
+        FunctionalizeLoop(lookup_library, graph, frame, library));
+
+    // If the parent has no remaining children, add it to the worklist.
+    --frame->parent->num_children;
+    if (frame->parent->num_children == 0) {
+      worklist.push_back(frame->parent);
+    }
+  }
+
+  // There should be no cycle at this point, since while loops have been removed
+  // from graph.
+  // Check that the newly added XlaWhile nodes don't feed into themselves.
+  for (const Node* node : graph->op_nodes()) {
+    if (node->def().op() == "XlaWhile") {
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(
+          CheckNodeNotInCycle(node, graph->num_node_ids()),
+          "Functionalizing loop failed.");
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/functionalize_while.h b/tensorflow/compiler/tf2xla/functionalize_while.h
new file mode 100644
index 0000000000..a708c6e4ec
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/functionalize_while.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_
+#define TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Transformation that converts tf.while_loop() loops into functional While
+// operators, suitable for XLA compilation. If lookup_library is provided, use
+// it to make the library for control flow self-contained.
+Status FunctionalizeWhileLoop(const FunctionLibraryDefinition* lookup_library,
+                              Graph* graph, FunctionLibraryDefinition* library);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_FUNCTIONALIZE_WHILE_H_
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index e4fdf0a618..ba37ed3337 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -57,7 +57,8 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
   std::vector<bool> compile_time_constant_flags(expressions.size());
 
   TF_RETURN_IF_ERROR(
-      BackwardsConstAnalysis(*graph, &compile_time_constant_flags));
+      BackwardsConstAnalysis(*graph, &compile_time_constant_flags,
+                             /*compile_time_const_nodes=*/nullptr));
 
   args->resize(expressions.size());
   for (int i = 0; i < args->size(); ++i) {
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index b1366e9e31..c1438f893f 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -22,6 +22,7 @@ tf_kernel_library(
         "bcast_ops.cc",
         "bias_ops.cc",
         "binary_ops.cc",
+        "broadcast_to_op.cc",
         "bucketize_op.cc",
         "cast_op.cc",
         "categorical_op.cc",
@@ -100,6 +101,12 @@ tf_kernel_library(
         "unary_ops.cc",
         "unpack_op.cc",
         "variable_ops.cc",
+        "xla_broadcast_helper_op.cc",
+        "xla_conv_op.cc",
+        "xla_dot_op.cc",
+        "xla_pad_op.cc",
+        "xla_reduce_op.cc",
+        "xla_select_and_scatter_op.cc",
     ],
     hdrs = [
         "index_ops.h",
@@ -108,6 +115,8 @@ tf_kernel_library(
     deps = [
         ":if_op",
         ":while_op",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/lib:batch_dot",
diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
index ba3b1c9dab..2e383b1473 100644
--- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
@@ -16,6 +16,7 @@ limitations under the License.
 // XLA-specific Ops for broadcasting used in gradient
 // code.
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -51,8 +52,8 @@ class BCastArgsOp : public XlaOpKernel {
     BCast bcast(shapes[0], shapes[1]);
     OP_REQUIRES(ctx, bcast.IsValid(),
                 errors::InvalidArgument(
-                    "Incompatible shapes: [", str_util::Join(shapes[0], ","),
-                    "] vs. [", str_util::Join(shapes[1], ","), "]"));
+                    "Incompatible shapes: [", absl::StrJoin(shapes[0], ","),
+                    "] vs. [", absl::StrJoin(shapes[1], ","), "]"));
 
     const int64 len = bcast.output_shape().size();
     Tensor output(DT_INT32, TensorShape({len}));
@@ -105,8 +106,8 @@ class BCastGradArgsOp : public XlaOpKernel {
     BCast bcast(shapes[0], shapes[1]);
     OP_REQUIRES(ctx, bcast.IsValid(),
                 errors::InvalidArgument(
-                    "Incompatible shapes: [", str_util::Join(shapes[0], ","),
-                    "] vs. [", str_util::Join(shapes[1], ","), "]"));
+                    "Incompatible shapes: [", absl::StrJoin(shapes[0], ","),
+                    "] vs. [", absl::StrJoin(shapes[1], ","), "]"));
     Output(ctx, 0, bcast.grad_x_reduce_idx());
     Output(ctx, 1, bcast.grad_y_reduce_idx());
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc
new file mode 100644
index 0000000000..4bd7c74dca
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc
@@ -0,0 +1,101 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
+
+namespace tensorflow {
+namespace {
+
+class BroadcastToOp : public XlaOpKernel {
+ public:
+  explicit BroadcastToOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    TensorShape output_shape;
+    OP_REQUIRES_OK(context, context->ConstantInputAsShape(1, &output_shape));
+
+    OP_REQUIRES(context, input_shape.dims() <= output_shape.dims(),
+                errors::InvalidArgument(
+                    "Input rank (", input_shape.dims(),
+                    ") must be less than or equal to the output rank (",
+                    output_shape.dims(), ")"));
+
+    auto input_dims = input_shape.dim_sizes();
+    auto output_dims = output_shape.dim_sizes();
+
+    // Broadcasting is done right-to-left on right-aligned dimensions; reverse
+    // the two vectors so elements to be broadcast are aligned.
+    absl::c_reverse(input_dims);
+    absl::c_reverse(output_dims);
+
+    std::vector<int64> broadcast_dims;
+    std::vector<int64> broadcast_shape;
+    for (int i = 0; i < output_shape.dims(); ++i) {
+      if (i < input_shape.dims()) {
+        OP_REQUIRES(
+            context,
+            (output_dims[i] == 0 && input_dims[i] == 0) ||
+                (input_dims[i] != 0 && output_dims[i] % input_dims[i] == 0),
+            errors::InvalidArgument("invalid shape to broadcast from ",
+                                    input_shape.DebugString(), " to ",
+                                    output_shape.DebugString()));
+
+        broadcast_dims.push_back(broadcast_shape.size());
+        if (output_dims[i] == input_dims[i] || input_dims[i] == 1) {
+          broadcast_shape.push_back(output_dims[i]);
+        }
+        if (output_dims[i] != input_dims[i]) {
+          // Add dimensions [I, O/I], which we will later flatten to just
+          // [O]. We must do this in two phases since XLA broadcasting does not
+          // support tiling.
+          broadcast_shape.push_back(input_dims[i]);
+          broadcast_shape.push_back(output_dims[i] / input_dims[i]);
+        }
+      } else {
+        broadcast_shape.push_back(output_dims[i]);
+      }
+    }
+    absl::c_reverse(broadcast_dims);
+    int broadcast_shape_size = broadcast_shape.size();
+    for (int64& broadcast_dim : broadcast_dims) {
+      broadcast_dim = broadcast_shape_size - broadcast_dim - 1;
+    }
+    absl::c_reverse(broadcast_shape);
+    xla::XlaOp output = xla::Reshape(
+        xla::BroadcastInDim(context->Input(0),
+                            xla::ShapeUtil::MakeShape(
+                                context->input_xla_type(0), broadcast_shape),
+                            broadcast_dims),
+        output_shape.dim_sizes());
+    context->SetOutput(0, output);
+  }
+};
+
+REGISTER_XLA_OP(Name("BroadcastTo").CompileTimeConstInput("shape"),
+                BroadcastToOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 5da7972397..674720e22f 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -120,45 +120,30 @@ xla::XlaOp CreateExpandedFilterMask(const TensorShape& filter_shape,
                  {expanded_filter_shape.dims() - 2});
 }
 
-// Expands a filter of shape [H, W, ..., M, N] to [H, W, ..., M, M*N] by adding
-// zeros for the cross-depth filters. Used to build a depthwise convolution.
-xla::XlaOp ExpandFilterForDepthwiseConvolution(const TensorShape& filter_shape,
-                                               DataType dtype,
-                                               const xla::XlaOp& filter,
-                                               xla::XlaBuilder* builder) {
-  int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1);
-  int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2);
-  TensorShape expanded_filter_shape =
-      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+// Reshapes a filter of shape [H, W, ..., M, N] to [H, W, ..., 1, M*N]. Used to
+// build a depthwise convolution.
+xla::XlaOp ReshapeFilterForDepthwiseConvolution(const TensorShape& filter_shape,
+                                                const xla::XlaOp& filter) {
+  int64 input_feature_dim = filter_shape.dims() - 2;
+  int64 output_feature_dim = filter_shape.dims() - 1;
+  int64 depthwise_multiplier = filter_shape.dim_size(output_feature_dim);
+  int64 input_feature = filter_shape.dim_size(input_feature_dim);
 
   // Create a [H, W, ..., 1, N*M] reshape of the filter.
-  TensorShape implicit_broadcast_filter_shape = expanded_filter_shape;
-  implicit_broadcast_filter_shape.set_dim(
-      implicit_broadcast_filter_shape.dims() - 2, 1);
-  implicit_broadcast_filter_shape.set_dim(
-      implicit_broadcast_filter_shape.dims() - 1,
-      depthwise_multiplier * input_feature);
-  auto implicit_broadcast_filter =
-      xla::Reshape(filter, implicit_broadcast_filter_shape.dim_sizes());
-
-  // Broadcast the filter to  [H, W, ..., M, M*N].
-  auto expanded_zero = CreateExpandedZero(filter_shape, dtype, builder);
-  auto expanded_filter = xla::Add(implicit_broadcast_filter, expanded_zero);
-
-  // If the filter mask is set, choose the broadcasted filter, othwerwise,
-  // choose zero.
-  return xla::Select(CreateExpandedFilterMask(filter_shape, builder),
-                     expanded_filter, expanded_zero);
+  TensorShape implicit_broadcast_filter_shape = filter_shape;
+  implicit_broadcast_filter_shape.set_dim(input_feature_dim, 1);
+  implicit_broadcast_filter_shape.set_dim(output_feature_dim,
+                                          depthwise_multiplier * input_feature);
+  return xla::Reshape(filter, implicit_broadcast_filter_shape.dim_sizes());
 }
 
-// Inverse of ExpandFilterForDepthwiseConvolution.
+// Reduces the results of the convolution with an expanded filter to the
+// non-expanded filter.
 xla::XlaOp ContractFilterForDepthwiseBackprop(XlaOpKernelContext* ctx,
                                               const TensorShape& filter_shape,
                                               DataType dtype,
                                               const xla::XlaOp& filter_backprop,
                                               xla::XlaBuilder* builder) {
-  TensorShape expanded_filter_shape =
-      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
   auto masked_expanded_filter = xla::Select(
       CreateExpandedFilterMask(filter_shape, builder), filter_backprop,
       CreateExpandedZero(filter_shape, dtype, builder));
@@ -168,8 +153,7 @@ xla::XlaOp ContractFilterForDepthwiseBackprop(XlaOpKernelContext* ctx,
       // ExpandedZero guarantees that only one element is non zero, so there
       // cannot be accumulated precision error.
       xla::Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype),
-                  *ctx->GetOrCreateAdd(dtype),
-                  {expanded_filter_shape.dims() - 2}),
+                  *ctx->GetOrCreateAdd(dtype), {filter_shape.dims() - 2}),
       filter_shape.dim_sizes());
 }
 
@@ -245,15 +229,9 @@ class ConvOp : public XlaOpKernel {
                     "input and filter must have the same depth: ", in_depth,
                     " vs ", input_shape.dim_size(feature_dim)));
 
-    xla::XlaBuilder* b = ctx->builder();
-
     xla::XlaOp filter = ctx->Input(1);
-    TensorShape expanded_filter_shape = filter_shape;
     if (depthwise_) {
-      filter = ExpandFilterForDepthwiseConvolution(
-          filter_shape, ctx->input_type(0), filter, b);
-      expanded_filter_shape =
-          ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+      filter = ReshapeFilterForDepthwiseConvolution(filter_shape, filter);
     }
 
     xla::ConvolutionDimensionNumbers dims;
@@ -280,14 +258,15 @@ class ConvOp : public XlaOpKernel {
       int64 unused_output_size;
       OP_REQUIRES_OK(
           ctx, GetWindowedOutputSizeVerboseV2(
-                   input_shape.dim_size(dim), expanded_filter_shape.dim_size(i),
+                   input_shape.dim_size(dim), filter_shape.dim_size(i),
                    rhs_dilation[i], window_strides[i], padding_,
                    &unused_output_size, &padding[i].first, &padding[i].second));
     }
 
-    xla::XlaOp conv =
-        xla::ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding,
-                                lhs_dilation, rhs_dilation, dims);
+    xla::XlaOp conv = xla::ConvGeneralDilated(
+        ctx->Input(0), filter, window_strides, padding, lhs_dilation,
+        rhs_dilation, dims,
+        /*feature_group_count=*/depthwise_ ? in_depth : 1);
     ctx->SetOutput(0, conv);
   }
 
@@ -388,7 +367,6 @@ class ConvBackpropInputOp : public XlaOpKernel {
                        expanded_filter_shape, out_backprop_shape, dilations_,
                        strides_, padding_, data_format_, &dims));
 
-    xla::XlaBuilder* b = ctx->builder();
     auto filter = ctx->Input(1);
     auto out_backprop = ctx->Input(2);
 
@@ -425,12 +403,6 @@ class ConvBackpropInputOp : public XlaOpKernel {
       rhs_dilation[i] = dilations_[dim];
     }
 
-    // If this is a depthwise convolution, expand the filter.
-    if (depthwise_) {
-      filter = ExpandFilterForDepthwiseConvolution(
-          filter_shape, ctx->input_type(1), filter, b);
-    }
-
     // Mirror the filter in the spatial dimensions.
     xla::XlaOp mirrored_weights = xla::Rev(filter, kernel_spatial_dims);
 
@@ -438,7 +410,11 @@ class ConvBackpropInputOp : public XlaOpKernel {
     //   = gradients (with padding and dilation) <conv> mirrored_weights
     xla::XlaOp in_backprop = xla::ConvGeneralDilated(
         out_backprop, mirrored_weights, /*window_strides=*/ones, padding,
-        lhs_dilation, rhs_dilation, dnums);
+        lhs_dilation, rhs_dilation, dnums,
+        /*feature_group_count=*/
+        depthwise_ ? out_backprop_shape.dim_size(feature_dim) /
+                         filter_shape.dim_size(num_spatial_dims_ + 1)
+                   : 1);
 
     ctx->SetOutput(0, in_backprop);
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 35de96e0aa..44140304fd 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -95,11 +95,11 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape,
   //  operand = s32[3,3] parameter(0)
   //  indices = s32[2] parameter(1)
   //  gather = s32[3,2] gather(operand, indices),
-  //       output_window_dims={0},
-  //       elided_window_dims={1},
-  //       gather_dims_to_operand_dims={1},
+  //       offset_dims={0},
+  //       collapsed_slice_dims={1},
+  //       start_index_map={1},
   //       index_vector_dim=1,
-  //       window_bounds={3, 1}
+  //       slice_sizes={3, 1}
   //
   //
   // Example of an N-D gather pulling out slices of shape [1,1,2] out of a
@@ -108,42 +108,42 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape,
   //  operand = s32[3,3,2] parameter(0)
   //  indices = s32[2,2] parameter(1)
   //  gather = s32[2,2] gather(operand, indices),
-  //       output_window_dims={1},
-  //       elided_window_dims={0,1},
-  //       gather_dims_to_operand_dims={0,1},
+  //       offset_dims={1},
+  //       collapsed_slice_dims={0,1},
+  //       start_index_map={0,1},
   //       index_vector_dim=0,
-  //       window_bounds={1,1,2}
+  //       slice_sizes={1,1,2}
 
   xla::GatherDimensionNumbers dim_numbers;
-  std::vector<int64> window_bounds;
-  window_bounds.reserve(input_shape.dims());
+  std::vector<int64> slice_sizes;
+  slice_sizes.reserve(input_shape.dims());
   for (int64 i = 0; i < input_shape.dims(); i++) {
     int64 window_bound;
     if (axis <= i && i < (axis + num_index_dims)) {
-      dim_numbers.add_elided_window_dims(i);
+      dim_numbers.add_collapsed_slice_dims(i);
       window_bound = 1;
     } else {
       window_bound = input_shape.dim_size(i);
     }
 
-    window_bounds.push_back(window_bound);
+    slice_sizes.push_back(window_bound);
 
     if (i < axis) {
-      dim_numbers.add_output_window_dims(i);
+      dim_numbers.add_offset_dims(i);
     } else if (i >= (axis + num_index_dims)) {
       int64 indices_rank =
           indices_are_nd ? (indices_shape.dims() - 1) : indices_shape.dims();
-      dim_numbers.add_output_window_dims(i + indices_rank - num_index_dims);
+      dim_numbers.add_offset_dims(i + indices_rank - num_index_dims);
     }
   }
 
   dim_numbers.set_index_vector_dim(indices_are_nd ? (indices_shape.dims() - 1)
                                                   : indices_shape.dims());
   for (int64 i = axis; i < axis + num_index_dims; i++) {
-    dim_numbers.add_gather_dims_to_operand_dims(i);
+    dim_numbers.add_start_index_map(i);
   }
 
-  *gather_output = xla::Gather(input, indices, dim_numbers, window_bounds);
+  *gather_output = xla::Gather(input, indices, dim_numbers, slice_sizes);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
index e72200bfbc..19dd38c46e 100644
--- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
@@ -25,7 +25,10 @@ class IdentityOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     for (int i = 0; i < ctx->num_inputs(); ++i) {
-      ctx->SetOutput(i, ctx->Input(i));
+      // Forwards using the underlying op_kernel_context so both tensor and
+      // resource values are forwarded correctly.
+      ctx->op_kernel_context()->set_output(i,
+                                           ctx->op_kernel_context()->input(i));
     }
   }
 
@@ -35,9 +38,10 @@ class IdentityOp : public XlaOpKernel {
 
 // XLA_* devices also register a "real" Identity operator so we suppress the
 // dummy operator using CompilationOnly().
-REGISTER_XLA_OP(Name("Identity").CompilationOnly(), IdentityOp);
-
-REGISTER_XLA_OP(Name("IdentityN").CompilationOnly(), IdentityOp);
+REGISTER_XLA_OP(Name("Identity").AllowResourceTypes().CompilationOnly(),
+                IdentityOp);
+REGISTER_XLA_OP(Name("IdentityN").AllowResourceTypes().CompilationOnly(),
+                IdentityOp);
 REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp);
 REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp);
 REGISTER_XLA_OP(Name("StopGradient"), IdentityOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index 6a7eb8d90c..6e1dbf5472 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -200,21 +200,10 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
     }
   }
 
-  bool resource_variable_seen = false;
-  for (int i = 0; i < ctx->num_inputs(); ++i) {
-    if (ctx->input_type(i) == DT_RESOURCE) {
-      resource_variable_seen = true;
-    } else {
-      OP_REQUIRES(
-          ctx, !resource_variable_seen,
-          errors::FailedPrecondition(
-              "Resource variables and regular inputs cannot be interleaved."));
-    }
-  }
-
-  xla::XlaOp outputs = xla::Conditional(
-      ctx->Input(0), xla::Tuple(b, inputs), *then_result.computation,
-      xla::Tuple(b, inputs), *else_result.computation);
+  auto input_tuple = xla::Tuple(b, inputs);
+  xla::XlaOp outputs =
+      xla::Conditional(ctx->Input(0), input_tuple, *then_result.computation,
+                       input_tuple, *else_result.computation);
   // Sets non-variable outputs.
   for (int i = 0; i < output_types_.size(); ++i) {
     xla::XlaOp output_handle = xla::GetTupleElement(outputs, i);
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index 8d75624e74..8e071bf0b7 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -32,13 +32,13 @@ namespace {
 //
 //    1. S := (N - 1) /  gcd(N-1, R-1)
 //    2. k := (R - 1) /  gcd(N-1, R-1)
-//    3. Convolution(kxk, stride=S, lhs_dilation=k, padding=k-1)
+//    3. Convolution((2k-1)x(2k-1), stride=S, lhs_dilation=k, padding=k-1)
 //
 // For example, to Scale from 7x7 -> 15x15:
 //
 //    1. S := (7-1) / gcd(7-1, 15-1) = 6 / gcd(6, 14) = 6 / 2 = 3
 //    2. k := (15 - 1) / gcd(7-1, 15-1) = 14 / gcd(6, 14) = 14 / 2 = 7
-//    3. Convolution(7x7, stride=3, lhs_dilation=3, padding=2)
+//    3. Convolution(15x15, stride=3, lhs_dilation=7, padding=2)
 //
 //
 // The 7x7 -> 15x15 case is much too large to write out in full as an
@@ -65,6 +65,8 @@ namespace {
 // 1/9 * 3 6 9 6 3
 //       2 4 6 4 2
 //       1 2 3 2 1
+// Note that the convolution kernel matrix is separable and thus we can instead
+// use 2 consecutive 1D kernel of the dimension 2k-1, along each axis.
 
 // Computes the size of the convolutional kernel and stride to use when resizing
 // from in_size to out_size.
@@ -76,7 +78,8 @@ struct ResizeConvolutionDims {
   std::vector<int64> stride;
 };
 ResizeConvolutionDims ComputeResizeConvolutionParameters(
-    gtl::ArraySlice<int64> in_size, gtl::ArraySlice<int64> out_size) {
+    gtl::ArraySlice<int64> in_size, gtl::ArraySlice<int64> out_size,
+    bool align_corners) {
   CHECK_EQ(in_size.size(), out_size.size());
   int num_spatial_dims = in_size.size();
   ResizeConvolutionDims dims;
@@ -92,15 +95,32 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters(
       // entry before resizing.
       dims.stride[i] = dims.kernel_size[i] = 1;
     } else {
-      int64 gcd = MathUtil::GCD(static_cast<uint64>(in_size[i] - 1),
-                                static_cast<uint64>(out_size[i] - 1));
-      dims.stride[i] = (in_size[i] - 1) / gcd;
-      dims.kernel_size[i] = (out_size[i] - 1) / gcd;
+      // The scaling factor changes depending on the alignment of corners.
+      const int64 in_size_factor = align_corners ? in_size[i] - 1 : in_size[i];
+      const int64 out_size_factor =
+          align_corners ? out_size[i] - 1 : out_size[i];
+
+      int64 gcd = MathUtil::GCD(static_cast<uint64>(in_size_factor),
+                                static_cast<uint64>(out_size_factor));
+      dims.stride[i] = in_size_factor / gcd;
+      dims.kernel_size[i] = out_size_factor / gcd;
     }
   }
   return dims;
 }
 
+// The upper padding of the input needed by ConvGeneralDilated calls is
+// determined by solving two related relationships (assuming rhs_dilation == 0):
+// 1. dilated_input_dim = lower_padding + upper_padding
+//                        + lhs_dilation * (in_size - 1) + 1
+// 2. dilated_input_dim = (2 * dims.kernel-size - 1)
+//                        + dims.stride * (out_size - 1)
+int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size,
+                            int64 stride) {
+  return (2 * kernel_size - 1) + (out_size - 1) * stride - (kernel_size - 1) -
+         1 - (kernel_size * (in_size - 1));
+}
+
 // Form a 2D convolution kernel like:
 //       1 2 3 2 1
 //       2 4 6 4 2
@@ -171,7 +191,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
                                              const int num_spatial_dims,
                                              std::vector<int64> in_size,
                                              std::vector<int64> out_size,
-                                             const int64 channels) {
+                                             const int64 channels,
+                                             const bool align_corners) {
   // Picture for a 1x3 to 1x4 resize:
   // stride = 2, kernel size = 3
   // Input:
@@ -196,27 +217,82 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
   dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims);
 
   ResizeConvolutionDims dims =
-      ComputeResizeConvolutionParameters(in_size, out_size);
+      ComputeResizeConvolutionParameters(in_size, out_size, align_corners);
   xla::XlaOp output;
-  // Split convolutions into independent dimensions if they wmuld be a very
+
+  // Concatenation and padding below currently assumes num_spatial_dims is 2 to
+  // prevent needless code complexity.
+  CHECK_EQ(num_spatial_dims, 2)
+      << "ResizeUsingDilationAndConvolution pads only 2 dimensions currently.";
+  std::vector<int64> upper_padding(num_spatial_dims);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    upper_padding[i] = dims.kernel_size[i] - 1;
+  }
+  xla::XlaOp input_data = input;
+
+  if (!align_corners) {
+    // When Tensorflow does not align_corners, the resize indexing can access
+    // beyond the upper bound and is instead clamped to prevent out of bounds
+    // reads. This is conceptually the same as extending the edges of the input.
+    // We emulate this by copying the last row/column of the input.
+    // Calculate what padding would be needed then determine how far to extend
+    // the border before lhs dilation.
+    std::vector<int64> num_extended(num_spatial_dims);
+    upper_padding[0] = CalculateUpperPadding(
+        in_size[0], out_size[0], dims.kernel_size[0], dims.stride[0]);
+    upper_padding[1] = CalculateUpperPadding(
+        in_size[1], out_size[1], dims.kernel_size[1], dims.stride[1]);
+    num_extended[0] = upper_padding[0] / (dims.kernel_size[0]);
+    num_extended[1] = upper_padding[1] / (dims.kernel_size[1]);
+
+    if (num_extended[0] > 0) {
+      auto slice =
+          xla::Slice(input_data, {0, in_size[0] - 1, 0, 0},
+                     {1, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
+      for (int i = 0; i < num_extended[0]; i++) {
+        input_data = xla::ConcatInDim(builder, {input_data, slice}, 1);
+      }
+    }
+
+    if (num_extended[1] > 0) {
+      auto slice =
+          xla::Slice(input_data, {0, 0, in_size[1] - 1, 0},
+                     {1, in_size[0] + num_extended[0], in_size[1], channels},
+                     {1, 1, 1, 1});
+      for (int i = 0; i < num_extended[1]; i++) {
+        input_data = xla::ConcatInDim(builder, {input_data, slice}, 2);
+      }
+    }
+
+    // Setting in_size to (in_size + num_extended) due to the above Slice and
+    // ConcatInDim. Recalculate needed padding after the above Slice/Concat.
+    upper_padding[0] =
+        CalculateUpperPadding(in_size[0] + num_extended[0], out_size[0],
+                              dims.kernel_size[0], dims.stride[0]);
+    upper_padding[1] =
+        CalculateUpperPadding(in_size[1] + num_extended[1], out_size[1],
+                              dims.kernel_size[1], dims.stride[1]);
+  }
+
+  // Split convolutions into independent dimensions if they would be a very
   // large kernel.
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
     xla::XlaOp kernel =
         MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
-    output = xla::ConvGeneralDilated(
-        input, kernel, dims.stride,
-        /*padding=*/
-        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
-         {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
-        /*lhs_dilation=*/dims.kernel_size,
-        /*rhs_dilation=*/{1, 1}, dimension_numbers);
+    output =
+        xla::ConvGeneralDilated(input_data, kernel, dims.stride,
+                                /*padding=*/
+                                {{dims.kernel_size[0] - 1, upper_padding[0]},
+                                 {dims.kernel_size[1] - 1, upper_padding[1]}},
+                                /*lhs_dilation=*/dims.kernel_size,
+                                /*rhs_dilation=*/{1, 1}, dimension_numbers);
   } else {
     xla::XlaOp kernel0 =
         MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0);
     output = xla::ConvGeneralDilated(
-        input, kernel0, {dims.stride[0], 1},
+        input_data, kernel0, {dims.stride[0], 1},
         /*padding=*/
-        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, {0, 0}},
+        {{dims.kernel_size[0] - 1, upper_padding[0]}, {0, 0}},
         /*lhs_dilation=*/{dims.kernel_size[0], 1},
         /*rhs_dilation=*/{1, 1}, dimension_numbers);
     xla::XlaOp kernel1 =
@@ -224,7 +300,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
     output = xla::ConvGeneralDilated(
         output, kernel1, {1, dims.stride[1]},
         /*padding=*/
-        {{0, 0}, {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        {{0, 0}, {dims.kernel_size[1] - 1, upper_padding[1]}},
         /*lhs_dilation=*/{1, dims.kernel_size[1]},
         /*rhs_dilation=*/{1, 1}, dimension_numbers);
   }
@@ -245,9 +321,10 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
                                                    const int num_spatial_dims,
                                                    std::vector<int64> in_size,
                                                    std::vector<int64> grad_size,
-                                                   const int64 channels) {
+                                                   const int64 channels,
+                                                   const bool align_corners) {
   ResizeConvolutionDims dims =
-      ComputeResizeConvolutionParameters(in_size, grad_size);
+      ComputeResizeConvolutionParameters(in_size, grad_size, align_corners);
 
   // To form the backward convolution, we keep the kernel unchanged (it is
   // already symmetric) and swap the roles of strides and LHS dilation.
@@ -341,10 +418,6 @@ class ResizeBilinearOp : public XlaOpKernel {
  public:
   explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
-    OP_REQUIRES(
-        ctx, align_corners_ == true,
-        errors::Unimplemented(
-            "ResizeBilinear with align_corners=False is not yet implemented"));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -377,20 +450,19 @@ class ResizeBilinearOp : public XlaOpKernel {
 
     // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
     // dimension i.
-    std::vector<int64> slice_size = in_size;
     bool slice_input = false;
     for (int i = 0; i < num_spatial_dims; ++i) {
       if (in_size[i] > 1 && out_size[i] == 1) {
         // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
         // entry before resizing.
         slice_input = true;
-        slice_size[i] = 1;
+        in_size[i] = 1;
       }
     }
     if (slice_input) {
-      input = xla::Slice(input, {0, 0, 0, 0},
-                         {batch, slice_size[0], slice_size[1], channels},
-                         {1, 1, 1, 1});
+      input =
+          xla::Slice(input, {0, 0, 0, 0},
+                     {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
     }
 
     // Output is always type float.
@@ -406,6 +478,9 @@ class ResizeBilinearOp : public XlaOpKernel {
     // operations along different dimensions.
     // Given sufficient numerical stability and a<e<c and b<f<d, bilinear resize
     // from image of size axb -> cxd is same as resizing axb -> exf -> cxd.
+    // This does not work in the case of align_corners_=false because of special
+    // padding requirements that cause multiple resizes to be very different
+    // from a single resize.
     //
     // This makes the convolutions kernels smaller and the operation faster.
     xla::XlaOp output = input;
@@ -415,21 +490,24 @@ class ResizeBilinearOp : public XlaOpKernel {
             (static_cast<float>(out_size[0]) - 1) / ((in_size[0] - 1) * 2),
             (static_cast<float>(out_size[1]) - 1) / ((in_size[1] - 1) * 2)};
         if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) &&
-            k[0] > 1 && k[1] > 1) {
+            k[0] > 1 && k[1] > 1 && align_corners_) {
           std::vector<int64> next_out_size = {(in_size[0] - 1) * 2 + 1,
                                               (in_size[1] - 1) * 2 + 1};
-          output = ResizeUsingDilationAndConvolution(
-              b, input, num_spatial_dims, in_size, next_out_size, channels);
+          output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
+                                                     in_size, next_out_size,
+                                                     channels, align_corners_);
           input = output;
           in_size = next_out_size;
         } else {
-          output = ResizeUsingDilationAndConvolution(
-              b, input, num_spatial_dims, in_size, out_size, channels);
+          output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
+                                                     in_size, out_size,
+                                                     channels, align_corners_);
           in_size = out_size;
         }
       } else {
         output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
-                                                   in_size, out_size, channels);
+                                                   in_size, out_size, channels,
+                                                   align_corners_);
         in_size = out_size;
       }
     }
@@ -509,17 +587,20 @@ class ResizeBilinearGradOp : public XlaOpKernel {
           std::vector<int64> next_grad_size = {(in_size[0] - 1) * 2 + 1,
                                                (in_size[1] - 1) * 2 + 1};
           output = ResizeUsingDilationAndConvolutionGradOp(
-              b, grad, num_spatial_dims, in_size, next_grad_size, channels);
+              b, grad, num_spatial_dims, in_size, next_grad_size, channels,
+              align_corners_);
           grad = output;
           in_size = next_grad_size;
         } else {
           output = ResizeUsingDilationAndConvolutionGradOp(
-              b, grad, num_spatial_dims, in_size, grad_size, channels);
+              b, grad, num_spatial_dims, in_size, grad_size, channels,
+              align_corners_);
           in_size = grad_size;
         }
       } else {
         output = ResizeUsingDilationAndConvolutionGradOp(
-            b, grad, num_spatial_dims, in_size, grad_size, channels);
+            b, grad, num_spatial_dims, in_size, grad_size, channels,
+            align_corners_);
         in_size = grad_size;
       }
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index d4d180aff8..f6f158a73b 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -199,59 +199,6 @@ class MaxPool3DOp : public MaxPoolOp {
 };
 REGISTER_XLA_OP(Name("MaxPool3D"), MaxPool3DOp);
 
-// Divide each element of an image by the count of elements that contributed to
-// that element during pooling.
-static xla::XlaOp AvgPoolDivideByCount(
-    XlaOpKernelContext* ctx, const xla::XlaOp& output, DataType dtype,
-    const TensorShape& input_shape, xla::Padding padding,
-    const std::vector<int64>& ksize, const std::vector<int64>& stride,
-    int num_spatial_dims, TensorFormat data_format) {
-  if (padding == xla::Padding::kValid) {
-    // In VALID padding, all windows have the same number of elements
-    // contributing to each average. Divide by the window size everywhere to
-    // get the average.
-    int64 window_size = std::accumulate(ksize.begin(), ksize.end(), 1,
-                                        [](int64 a, int64 b) { return a * b; });
-
-    auto divisor =
-        XlaHelpers::IntegerLiteral(ctx->builder(), dtype, window_size);
-    return xla::Div(output, divisor);
-  } else {
-    // For SAME padding, the padding shouldn't be included in the
-    // counts. We use another ReduceWindow to find the right counts.
-
-    // TODO(phawkins): use a less brute-force way to compute this. Only
-    // the boundary regions will have interesting values here.
-
-    std::vector<int64> input_dim_sizes(num_spatial_dims);
-    std::vector<int64> window_dims(num_spatial_dims);
-    std::vector<int64> window_ksize(num_spatial_dims);
-    std::vector<int64> window_stride(num_spatial_dims);
-    for (int i = 0; i < num_spatial_dims; ++i) {
-      int dim = GetTensorSpatialDimIndex(num_spatial_dims + 2, data_format, i);
-      input_dim_sizes[i] = input_shape.dim_size(dim);
-      window_dims[i] = dim;
-      window_ksize[i] = ksize[dim];
-      window_stride[i] = stride[dim];
-    }
-
-    // Build a matrix of all 1s, with the same width/height as the input.
-    const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype);
-    auto ones = xla::Broadcast(
-        XlaHelpers::One(ctx->builder(), accumulation_type), input_dim_sizes);
-
-    // Perform a ReduceWindow with the same window size, strides, and padding
-    // to count the number of contributions to each result element.
-    auto reduce = xla::ReduceWindow(
-        ones, XlaHelpers::Zero(ctx->builder(), accumulation_type),
-        *ctx->GetOrCreateAdd(accumulation_type), window_ksize, window_stride,
-        xla::Padding::kSame);
-    auto counts = XlaHelpers::ConvertElementType(ctx->builder(), reduce, dtype);
-
-    return xla::Div(output, counts, window_dims);
-  }
-}
-
 class AvgPoolOp : public PoolingOp {
  public:
   AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
@@ -463,78 +410,31 @@ class AvgPoolGradOp : public XlaOpKernel {
                 errors::InvalidArgument("out_backprop must be ", num_dims(),
                                         "-dimensional"));
 
-    int depth_dim = GetTensorFeatureDimIndex(num_dims(), data_format_);
-    int64 depth = out_backprop_shape.dim_size(depth_dim);
-
-    // We can think of average-pooling as:
-    // * a convolution with a kernel consisting entirely of 1s, where the
-    //   input feature and output feature are equal, and 0s everywhere else.
-    // * followed by dividing by the counts.
-    //
-    // This then gives us an algorithm to build the gradient:
-    // * divide out_backprop by the counts, followed by
-    // * Conv2DBackpropInput specialized for that kernel, which simplifies to
-    //   a Pad and a ReduceWindow.
-    //
-    // For an explanation of backpropagation for convolution, see the comments
-    // in third_party/tensorflow/core/kernels/conv_grad_ops.h
-
-    // TF filter shape is [ H, W, ..., inC, outC ]
-    std::vector<int64> filter_dims(num_dims());
-    for (int i = 0; i < num_spatial_dims_; ++i) {
-      int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
-      filter_dims[i] = ksize_[dim];
-    }
-    filter_dims[num_dims() - 2] = depth;
-    filter_dims[num_dims() - 1] = depth;
-    TensorShape filter_shape(filter_dims);
-
-    // Reuse the logic from Conv2DBackpropInput to compute padding.
-    ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(
-        ctx, ConvBackpropComputeDimensions(
-                 type_string(), /*num_spatial_dims=*/num_spatial_dims_,
-                 gradients_shape, filter_shape, out_backprop_shape, stride_,
-                 padding_, data_format_, &dims));
-
-    // The input gradients are computed by a convolution of the output gradients
-    // and the filter, with some appropriate padding. See the comment at the top
-    // of conv_grad_ops.h for details.
-    xla::XlaBuilder* const b = ctx->builder();
     auto out_backprop = ctx->Input(1);
-    auto dtype = input_type(1);
+    std::vector<int64> stride_int64s(stride_.begin(), stride_.end());
     xla::Padding xla_padding =
         (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame;
-
-    // Divide the out_backprop values by the counts for each spatial position.
-    std::vector<int64> stride_int64s(stride_.begin(), stride_.end());
-    auto out_backprop_div = AvgPoolDivideByCount(
-        ctx, out_backprop, dtype, gradients_shape, xla_padding, ksize_,
-        stride_int64s, num_spatial_dims_, data_format_);
-
-    // Pad the gradients in the spatial dimensions. We use the same padding
-    // as Conv2DBackpropInput.
-    xla::PaddingConfig padding_config = xla::MakeNoPaddingConfig(num_dims());
-    for (int i = 0; i < num_spatial_dims_; ++i) {
-      int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
-      auto* padding = padding_config.mutable_dimensions(dim);
-      padding->set_edge_padding_low(dims.spatial_dims[i].pad_before);
-      padding->set_edge_padding_high(dims.spatial_dims[i].pad_after);
-      padding->set_interior_padding(dims.spatial_dims[i].stride - 1);
-    }
-
-    auto zero = XlaHelpers::Zero(b, dtype);
-    auto padded_gradients = xla::Pad(out_backprop_div, zero, padding_config);
-
-    // in_backprop = padded_gradients <conv> ones
-    std::vector<int64> ones(num_dims(), 1LL);
-    auto accumulation_type = XlaHelpers::SumAccumulationType(dtype);
-    auto in_backprop = xla::ReduceWindow(
-        XlaHelpers::ConvertElementType(b, padded_gradients, accumulation_type),
-        XlaHelpers::Zero(b, accumulation_type),
-        *ctx->GetOrCreateAdd(accumulation_type), ksize_,
-        /* window_strides=*/ones, xla::Padding::kValid);
-    ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, in_backprop, dtype));
+    xla::PrimitiveType xla_reduction_type;
+    auto reduction_type = XlaHelpers::SumAccumulationType(ctx->input_type(1));
+    OP_REQUIRES_OK(
+        ctx, DataTypeToPrimitiveType(reduction_type, &xla_reduction_type));
+    auto converted_out_backprop =
+        xla::ConvertElementType(out_backprop, xla_reduction_type);
+    auto xla_data_format =
+        XlaTensorFormat(data_format_, gradients_shape.dims() - 2);
+    auto padding_values =
+        MakeSpatialPadding(gradients_shape.dim_sizes(), ksize_, stride_int64s,
+                           xla_padding, xla_data_format);
+    auto in_backprop =
+        xla::AvgPoolGrad(converted_out_backprop, gradients_shape.dim_sizes(),
+                         ksize_, stride_int64s, padding_values, xla_data_format,
+                         /*counts_include_padding=*/padding_ == VALID);
+    // Convert the pooling result back to the input type before returning it.
+    xla::PrimitiveType xla_out_backprop_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(1),
+                                                &xla_out_backprop_type));
+    ctx->SetOutput(0,
+                   xla::ConvertElementType(in_backprop, xla_out_backprop_type));
   }
 
  protected:
diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
index b11a4ce36d..8102faad28 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
@@ -32,41 +32,30 @@ class ReduceWindowOp : public XlaOpKernel {
   explicit ReduceWindowOp(OpKernelConstruction* context)
       : XlaOpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("computation", &computation_));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("window_dimensions", &window_dimensions_));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("window_strides", &window_strides_));
-    OP_REQUIRES_OK(context, context->GetAttr("padding_low", &padding_low_));
-    OP_REQUIRES_OK(context, context->GetAttr("padding_high", &padding_high_));
   }
 
   void Compile(XlaOpKernelContext* context) override {
     const TensorShape input_shape = context->InputShape(0);
     const DataType dtype = context->input_type(0);
 
+    std::vector<int64> window_dimensions;
+    std::vector<int64> window_strides;
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector(
+                                "window_dimensions", &window_dimensions));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("window_strides",
+                                                              &window_strides));
+
     const int rank = input_shape.dims();
-    OP_REQUIRES(context, rank == window_dimensions_.size(),
+    OP_REQUIRES(context, rank == window_dimensions.size(),
                 errors::InvalidArgument(
                     "The size of window_dimensions must be equal to the input "
                     "rank (",
-                    window_dimensions_.size(), " vs. ", rank, ")"));
-    OP_REQUIRES(context, rank == window_strides_.size(),
+                    window_dimensions.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == window_strides.size(),
                 errors::InvalidArgument(
                     "The size of window_strides must be equal to the input "
                     "rank (",
-                    window_strides_.size(), " vs. ", rank, ")"));
-    OP_REQUIRES(context, rank == padding_low_.size(),
-                errors::InvalidArgument(
-                    "The size of padding_low must be equal to the input "
-                    "rank (",
-                    padding_low_.size(), " vs. ", rank, ")"));
-    OP_REQUIRES(context, rank == padding_high_.size(),
-                errors::InvalidArgument(
-                    "The size of padding_high must be equal to the input "
-                    "rank (",
-                    padding_high_.size(), " vs. ", rank, ")"));
-
-    xla::XlaBuilder* builder = context->builder();
+                    window_strides.size(), " vs. ", rank, ")"));
 
     // Build the reducer function.
     XlaCompiler::Argument reducer_arg;
@@ -78,6 +67,7 @@ class ReduceWindowOp : public XlaOpKernel {
     compile_options.use_tuple_arg = false;
     compile_options.resolve_compile_time_constants = false;
     compile_options.is_entry_computation = false;
+    compile_options.always_return_tuple = false;
     XlaCompiler::CompilationResult reducer;
     OP_REQUIRES_OK(context, context->compiler()->CompileFunction(
                                 compile_options, *computation_,
@@ -86,51 +76,47 @@ class ReduceWindowOp : public XlaOpKernel {
     xla::Shape scalar_shape;
     OP_REQUIRES_OK(context,
                    TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape));
+    OP_REQUIRES(
+        context,
+        xla::ShapeUtil::Compatible(reducer.xla_output_shape, scalar_shape),
+        errors::InvalidArgument(
+            "Invalid output shape of ReduceWindow reducer. Expected ",
+            xla::ShapeUtil::HumanString(scalar_shape), " got ",
+            xla::ShapeUtil::HumanString(reducer.xla_output_shape)));
+
+    const TensorShape padding_shape = context->InputShape("padding");
     OP_REQUIRES(context,
-                xla::ShapeUtil::Compatible(
-                    reducer.xla_output_shape,
-                    xla::ShapeUtil::MakeTupleShape({scalar_shape})),
+                TensorShapeUtils::IsMatrix(padding_shape) &&
+                    padding_shape.dim_size(1) == 2,
                 errors::InvalidArgument(
-                    "Invalid output shape of ReduceWindow reducer. Expected ",
-                    xla::ShapeUtil::HumanString(scalar_shape), " got ",
-                    xla::ShapeUtil::HumanString(reducer.xla_output_shape)));
-
-    // Wraps the reducer in a computation that unpacks the output tuple.
-    xla::XlaComputation wrapper;
-    {
-      std::unique_ptr<xla::XlaBuilder> cb =
-          builder->CreateSubBuilder("wrapper");
-      auto x = xla::Parameter(cb.get(), 0, scalar_shape, "x");
-      auto y = xla::Parameter(cb.get(), 1, scalar_shape, "y");
-      auto outputs = xla::Call(cb.get(), *reducer.computation, {x, y});
-      xla::GetTupleElement(outputs, 0);
-      xla::StatusOr<xla::XlaComputation> result = cb->Build();
-      OP_REQUIRES_OK(context, result.status());
-      wrapper = std::move(result.ValueOrDie());
-    }
-
-    std::vector<std::pair<int64, int64>> padding(rank);
-    for (int i = 0; i < rank; ++i) {
-      padding[i] = {padding_low_[i], padding_high_[i]};
+                    "padding must be a matrix with minor dimension 2, got ",
+                    padding_shape.DebugString()));
+    xla::Literal padding_literal;
+    OP_REQUIRES_OK(context, context->ConstantInputAsInt64Literal(
+                                "padding", &padding_literal));
+    std::vector<std::pair<int64, int64>> padding(padding_shape.dim_size(0));
+    for (int i = 0; i < padding.size(); ++i) {
+      padding[i] = {padding_literal.Get<int64>({i, 0}),
+                    padding_literal.Get<int64>({i, 1})};
     }
 
     xla::XlaOp output = xla::ReduceWindowWithGeneralPadding(
-        context->Input(0), context->Input(1), wrapper, window_dimensions_,
-        window_strides_, padding);
+        context->Input(0), context->Input(1), *reducer.computation,
+        window_dimensions, window_strides, padding);
     context->SetOutput(0, output);
   }
 
  private:
   const NameAttrList* computation_;
-  std::vector<int64> window_dimensions_;
-  std::vector<int64> window_strides_;
-  std::vector<int64> padding_low_;
-  std::vector<int64> padding_high_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(ReduceWindowOp);
 };
 
-REGISTER_XLA_OP(Name("XlaReduceWindow"), ReduceWindowOp);
+REGISTER_XLA_OP(Name("XlaReduceWindow")
+                    .CompileTimeConstInput("window_dimensions")
+                    .CompileTimeConstInput("window_strides")
+                    .CompileTimeConstInput("padding"),
+                ReduceWindowOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index b52f0a0ab6..598248563b 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // XLA-specific reduction Ops.
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/tf2xla/kernels/reduction_ops.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
@@ -29,9 +30,6 @@ namespace tensorflow {
 XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx,
                                DataType reduction_type)
     : XlaOpKernel(ctx), reduction_type_(reduction_type) {
-  const DataType dt = BaseType(input_type(0));
-  OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, DT_INT32}, {dt}));
-
   OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_));
   OP_REQUIRES_OK(
       ctx, DataTypeToPrimitiveType(reduction_type_, &xla_reduction_type_));
@@ -58,20 +56,24 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
     return;
   }
 
+  OP_REQUIRES(ctx, axes_tensor_shape.dims() <= 1,
+              errors::InvalidArgument(
+                  "Expected scalar or vector as index argument, got ",
+                  axes_tensor_shape.DebugString()));
+
   // Evaluate the constant, reshaping to a 1-vector if it is a scalar.
+  std::vector<int64> axes;
   xla::Literal axes_literal;
-  OP_REQUIRES_OK(
-      ctx, ctx->ConstantInputReshaped(1, {axes_tensor_shape.num_elements()},
-                                      &axes_literal));
+  OP_REQUIRES_OK(ctx, ctx->ConstantInputReshapedToIntVector(1, &axes));
 
   VLOG(1) << "data shape: " << data_shape.DebugString();
-  VLOG(1) << "axes      : " << axes_literal.ToString();
+  VLOG(1) << "axes      : " << absl::StrJoin(axes, ",");
 
   gtl::InlinedVector<bool, 4> bitmap(data_shape.dims(), false);
   std::vector<int64> xla_axes;
   int64 num_elements_reduced = 1LL;
   for (int64 i = 0; i < axes_tensor_shape.num_elements(); ++i) {
-    int32 index = axes_literal.Get<int>({i});
+    int64 index = axes[i];
     OP_REQUIRES(ctx,
                 !(index < -data_shape.dims() || index >= data_shape.dims()),
                 errors::InvalidArgument("Invalid reduction dimension (", index,
diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
index 121750a82a..366ce42866 100644
--- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
@@ -41,8 +41,8 @@ class ReshapeOp : public XlaOpKernel {
                                         sizes_shape.DebugString()));
     const int64 num_dims = sizes_shape.num_elements();
 
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &literal));
+    std::vector<int64> shape_input;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &shape_input));
 
     // Compute the output shape.  Determine product of specified
     // dimensions, and find the index of the unspecified one if there
@@ -51,7 +51,7 @@ class ReshapeOp : public XlaOpKernel {
     int64 product = 1;
     int unknown_index = -1;
     for (int d = 0; d < num_dims; ++d) {
-      const int32 size = literal.Get<int>({d});
+      const int32 size = shape_input[d];
       if (size == -1) {
         OP_REQUIRES(
             ctx, unknown_index == -1,
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
index d962ef4a5f..c0afccaa5b 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
@@ -95,10 +95,24 @@ class ReverseV2Op : public XlaOpKernel {
     std::vector<int64> axes;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &axes));
 
+    // witnessed_axes is used to ensure that the same axis is not marked to be
+    // reversed multiple times.
+    gtl::InlinedVector<bool, 8> witnessed_axes(x_shape.dims(), false);
+
     for (int d = 0; d < axes.size(); ++d) {
-      OP_REQUIRES(ctx, (0 <= axes[d]) && (axes[d] < x_shape.dims()),
-                  errors::InvalidArgument(axes[d], " is out of range [0, ",
-                                          x_shape.dims(), ")."));
+      OP_REQUIRES(
+          ctx, (-x_shape.dims() <= axes[d]) && (axes[d] < x_shape.dims()),
+          errors::InvalidArgument(axes[d], " is out of range [-",
+                                  x_shape.dims(), ", ", x_shape.dims(), ")."));
+      // Axes can be negative and are shifted to the canonical index before
+      // being lowered to HLO.
+      if (axes[d] < 0) {
+        axes[d] += x_shape.dims();
+      }
+      OP_REQUIRES(ctx, !witnessed_axes[axes[d]],
+                  errors::InvalidArgument("canonicalized axis ", axes[d],
+                                          " was repeated."));
+      witnessed_axes[axes[d]] = true;
     }
 
     ctx->SetOutput(0, xla::Rev(ctx->Input(0), axes));
diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
index 025ba82741..d6bd927135 100644
--- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // XLA-specific Ops for softmax.
 
+#include "absl/strings/match.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace {
@@ -33,7 +33,7 @@ namespace {
 class SoftmaxOp : public XlaOpKernel {
  public:
   explicit SoftmaxOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    log_ = str_util::StartsWith(type_string(), "Log");
+    log_ = absl::StartsWith(type_string(), "Log");
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index 1233a37565..2c7213f322 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -70,7 +70,7 @@ class TileOp : public XlaOpKernel {
     bool one_dimension_is_broadcasted_without_multiple = true;
     for (int i = 0; i < input_dims; ++i) {
       int multiple = literal.Get<int>({i});
-      OP_REQUIRES(ctx, multiple,
+      OP_REQUIRES(ctx, multiple >= 0,
                   errors::InvalidArgument("Expected multiples[", i,
                                           "] >= 0, but got ", multiple));
       int64 new_dim = input_shape.dim_size(i) * multiple;
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
new file mode 100644
index 0000000000..412afeaaad
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
@@ -0,0 +1,115 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaBroadcastHelperOp : public XlaOpKernel {
+ public:
+  explicit XlaBroadcastHelperOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    xla::XlaOp lhs = context->Input(0);
+    xla::XlaOp rhs = context->Input(1);
+    const TensorShape lhs_shape = context->InputShape(0);
+    const TensorShape rhs_shape = context->InputShape(1);
+
+    const bool broadcast_lhs = lhs_shape.dims() < rhs_shape.dims();
+    const TensorShape* min_rank_shape = broadcast_lhs ? &lhs_shape : &rhs_shape;
+    const TensorShape* max_rank_shape = broadcast_lhs ? &rhs_shape : &lhs_shape;
+
+    std::vector<int64> broadcast_dims;
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("broadcast_dims",
+                                                              &broadcast_dims));
+    if (broadcast_dims.empty()) {
+      OP_REQUIRES(
+          context,
+          lhs_shape.dims() == rhs_shape.dims() || lhs_shape.dims() == 0 ||
+              rhs_shape.dims() == 0,
+          errors::InvalidArgument(
+              "If broadcast_dims is empty, both "
+              "arguments must have equal rank; "
+              "argument shapes, or at least one argument must be a scalar: ",
+              lhs_shape.DebugString(), " and ", rhs_shape.DebugString()));
+      context->SetOutput(0, lhs);
+      context->SetOutput(1, rhs);
+      return;
+    }
+
+    OP_REQUIRES(
+        context, broadcast_dims.size() == min_rank_shape->dims(),
+        errors::InvalidArgument(
+            "broadcast_dims must have size equal to the smaller argument rank; "
+            "broadcast_dims: [",
+            absl::StrJoin(broadcast_dims, ","), "]; argument shapes: ",
+            lhs_shape.DebugString(), " and ", rhs_shape.DebugString()));
+    std::vector<int64> sorted_broadcast_dims = broadcast_dims;
+    absl::c_sort(sorted_broadcast_dims);
+    std::set<int64> dims_set(broadcast_dims.begin(), broadcast_dims.end());
+    OP_REQUIRES(context,
+                dims_set.size() == broadcast_dims.size() &&
+                    broadcast_dims == sorted_broadcast_dims,
+                errors::InvalidArgument(
+                    "Duplicate or nonmonotonic dimension in broadcast_dims; "
+                    "broadcast_dims: [",
+                    absl::StrJoin(broadcast_dims, ","), "]"));
+
+    std::vector<int64> broadcast_shape(max_rank_shape->dims(), 1LL);
+    for (int i = 0; i < broadcast_dims.size(); ++i) {
+      const int dim = broadcast_dims[i];
+      OP_REQUIRES(
+          context, dim >= 0 && dim < broadcast_shape.size(),
+          errors::InvalidArgument(
+              "Invalid broadcast dimension (", dim, "); broadcast_dims: [",
+              absl::StrJoin(broadcast_dims, ","), "]; argument shapes: ",
+              lhs_shape.DebugString(), " and ", rhs_shape.DebugString()));
+      broadcast_shape[dim] = min_rank_shape->dim_size(i);
+    }
+    xla::PrimitiveType type = context->input_xla_type(0);
+    xla::Shape broadcast_xla_shape =
+        xla::ShapeUtil::MakeShape(type, broadcast_shape);
+    if (broadcast_lhs) {
+      lhs = xla::BroadcastInDim(lhs, broadcast_xla_shape, broadcast_dims);
+    } else {
+      rhs = xla::BroadcastInDim(rhs, broadcast_xla_shape, broadcast_dims);
+    }
+    context->SetOutput(0, lhs);
+    context->SetOutput(1, rhs);
+  }
+
+ private:
+  xla::DotDimensionNumbers dnums_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaBroadcastHelperOp);
+};
+
+REGISTER_XLA_OP(
+    Name("XlaBroadcastHelper").CompileTimeConstInput("broadcast_dims"),
+    XlaBroadcastHelperOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
new file mode 100644
index 0000000000..8848623868
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
@@ -0,0 +1,101 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaConvOp : public XlaOpKernel {
+ public:
+  explicit XlaConvOp(OpKernelConstruction* context) : XlaOpKernel(context) {
+    string dnums_attr;
+    OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr));
+    OP_REQUIRES(
+        context, dnums_.ParsePartialFromString(dnums_attr),
+        errors::InvalidArgument("Error parsing convolution dimension numbers"));
+    string precision_config_attr;
+    OP_REQUIRES_OK(
+        context, context->GetAttr("precision_config", &precision_config_attr));
+    OP_REQUIRES(
+        context,
+        precision_config_.ParsePartialFromString(precision_config_attr),
+        errors::InvalidArgument("Error parsing convolution dimension numbers"));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape lhs_shape = context->InputShape(0);
+    const TensorShape rhs_shape = context->InputShape(1);
+    const TensorShape padding_shape = context->InputShape("padding");
+    std::vector<int64> window_strides;
+    std::vector<int64> lhs_dilation;
+    std::vector<int64> rhs_dilation;
+    int64 feature_group_count;
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("window_strides",
+                                                              &window_strides));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("lhs_dilation",
+                                                              &lhs_dilation));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("rhs_dilation",
+                                                              &rhs_dilation));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(
+                                "feature_group_count", &feature_group_count));
+
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(padding_shape) &&
+                    padding_shape.dim_size(1) == 2,
+                errors::InvalidArgument(
+                    "padding must be a matrix with minor dimension 2, got ",
+                    padding_shape.DebugString()));
+    xla::Literal padding_literal;
+    OP_REQUIRES_OK(context, context->ConstantInputAsInt64Literal(
+                                "padding", &padding_literal));
+    std::vector<std::pair<int64, int64>> padding(padding_shape.dim_size(0));
+    for (int i = 0; i < padding.size(); ++i) {
+      padding[i] = {padding_literal.Get<int64>({i, 0}),
+                    padding_literal.Get<int64>({i, 1})};
+    }
+
+    // We do only minimal checking, relying on XLA to check the shape
+    // invariants.
+    xla::XlaOp output = xla::ConvGeneralDilated(
+        context->Input(0), context->Input(1), window_strides, padding,
+        lhs_dilation, rhs_dilation, dnums_, feature_group_count,
+        &precision_config_);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  xla::ConvolutionDimensionNumbers dnums_;
+  xla::PrecisionConfigProto precision_config_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaConvOp);
+};
+
+REGISTER_XLA_OP(Name("XlaConv")
+                    .CompileTimeConstInput("window_strides")
+                    .CompileTimeConstInput("lhs_dilation")
+                    .CompileTimeConstInput("rhs_dilation")
+                    .CompileTimeConstInput("feature_group_count")
+                    .CompileTimeConstInput("padding"),
+                XlaConvOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc
new file mode 100644
index 0000000000..2fed53e5c0
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaDotOp : public XlaOpKernel {
+ public:
+  explicit XlaDotOp(OpKernelConstruction* context) : XlaOpKernel(context) {
+    string dnums_attr;
+    OP_REQUIRES_OK(context, context->GetAttr("dimension_numbers", &dnums_attr));
+    OP_REQUIRES(
+        context, dnums_.ParsePartialFromString(dnums_attr),
+        errors::InvalidArgument("Error parsing convolution dimension numbers"));
+    string precision_config_attr;
+    OP_REQUIRES_OK(
+        context, context->GetAttr("precision_config", &precision_config_attr));
+    OP_REQUIRES(
+        context,
+        precision_config_.ParsePartialFromString(precision_config_attr),
+        errors::InvalidArgument("Error parsing convolution dimension numbers"));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape lhs_shape = context->InputShape(0);
+    const TensorShape rhs_shape = context->InputShape(1);
+
+    // We do only minimal checking, relying on XLA to check the shape
+    // invariants.
+    xla::XlaOp output = xla::DotGeneral(context->Input(0), context->Input(1),
+                                        dnums_, &precision_config_);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  xla::DotDimensionNumbers dnums_;
+  xla::PrecisionConfigProto precision_config_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaDotOp);
+};
+
+REGISTER_XLA_OP(Name("XlaDot"), XlaDotOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc
new file mode 100644
index 0000000000..59502d83c7
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc
@@ -0,0 +1,105 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaPadOp : public XlaOpKernel {
+ public:
+  explicit XlaPadOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape("input");
+    const TensorShape padding_value_shape =
+        context->InputShape("padding_value");
+
+    std::vector<int64> padding_low;
+    std::vector<int64> padding_high;
+    std::vector<int64> padding_interior;
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("padding_low",
+                                                              &padding_low));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("padding_high",
+                                                              &padding_high));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector(
+                                "padding_interior", &padding_interior));
+
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(padding_value_shape),
+                errors::InvalidArgument("padding_value must be a scalar"));
+    const int rank = input_shape.dims();
+    OP_REQUIRES(context, rank == padding_low.size(),
+                errors::InvalidArgument(
+                    "The size of padding_low must be equal to the input "
+                    "rank (",
+                    padding_low.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == padding_high.size(),
+                errors::InvalidArgument(
+                    "The size of padding_high must be equal to the input "
+                    "rank (",
+                    padding_high.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == padding_interior.size(),
+                errors::InvalidArgument(
+                    "The size of padding_interior must be equal to the input "
+                    "rank (",
+                    padding_interior.size(), " vs. ", rank, ")"));
+
+    auto non_negative = [](int64 x) { return x >= 0; };
+    OP_REQUIRES(
+        context, absl::c_all_of(padding_low, non_negative),
+        errors::InvalidArgument("padding_low must be non-negative, got [",
+                                absl::StrJoin(padding_low, ","), "]"));
+    OP_REQUIRES(
+        context, absl::c_all_of(padding_high, non_negative),
+        errors::InvalidArgument("padding_high must be non-negative, got [",
+                                absl::StrJoin(padding_high, ","), "]"));
+    OP_REQUIRES(
+        context, absl::c_all_of(padding_interior, non_negative),
+        errors::InvalidArgument("padding_interior must be non-negative, got [",
+                                absl::StrJoin(padding_interior, ","), "]"));
+
+    xla::PaddingConfig padding_config;
+    for (int i = 0; i < rank; ++i) {
+      auto* dim = padding_config.add_dimensions();
+      dim->set_edge_padding_low(padding_low[i]);
+      dim->set_edge_padding_high(padding_high[i]);
+      dim->set_interior_padding(padding_interior[i]);
+    }
+
+    xla::XlaOp output =
+        xla::Pad(context->Input("input"), context->Input("padding_value"),
+                 padding_config);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaPadOp);
+};
+
+REGISTER_XLA_OP(Name("XlaPad")
+                    .CompileTimeConstInput("padding_low")
+                    .CompileTimeConstInput("padding_high")
+                    .CompileTimeConstInput("padding_interior"),
+                XlaPadOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc
new file mode 100644
index 0000000000..fc2425f37b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaReduceOp : public XlaOpKernel {
+ public:
+  explicit XlaReduceOp(OpKernelConstruction* context) : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("reducer", &reducer_));
+    OP_REQUIRES_OK(context, context->GetAttr("dimensions_to_reduce",
+                                             &dimensions_to_reduce_));
+    std::set<int64> dims_set(dimensions_to_reduce_.begin(),
+                             dimensions_to_reduce_.end());
+    OP_REQUIRES(
+        context, dims_set.size() == dimensions_to_reduce_.size(),
+        errors::InvalidArgument("Duplicate dimension in dimensions_to_reduce "
+                                "argument to XlaReduce"));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape("input");
+    const TensorShape init_value_shape = context->InputShape("init_value");
+    const DataType dtype = context->input_type(0);
+
+    const int rank = input_shape.dims();
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(init_value_shape),
+                errors::InvalidArgument("init_value must be a scalar"));
+
+    auto dim_in_range = [rank](int64 dim) { return dim >= 0 && dim < rank; };
+    OP_REQUIRES(context,
+                rank >= dimensions_to_reduce_.size() &&
+                    absl::c_all_of(dimensions_to_reduce_, dim_in_range),
+                errors::InvalidArgument(
+                    "Invalid dimensions_to_reduce argument to XlaReduce"));
+
+    // Build the reducer function.
+    XlaCompiler::Argument reducer_arg;
+    reducer_arg.kind = XlaCompiler::Argument::kParameter;
+    reducer_arg.type = dtype;
+    reducer_arg.shape = TensorShape();
+
+    XlaCompiler::CompileOptions compile_options;
+    compile_options.use_tuple_arg = false;
+    compile_options.always_return_tuple = false;
+    compile_options.resolve_compile_time_constants = false;
+    compile_options.is_entry_computation = false;
+    XlaCompiler::CompilationResult reducer;
+    OP_REQUIRES_OK(context, context->compiler()->CompileFunction(
+                                compile_options, *reducer_,
+                                {reducer_arg, reducer_arg}, &reducer));
+
+    xla::Shape scalar_shape;
+    OP_REQUIRES_OK(context,
+                   TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape));
+    OP_REQUIRES(
+        context,
+        xla::ShapeUtil::Compatible(reducer.xla_output_shape, scalar_shape),
+        errors::InvalidArgument(
+            "Invalid output shape of XlaReduce reducer. Expected ",
+            xla::ShapeUtil::HumanString(scalar_shape), " got ",
+            xla::ShapeUtil::HumanString(reducer.xla_output_shape)));
+
+    xla::XlaOp output =
+        xla::Reduce(context->Input("input"), context->Input("init_value"),
+                    *reducer.computation, dimensions_to_reduce_);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  const NameAttrList* reducer_;
+  std::vector<int64> dimensions_to_reduce_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaReduceOp);
+};
+
+REGISTER_XLA_OP(Name("XlaReduce"), XlaReduceOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc
new file mode 100644
index 0000000000..089776fcf7
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc
@@ -0,0 +1,147 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/while_op.h"
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaSelectAndScatterOp : public XlaOpKernel {
+ public:
+  explicit XlaSelectAndScatterOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("select", &select_computation_));
+    OP_REQUIRES_OK(context, context->GetAttr("scatter", &scatter_computation_));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const DataType dtype = context->input_type(0);
+
+    std::vector<int64> window_dimensions;
+    std::vector<int64> window_strides;
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector(
+                                "window_dimensions", &window_dimensions));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("window_strides",
+                                                              &window_strides));
+
+    const int rank = input_shape.dims();
+    OP_REQUIRES(context, rank == window_dimensions.size(),
+                errors::InvalidArgument(
+                    "The size of window_dimensions must be equal to the input "
+                    "rank (",
+                    window_dimensions.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == window_strides.size(),
+                errors::InvalidArgument(
+                    "The size of window_strides must be equal to the input "
+                    "rank (",
+                    window_strides.size(), " vs. ", rank, ")"));
+
+    XlaCompiler::CompileOptions compile_options;
+    compile_options.use_tuple_arg = false;
+    compile_options.resolve_compile_time_constants = false;
+    compile_options.is_entry_computation = false;
+    compile_options.always_return_tuple = false;
+
+    // Build the select function.
+    XlaCompiler::Argument select_arg;
+    select_arg.kind = XlaCompiler::Argument::kParameter;
+    select_arg.type = dtype;
+    select_arg.shape = TensorShape();
+
+    XlaCompiler::CompilationResult select;
+    OP_REQUIRES_OK(context, context->compiler()->CompileFunction(
+                                compile_options, *select_computation_,
+                                {select_arg, select_arg}, &select));
+
+    xla::Shape select_output_shape = xla::ShapeUtil::MakeShape(xla::PRED, {});
+    OP_REQUIRES(
+        context,
+        xla::ShapeUtil::Compatible(select.xla_output_shape,
+                                   select_output_shape),
+        errors::InvalidArgument(
+            "Invalid output shape of XlaSelectAndScatter select. Expected ",
+            xla::ShapeUtil::HumanString(select_output_shape), " got ",
+            xla::ShapeUtil::HumanString(select.xla_output_shape)));
+
+    // Build the scatter function.
+    XlaCompiler::Argument scatter_arg;
+    scatter_arg.kind = XlaCompiler::Argument::kParameter;
+    scatter_arg.type = dtype;
+    scatter_arg.shape = TensorShape();
+
+    XlaCompiler::CompilationResult scatter;
+    OP_REQUIRES_OK(context, context->compiler()->CompileFunction(
+                                compile_options, *scatter_computation_,
+                                {scatter_arg, scatter_arg}, &scatter));
+
+    xla::Shape scalar_shape;
+    OP_REQUIRES_OK(context,
+                   TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape));
+    OP_REQUIRES(
+        context,
+        xla::ShapeUtil::Compatible(scatter.xla_output_shape, scalar_shape),
+        errors::InvalidArgument(
+            "Invalid output shape of scatter. Expected ",
+            xla::ShapeUtil::HumanString(scalar_shape), " got ",
+            xla::ShapeUtil::HumanString(scatter.xla_output_shape)));
+
+    const TensorShape padding_shape = context->InputShape("padding");
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(padding_shape) &&
+                    padding_shape.dim_size(1) == 2,
+                errors::InvalidArgument(
+                    "padding must be a matrix with minor dimension 2, got ",
+                    padding_shape.DebugString()));
+    xla::Literal padding_literal;
+    OP_REQUIRES_OK(context, context->ConstantInputAsInt64Literal(
+                                "padding", &padding_literal));
+    std::vector<std::pair<int64, int64>> padding(padding_shape.dim_size(0));
+    for (int i = 0; i < padding.size(); ++i) {
+      padding[i] = {padding_literal.Get<int64>({i, 0}),
+                    padding_literal.Get<int64>({i, 1})};
+    }
+
+    xla::XlaOp output = xla::SelectAndScatterWithGeneralPadding(
+        context->Input("operand"), *select.computation, window_dimensions,
+        window_strides, padding, context->Input("source"),
+        context->Input("init_value"), *scatter.computation);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  const NameAttrList* select_computation_;
+  const NameAttrList* scatter_computation_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaSelectAndScatterOp);
+};
+
+REGISTER_XLA_OP(Name("XlaSelectAndScatter")
+                    .CompileTimeConstInput("window_dimensions")
+                    .CompileTimeConstInput("window_strides")
+                    .CompileTimeConstInput("padding"),
+                XlaSelectAndScatterOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index cb7a40e23d..99511e9914 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -25,8 +25,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -44,8 +44,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/core:lib",
     ],
@@ -78,8 +78,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:math",
@@ -119,6 +119,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:constants",
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
index f666d22ea4..d8c050d09e 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -27,7 +27,8 @@ limitations under the License.
 namespace tensorflow {
 
 xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x,
-                    bool transpose_y, bool conjugate_x, bool conjugate_y) {
+                    bool transpose_y, bool conjugate_x, bool conjugate_y,
+                    xla::PrecisionConfigProto::Precision precision) {
   xla::XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
     TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x));
@@ -95,6 +96,10 @@ xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x,
       y = xla::Conj(y);
     }
 
+    xla::PrecisionConfigProto precision_proto;
+    precision_proto.add_operand_precision(precision);
+    precision_proto.add_operand_precision(precision);
+
     // If there are no batch dimensions, use a regular Dot.
     // TODO(b/69062148) Remove this code when Dot emitters can be passed
     // dimensions to transpose directly (i.e. without requiring a Transpose
@@ -102,7 +107,7 @@ xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x,
     if (batch_dimension_numbers.empty()) {
       auto lhs = transpose_x ? xla::Transpose(x, {1, 0}) : x;
       auto rhs = transpose_y ? xla::Transpose(y, {1, 0}) : y;
-      return xla::Dot(lhs, rhs);
+      return xla::Dot(lhs, rhs, &precision_proto);
     }
 
     xla::DotDimensionNumbers dot_dnums;
@@ -112,7 +117,8 @@ xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x,
       dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
       dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
     }
-    return xla::DotGeneral(x, y, dot_dnums);
+
+    return xla::DotGeneral(x, y, dot_dnums, &precision_proto);
   });
 }
 
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
index 8757b16a1c..6cfccd5553 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace tensorflow {
 
@@ -45,7 +45,9 @@ namespace tensorflow {
 //     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
 xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x = false,
                     bool transpose_y = false, bool conjugate_x = false,
-                    bool conjugate_y = false);
+                    bool conjugate_y = false,
+                    xla::PrecisionConfigProto::Precision precision =
+                        xla::PrecisionConfigProto::DEFAULT);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index 87d73eb3f0..67fb56510c 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -49,7 +49,8 @@ namespace {
 //     l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j], row_t)) /
 //                       l[..., j, j]
 //   return l
-xla::XlaOp CholeskyUnblocked(xla::XlaOp a) {
+xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
+                             xla::PrecisionConfigProto::Precision precision) {
   xla::XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
     TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
@@ -101,7 +102,8 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a) {
       // np.dot(row, np.swapaxes(row, -1, -2))
       auto diag_dot = BatchDot(row, row,
                                /*transpose_x=*/false,
-                               /*transpose_y=*/true);
+                               /*transpose_y=*/true, /*conjugate_x=*/false,
+                               /*conjugate_y=*/false, precision);
       // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row,
       //                                              np.swapaxes(row, -1, -2)))
       auto l_ii =
@@ -121,7 +123,8 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a) {
       // r.T)
       auto dot = BatchDot(body_l, row,
                           /*transpose_x=*/false,
-                          /*transpose_y=*/true);
+                          /*transpose_y=*/true, /*conjugate_x=*/false,
+                          /*conjugate_y=*/false, precision);
       // np.dot(l[..., i+1:, :i], r.T)
       auto dot_ip1 =
           xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, dot);
@@ -145,7 +148,8 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a) {
 
 }  // namespace
 
-xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size) {
+xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
+                    xla::PrecisionConfigProto::Precision precision) {
   xla::XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
     TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
@@ -181,14 +185,15 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size) {
         auto lhs = SliceInMinorDims(l, {i, 0}, {n, i});
         auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i});
         auto delta = BatchDot(lhs, rhs, /*transpose_x=*/false,
-                              /*transpose_y=*/true);
+                              /*transpose_y=*/true, /*conjugate_x=*/false,
+                              /*conjugate_y=*/false, precision);
         auto before = SliceInMinorDims(a, {i, i}, {n, i + k});
         a = UpdateSliceInMinorDims(a, before - delta, {i, i});
       }
 
       // l[i:i+k, i:i+k] = cholesky_unblocked(a[i:i+k, i:i+k])
       auto x = SliceInMinorDims(a, {i, i}, {i + k, i + k});
-      auto factorized = CholeskyUnblocked(x);
+      auto factorized = CholeskyUnblocked(x, precision);
       l = UpdateSliceInMinorDims(l, factorized, {i, i});
 
       if (i + k < n) {
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
index 1bef9bb166..60cd7ded53 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace tensorflow {
 
@@ -30,7 +30,9 @@ namespace tensorflow {
 // TODO(phawkins): check for negative values on the diagonal and return an
 // error, instead of silently yielding NaNs.
 // TODO(znado): handle the complex Hermitian case
-xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size = 256);
+xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size = 256,
+                    xla::PrecisionConfigProto::Precision precision =
+                        xla::PrecisionConfigProto::HIGHEST);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc
index fc0c1ee838..b6f30d8d49 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.cc
+++ b/tensorflow/compiler/tf2xla/lib/qr.cc
@@ -149,7 +149,8 @@ struct QRBlockResult {
   xla::XlaOp taus;  // Shape: [..., n]
   xla::XlaOp vs;    // Shape: [..., m, n]
 };
-xla::StatusOr<QRBlockResult> QRBlock(xla::XlaOp a) {
+xla::StatusOr<QRBlockResult> QRBlock(
+    xla::XlaOp a, xla::PrecisionConfigProto::Precision precision) {
   xla::XlaBuilder* builder = a.builder();
   TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
   const int num_dims = xla::ShapeUtil::Rank(a_shape);
@@ -190,8 +191,12 @@ xla::StatusOr<QRBlockResult> QRBlock(xla::XlaOp a) {
     auto v_broadcast = xla::Reshape(v, shape);
     // a[:, :] -= tau * np.dot(v[:, np.newaxis],
     //                          np.dot(v[np.newaxis, :], a[:, :]))
-    auto vva = BatchDot(v_broadcast, a);
-    vva = BatchDot(v_broadcast, vva, /*transpose_x=*/true);
+    auto vva =
+        BatchDot(v_broadcast, a, /*transpose_x=*/false, /*transpose_y=*/false,
+                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    vva =
+        BatchDot(v_broadcast, vva, /*transpose_x=*/true, /*transpose_y=*/false,
+                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
     a = a - xla::Mul(tau, vva,
                      /*broadcast_dimensions=*/batch_dim_indices);
 
@@ -251,7 +256,8 @@ xla::StatusOr<QRBlockResult> QRBlock(xla::XlaOp a) {
 // vs.
 xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
     xla::PrimitiveType type, gtl::ArraySlice<int64> batch_dims, xla::XlaOp vs,
-    xla::XlaOp taus, int64 m, int64 n) {
+    xla::XlaOp taus, int64 m, int64 n,
+    xla::PrecisionConfigProto::Precision precision) {
   std::vector<int64> batch_dim_indices(batch_dims.size());
   std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0);
   int64 n_index = batch_dims.size() + 1;
@@ -272,9 +278,12 @@ xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
     auto beta = DynamicSliceInMinorDims(taus, {j}, {1});
 
     // yv has shape [..., n, 1]
-    auto yv = BatchDot(y, v, /*transpose_x=*/true);
+    auto yv = BatchDot(y, v, /*transpose_x=*/true, /*transpose_y=*/false,
+                       /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
     // wyv has shape [..., m, 1]
-    auto wyv = BatchDot(w, yv);
+    auto wyv =
+        BatchDot(w, yv, /*transpose_x=*/false, /*transpose_y=*/false,
+                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
 
     auto z = xla::Mul(
         -beta, v + wyv,
@@ -321,8 +330,9 @@ xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
 //   return (q, a)
 // TODO(phawkins): consider using UT transformations (in the form I - V U V')
 // rather than WY transformations.
-xla::StatusOr<QRDecompositionResult> QRDecomposition(xla::XlaOp a,
-                                                     int64 block_size) {
+xla::StatusOr<QRDecompositionResult> QRDecomposition(
+    xla::XlaOp a, int64 block_size,
+    xla::PrecisionConfigProto::Precision precision) {
   xla::XlaBuilder* builder = a.builder();
   TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
   const int num_dims = xla::ShapeUtil::Rank(a_shape);
@@ -352,29 +362,36 @@ xla::StatusOr<QRDecompositionResult> QRDecomposition(xla::XlaOp a,
     int64 k = std::min(block_size, p - i);
 
     auto a_block = SliceInMinorDims(a, {i, i}, {m, i + k});
-    TF_ASSIGN_OR_RETURN(auto qr_block, QRBlock(a_block));
+    TF_ASSIGN_OR_RETURN(auto qr_block, QRBlock(a_block, precision));
 
     a = UpdateSliceInMinorDims(a, qr_block.r, {i, i});
 
     // Compute the I-WY block representation of a product of Householder
     // matrices.
-    TF_ASSIGN_OR_RETURN(auto w,
-                        ComputeWYRepresentation(type, batch_dims, qr_block.vs,
-                                                qr_block.taus, m - i, k));
+    TF_ASSIGN_OR_RETURN(
+        auto w, ComputeWYRepresentation(type, batch_dims, qr_block.vs,
+                                        qr_block.taus, m - i, k, precision));
     auto y = qr_block.vs;
 
     // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:]))
     auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n});
-    auto a_update = BatchDot(w, a_panel, /*transpose_x=*/true);
-    a_update = BatchDot(y, a_update);
+    auto a_update =
+        BatchDot(w, a_panel, /*transpose_x=*/true, /*transpose_y=*/false,
+                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    a_update =
+        BatchDot(y, a_update, /*transpose_x=*/false, /*transpose_y=*/false,
+                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
     a_panel = a_panel + a_update;
     a = UpdateSliceInMinorDims(a, a_panel, {i, i + k});
 
     // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T))
     auto q_panel = SliceInMinorDims(q, {0, i}, {m, m});
-    auto q_update = BatchDot(q_panel, w);
-    q_update =
-        BatchDot(q_update, y, /*transpose_x=*/false, /*transpose_y=*/true);
+    auto q_update =
+        BatchDot(q_panel, w, /*transpose_x=*/false, /*transpose_y=*/false,
+                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    q_update = BatchDot(q_update, y, /*transpose_x=*/false,
+                        /*transpose_y=*/true, /*conjugate_x=*/false,
+                        /*conjugate_y=*/false, precision);
     q_panel = q_panel + q_update;
     q = UpdateSliceInMinorDims(q, q_panel, {0, i});
   }
diff --git a/tensorflow/compiler/tf2xla/lib/qr.h b/tensorflow/compiler/tf2xla/lib/qr.h
index abd2316ac9..05565477b6 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.h
+++ b/tensorflow/compiler/tf2xla/lib/qr.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace tensorflow {
 
@@ -32,8 +33,10 @@ struct QRDecompositionResult {
   xla::XlaOp r;
 };
 
-xla::StatusOr<QRDecompositionResult> QRDecomposition(xla::XlaOp a,
-                                                     int64 block_size = 128);
+xla::StatusOr<QRDecompositionResult> QRDecomposition(
+    xla::XlaOp a, int64 block_size = 128,
+    xla::PrecisionConfigProto::Precision precision =
+        xla::PrecisionConfigProto::HIGHEST);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 04fa10108c..37b2240b45 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -57,7 +57,7 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
     // We can grab entire blocks using gather
     if (n > block_size) {
       // Construct the starting indices of the diagonal blocks
-      auto gather_indices =
+      auto start_indices =
           Transpose(Broadcast(Mul(Iota(builder, xla::S32, num_blocks),
                                   xla::ConstantR0<int32>(builder, block_size)),
                               /*broadcast_sizes=*/{2}),
@@ -65,13 +65,13 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
 
       // Gather the diagonal blocks
       xla::GatherDimensionNumbers dim_numbers;
-      dim_numbers.add_output_window_dims(ndims - 1);
-      dim_numbers.add_output_window_dims(ndims);
-      dim_numbers.add_gather_dims_to_operand_dims(ndims - 2);
-      dim_numbers.add_gather_dims_to_operand_dims(ndims - 1);
+      dim_numbers.add_offset_dims(ndims - 1);
+      dim_numbers.add_offset_dims(ndims);
+      dim_numbers.add_start_index_map(ndims - 2);
+      dim_numbers.add_start_index_map(ndims - 1);
       dim_numbers.set_index_vector_dim(1);
-      diag_blocks = Gather(a, gather_indices, dim_numbers,
-                           /*window_bounds=*/{block_size, block_size});
+      diag_blocks = Gather(a, start_indices, dim_numbers,
+                           /*slice_sizes=*/{block_size, block_size});
     }
 
     // The last block might be smaller than the block size,
@@ -110,8 +110,9 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
   });
 }
 
-xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
-                                bool transpose_a, bool conjugate_a) {
+xla::XlaOp InvertDiagonalBlocks(
+    xla::XlaOp diag_blocks, bool lower, bool transpose_a, bool conjugate_a,
+    xla::PrecisionConfigProto::Precision precision) {
   xla::XlaBuilder* builder = diag_blocks.builder();
   return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
     // Input is a batch of square lower triangular square matrices. Its shape is
@@ -215,7 +216,10 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
       dnums.add_rhs_batch_dimensions(0);
       dnums.add_lhs_contracting_dimensions(2);
       dnums.add_rhs_contracting_dimensions(1);
-      auto update = -DotGeneral(input_row, body_out, dnums);
+      xla::PrecisionConfigProto precision_proto;
+      precision_proto.add_operand_precision(precision);
+      precision_proto.add_operand_precision(precision);
+      auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto);
 
       body_out = DynamicUpdateSlice(body_out, update, start_indices);
 
@@ -238,10 +242,10 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
   });
 }
 
-xla::XlaOp SolveWithInvertedDiagonalBlocks(xla::XlaOp a, xla::XlaOp b,
-                                           xla::XlaOp inv_diag_blocks,
-                                           bool left_side, bool lower,
-                                           bool transpose_a, bool conjugate_a) {
+xla::XlaOp SolveWithInvertedDiagonalBlocks(
+    xla::XlaOp a, xla::XlaOp b, xla::XlaOp inv_diag_blocks, bool left_side,
+    bool lower, bool transpose_a, bool conjugate_a,
+    xla::PrecisionConfigProto::Precision precision) {
   xla::XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
     TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
@@ -307,9 +311,13 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(xla::XlaOp a, xla::XlaOp b,
         auto a_row =
             MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a);
         if (left_side) {
-          remainder = b_row - BatchDot(a_row, x, transpose_a, false);
+          remainder = b_row - BatchDot(a_row, x, transpose_a, false,
+                                       /*conjugate_x=*/false,
+                                       /*conjugate_y=*/false, precision);
         } else {
-          remainder = b_row - BatchDot(x, a_row, false, transpose_a);
+          remainder = b_row - BatchDot(x, a_row, false, transpose_a,
+                                       /*conjugate_x=*/false,
+                                       /*conjugate_y=*/false, precision);
         }
       }
 
@@ -319,9 +327,13 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(xla::XlaOp a, xla::XlaOp b,
           xla::ConstantR0WithType(builder, xla::S32, j * block_size);
       std::vector<xla::XlaOp> update_starts = {start_index, zero};
       if (left_side) {
-        x_update = BatchDot(inv_block, remainder, transpose_a, false);
+        x_update =
+            BatchDot(inv_block, remainder, transpose_a, false,
+                     /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
       } else {
-        x_update = BatchDot(remainder, inv_block, false, transpose_a);
+        x_update =
+            BatchDot(remainder, inv_block, false, transpose_a,
+                     /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
         std::swap(update_starts[0], update_starts[1]);
       }
       x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts);
@@ -333,7 +345,8 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(xla::XlaOp a, xla::XlaOp b,
 
 xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
                            bool lower, bool transpose_a, bool conjugate_a,
-                           int64 block_size) {
+                           int64 block_size,
+                           xla::PrecisionConfigProto::Precision precision) {
   xla::XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
     TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
@@ -388,12 +401,13 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
     auto diag_blocks = DiagonalBlocks(a, block_size);
 
     // We invert these blocks in parallel using batched matrix-vector products
-    auto inv_diag_blocks =
-        InvertDiagonalBlocks(diag_blocks, lower, transpose_a, conjugate_a);
+    auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a,
+                                                conjugate_a, precision);
 
     // We now find the solution using GEMMs
-    auto x = SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side,
-                                             lower, transpose_a, conjugate_a);
+    auto x =
+        SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side, lower,
+                                        transpose_a, conjugate_a, precision);
 
     return x;
   });
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
index 555760b7ef..ac42a48352 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace tensorflow {
 
@@ -59,7 +59,9 @@ namespace tensorflow {
 // blocking is used.
 xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
                            bool lower, bool transpose_a, bool conjugate_a,
-                           int64 block_size = 128);
+                           int64 block_size = 128,
+                           xla::PrecisionConfigProto::Precision precision =
+                               xla::PrecisionConfigProto::HIGHEST);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index ace6fd1d8e..4dce0a2102 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -11,6 +11,8 @@ cc_library(
     srcs = ["xla_ops.cc"],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index a59c77f5c3..2cd9ae799f 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -13,11 +13,97 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
+namespace {
+
+// Helper shape function for operators that return an output with the same rank
+// as their first input.
+Status UnchangedRank(shape_inference::InferenceContext* c) {
+  if (c->RankKnown(c->input(0))) {
+    c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(0))));
+  } else {
+    c->set_output(0, c->input(0));
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("XlaBroadcastHelper")
+    .Input("lhs: T")
+    .Input("rhs: T")
+    .Input("broadcast_dims: Tindices")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Output("lhs_output: T")
+    .Output("rhs_output: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Helper operator for performing XLA-style broadcasts
+
+Broadcasts `lhs` and `rhs` to the same rank, by adding size 1 dimensions to
+whichever of `lhs` and `rhs` has the lower rank, using XLA's broadcasting rules
+for binary operators.
+
+lhs: the LHS input tensor
+rhs: the RHS input tensor
+broadcast_dims: an XLA-style broadcast dimension specification
+lhs_output: the broadcasted LHS tensor
+rhs_output: the broadcasted RHS tensor
+)doc");
+
+REGISTER_OP("XlaConv")
+    .Input("lhs: T")
+    .Input("rhs: T")
+    .Input("window_strides: Tindices")
+    .Input("padding: Tindices")
+    .Input("lhs_dilation: Tindices")
+    .Input("rhs_dilation: Tindices")
+    .Input("feature_group_count: Tindices")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("dimension_numbers: string")
+    .Attr("precision_config: string")
+    .Output("output: T")
+    .SetShapeFn(UnchangedRank)
+    .Doc(R"doc(
+Wraps the XLA ConvGeneralDilated operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution
+.
+
+lhs: the input tensor
+rhs: the kernel tensor
+window_strides: the inter-window strides
+padding: the padding to apply at the start and end of each input dimensions
+lhs_dilation: dilation to apply between input elements
+rhs_dilation: dilation to apply between kernel elements
+feature_group_count: number of feature groups for grouped convolution.
+dimension_numbers: a serialized xla::ConvolutionDimensionNumbers proto.
+precision_config: a serialized xla::PrecisionConfigProto proto.
+)doc");
+
+REGISTER_OP("XlaDot")
+    .Input("lhs: T")
+    .Input("rhs: T")
+    .Attr("T: numbertype")
+    .Attr("dimension_numbers: string")
+    .Attr("precision_config: string")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Wraps the XLA ConvGeneralDilated operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#dotgeneral
+.
+
+lhs: the LHS tensor
+rhs: the RHS tensor
+dimension_numbers: a serialized xla::DotDimensionNumbers proto.
+precision_config: a serialized xla::PrecisionConfigProto proto.
+)doc");
 
 REGISTER_OP("XlaDynamicUpdateSlice")
     .Input("input: T")
@@ -73,6 +159,29 @@ else_branch: A function takes 'inputs' and returns a list of tensors.
              whose types are the same as what then_branch returns.
 )doc");
 
+REGISTER_OP("XlaPad")
+    .Input("input: T")
+    .Input("padding_value: T")
+    .Input("padding_low: Tindices")
+    .Input("padding_high: Tindices")
+    .Input("padding_interior: Tindices")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(UnchangedRank)
+    .Doc(R"doc(
+Wraps the XLA Pad operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#pad
+.
+
+input: A `Tensor` of type T.
+padding_value: A scalar `Tensor` of type T.
+padding_low: the padding to apply at the start of each input dimensions
+padding_high: the padding to apply at the end of each input dimension.
+padding_interior: the padding to apply between each input element.
+output: A `Tensor` of type T.
+)doc");
+
 REGISTER_OP("XlaRecv")
     .Output("tensor: dtype")
     .Attr("dtype: type")
@@ -98,17 +207,58 @@ tensor_name: A string key that identifies the channel.
 shape: The shape of the tensor.
 )doc");
 
+REGISTER_OP("XlaReduce")
+    .Input("input: T")
+    .Input("init_value: T")
+    .Attr("T: numbertype")
+    .Attr("dimensions_to_reduce: list(int)")
+    .Attr("reducer: func")
+    .Output("output: T")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      if (c->RankKnown(c->input(0))) {
+        int rank = c->Rank(c->input(0));
+        std::vector<int64> dimensions_to_reduce;
+        TF_RETURN_IF_ERROR(
+            c->GetAttr("dimensions_to_reduce", &dimensions_to_reduce));
+        std::set<int64> dims_set(dimensions_to_reduce.begin(),
+                                 dimensions_to_reduce.end());
+        auto dim_in_range = [rank](int64 dim) {
+          return dim >= 0 && dim < rank;
+        };
+        if (rank < dimensions_to_reduce.size() ||
+            dims_set.size() != dimensions_to_reduce.size() ||
+            !absl::c_all_of(dimensions_to_reduce, dim_in_range)) {
+          return errors::InvalidArgument(
+              "Invalid dimensions_to_reduce argument to XlaReduce");
+        }
+        c->set_output(
+            0, c->UnknownShapeOfRank(rank - dimensions_to_reduce.size()));
+      } else {
+        c->set_output(0, c->input(0));
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Wraps the XLA Reduce operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#reduce .
+
+input: the input tensor
+init_value: a scalar representing the initial value for the reduction
+reducer: a reducer function to apply
+dimensions_to_reduce: dimension numbers over which to reduce
+)doc");
+
 REGISTER_OP("XlaReduceWindow")
     .Input("input: T")
     .Input("init_value: T")
+    .Input("window_dimensions: Tindices")
+    .Input("window_strides: Tindices")
+    .Input("padding: Tindices")
     .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
     .Attr("computation: func")
-    .Attr("window_dimensions: list(int)")
-    .Attr("window_strides: list(int)")
-    .Attr("padding_low: list(int)")
-    .Attr("padding_high: list(int)")
     .Output("output: T")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn(UnchangedRank)
     .Doc(R"doc(
 Wraps the XLA ReduceWindow operator, documented at
  https://www.tensorflow.org/performance/xla/operation_semantics#reducewindow .
@@ -118,8 +268,35 @@ init_value: a scalar representing the initial value for the reduction
 computation: a reducer function to apply
 window_dimensions: the shape of the window
 window_strides: the inter-window strides
-padding_low: the padding to apply at the start of each input dimensions
-padding_high: the padding to apply at the end of each input dimension.
+padding: the padding to apply at the start and end of each input dimensions
+)doc");
+
+REGISTER_OP("XlaSelectAndScatter")
+    .Input("operand: T")
+    .Input("window_dimensions: Tindices")
+    .Input("window_strides: Tindices")
+    .Input("padding: Tindices")
+    .Input("source: T")
+    .Input("init_value: T")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("select: func")
+    .Attr("scatter: func")
+    .Output("output: T")
+    .SetShapeFn(UnchangedRank)
+    .Doc(R"doc(
+Wraps the XLA SelectAndScatter operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#selectandscatter
+.
+
+operand: the input tensor
+window_dimensions: the shape of the window
+window_strides: the inter-window strides
+padding: the padding to apply at the start and end of each input dimensions
+source: a tensor of values to scatter
+init_value: a scalar representing the initial value for the output tensor
+select: a selection function to apply
+scatter: a scatter function to apply
 )doc");
 
 REGISTER_OP("XlaSend")
@@ -179,4 +356,5 @@ body: A function that takes a list of tensors and returns another
       list of tensors. Both lists have the same types as specified by T.
 )doc");
 
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index 42b6292f79..69ca394360 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -28,5 +28,6 @@ py_library(
     srcs = ["xla.py"],
     deps = [
         "//tensorflow/compiler/tf2xla/ops:gen_xla_ops",
+        "//tensorflow/compiler/xla:xla_data_proto_py",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 2fc47dffb8..3626de375e 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -15,11 +15,12 @@
 """Experimental library that exposes XLA operations directly in TensorFlow.
 
 It is sometimes useful to be able to build HLO programs directly from
-TensorFlow. This file provides Tensorflow operators that map as closely as
-possible to HLO operators.
+TensorFlow. This file provides Tensorflow operators that mirror the semantics of
+HLO operators as closely as possible.
 
-There is no promise of backward or forward compatibility for operators defined
-in this module.
+Note: There is no promise of backward or forward compatibility for operators
+defined in this module. This is primarily because the underlying HLO operators
+do not promise backward or forward compatibility.
 """
 
 from __future__ import absolute_import
@@ -27,11 +28,298 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.compiler.tf2xla.ops import gen_xla_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import bitwise_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+
+# TODO(phawkins): provide wrappers for all XLA operators. Currently the missing
+# ops include:
+# infeed/outfeed (available via tf.contrib.tpu)
+# collectives, e.g., cross-replica-sum (available via tf.contrib.tpu)
+# conditional
+# gather/scatter
+# collapse
+
+# This file reuses builtin names (following XLA's names, so we can call things
+# like xla.max), so we capture the builtin versions here.
+# pylint: disable=redefined-builtin
+_max = max
+_min = min
+_slice = slice  # pylint: disable=invalid-name
+
+constant = constant_op.constant
+
+# Unary operators.
+
+# For most arithmetic operators there is a TensorFlow operator
+# that exactly corresponds to each XLA operator. Rather than defining
+# XLA-specific variants, we reuse the corresponding TensorFlow operator.
+# TODO(phawkins): It would be even better to have TensorFlow operators that 1:1
+# wrap every HLO operator, because that would allow us to be confident that the
+# semantics match.
+
+
+def _unary_op(fn):
+  """Wrapper that restricts `fn` to have the correct signature."""
+
+  def unary_op_wrapper(x, name=None):
+    return fn(x, name=name)
+
+  return unary_op_wrapper
+
+
+abs = _unary_op(math_ops.abs)
+# TODO(phawkins): implement clz.
+conj = _unary_op(math_ops.conj)
+cos = _unary_op(math_ops.cos)
+ceil = _unary_op(math_ops.ceil)
+digamma = _unary_op(math_ops.digamma)
+erf = _unary_op(math_ops.erf)
+erfc = _unary_op(math_ops.erfc)
+# TODO(phawkins): implement erfinv
+exp = _unary_op(math_ops.exp)
+expm1 = _unary_op(math_ops.expm1)
+floor = _unary_op(math_ops.floor)
+imag = _unary_op(math_ops.imag)
+is_finite = _unary_op(math_ops.is_finite)
+lgamma = _unary_op(math_ops.lgamma)
+log = _unary_op(math_ops.log)
+log1p = _unary_op(math_ops.log1p)
+logical_not = _unary_op(math_ops.logical_not)
+neg = _unary_op(math_ops.neg)
+real = _unary_op(math_ops.real)
+# TODO(phawkins): unlike xla::Round, this rounds to even instead of zero for
+# numbers halfway between two integers.
+round = _unary_op(math_ops.round)
+sin = _unary_op(math_ops.sin)
+sign = _unary_op(math_ops.sign)
+tanh = _unary_op(math_ops.tanh)
+
+# Binary operators
+
+# The main difference between TensorFlow and XLA binary ops is the broadcasting
+# semantics. TensorFlow uses Numpy-style broadcasting semantics, whereas XLA
+# requires an explicit specification of which dimensions to broadcast if the
+# arguments have different ranks.
+
+
+def _broadcasting_binary_op(fn):
+  """Wraps a binary Tensorflow operator and performs XLA-style broadcasting."""
+
+  def broadcasting_binary_op_wrapper(x, y, broadcast_dims=None, name=None):
+    """Inner wrapper function."""
+    broadcast_dims = broadcast_dims or []
+    broadcast_dims = ops.convert_to_tensor(broadcast_dims, dtypes.int64)
+    # Rather than relying on having static shape information in the TensorFlow
+    # graph, we use an XlaBroadcastHelper op that can compute the correct shapes
+    # at JIT compilation time.
+    x, y = gen_xla_ops.xla_broadcast_helper(x, y, broadcast_dims)
+    return fn(x, y, name=name)
+
+  return broadcasting_binary_op_wrapper
+
+
+# Map from TF signed types to TF unsigned types.
+_SIGNED_TO_UNSIGNED_TABLE = {
+    dtypes.int8: dtypes.uint8,
+    dtypes.int16: dtypes.uint16,
+    dtypes.int32: dtypes.uint32,
+    dtypes.int64: dtypes.uint64,
+}
+
+# Map from TF unsigned types to TF signed types.
+_UNSIGNED_TO_SIGNED_TABLE = {
+    dtypes.uint8: dtypes.int8,
+    dtypes.uint16: dtypes.int16,
+    dtypes.uint32: dtypes.int32,
+    dtypes.uint64: dtypes.int64,
+}
+
+
+def _shift_right_logical_helper(x, y, name=None):
+  """Performs an integer right logical shift irrespective of input type."""
+  assert y.dtype == x.dtype
+  dtype = x.dtype
+  signed = dtype in _SIGNED_TO_UNSIGNED_TABLE
+  if signed:
+    unsigned_dtype = _SIGNED_TO_UNSIGNED_TABLE[dtype]
+    x = math_ops.cast(x, unsigned_dtype)
+    y = math_ops.cast(y, unsigned_dtype)
+  output = bitwise_ops.right_shift(x, y, name=name)
+  if signed:
+    output = math_ops.cast(output, dtype)
+  return output
+
+
+def _shift_right_arithmetic_helper(x, y, name=None):
+  """Performs an integer right arithmetic shift irrespective of input type."""
+  assert y.dtype == x.dtype
+  dtype = x.dtype
+  unsigned = dtype in _UNSIGNED_TO_SIGNED_TABLE
+  if unsigned:
+    signed_dtype = _UNSIGNED_TO_SIGNED_TABLE[dtype]
+    x = math_ops.cast(x, signed_dtype)
+    y = math_ops.cast(y, signed_dtype)
+  output = bitwise_ops.right_shift(x, y, name=name)
+  if unsigned:
+    output = math_ops.cast(output, dtype)
+  return output
+
+
+add = _broadcasting_binary_op(math_ops.add)
+sub = _broadcasting_binary_op(math_ops.sub)
+mul = _broadcasting_binary_op(math_ops.mul)
+div = _broadcasting_binary_op(math_ops.div)
+rem = _broadcasting_binary_op(gen_math_ops.mod)
+max = _broadcasting_binary_op(math_ops.maximum)
+min = _broadcasting_binary_op(math_ops.minimum)
+atan2 = _broadcasting_binary_op(math_ops.atan2)
+complex = _broadcasting_binary_op(math_ops.complex)
+logical_and = _broadcasting_binary_op(math_ops.logical_and)
+logical_or = _broadcasting_binary_op(math_ops.logical_or)
+logical_xor = _broadcasting_binary_op(math_ops.logical_xor)
+eq = _broadcasting_binary_op(math_ops.equal)
+ne = _broadcasting_binary_op(math_ops.not_equal)
+ge = _broadcasting_binary_op(math_ops.greater_equal)
+gt = _broadcasting_binary_op(math_ops.greater)
+le = _broadcasting_binary_op(math_ops.less_equal)
+lt = _broadcasting_binary_op(math_ops.less)
+pow = _broadcasting_binary_op(math_ops.pow)
+shift_left = _broadcasting_binary_op(bitwise_ops.left_shift)
+shift_right_logical = _broadcasting_binary_op(_shift_right_logical_helper)
+shift_right_arithmetic = _broadcasting_binary_op(_shift_right_arithmetic_helper)
+
+
+def _binary_op(fn):
+  """Wrapper that restricts `fn` to have the correct signature."""
+
+  def binary_op_wrapper(x, y, name=None):
+    return fn(x, y, name=name)
+
+  return binary_op_wrapper
+
+
+transpose = _binary_op(array_ops.transpose)
+rev = _binary_op(array_ops.reverse)
+
+bitcast_convert_type = array_ops.bitcast
+
+
+def broadcast(x, dims, name=None):
+  x = ops.convert_to_tensor(x)
+  shape = array_ops.concat(
+      [constant_op.constant(dims),
+       array_ops.shape(x)], axis=0)
+  return array_ops.broadcast_to(x, shape, name=name)
+
+
+def clamp(a, x, b, name=None):
+  return min(max(a, x, name=name), b, name=name)
+
+
+concatenate = array_ops.concat
+
+
+def conv(lhs,
+         rhs,
+         window_strides,
+         padding,
+         lhs_dilation,
+         rhs_dilation,
+         dimension_numbers,
+         feature_group_count=1,
+         precision_config=None,
+         name=None):
+  """Wraps the XLA ConvGeneralDilated operator.
+
+  ConvGeneralDilated is the most general form of XLA convolution and is
+  documented at
+  https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution
+
+  Args:
+    lhs: the input tensor
+    rhs: the kernel tensor
+    window_strides: the inter-window strides
+    padding: the padding to apply at the start and end of each input dimensions
+    lhs_dilation: dilation to apply between input elements
+    rhs_dilation: dilation to apply between kernel elements
+    dimension_numbers: a `ConvolutionDimensionNumbers` proto.
+    feature_group_count: number of feature groups for grouped convolution.
+    precision_config: a `PrecisionConfigProto` proto.
+    name: an optional name for the operator
+
+  Returns:
+    A tensor representing the output of the convolution.
+  """
+  precision_config_proto = ""
+  if precision_config:
+    precision_config_proto = precision_config.SerializeToString()
+  return gen_xla_ops.xla_conv(
+      lhs,
+      rhs,
+      window_strides=window_strides,
+      padding=padding,
+      lhs_dilation=lhs_dilation,
+      rhs_dilation=rhs_dilation,
+      feature_group_count=feature_group_count,
+      dimension_numbers=dimension_numbers.SerializeToString(),
+      precision_config=precision_config_proto,
+      name=name)
+
+
+convert_element_type = math_ops.cast
+
+
+def dot(lhs, rhs, name=None):
+  return math_ops.tensordot(lhs, rhs, axes=1, name=name)
+
+
+def dot_general(lhs, rhs, dimension_numbers, precision_config=None, name=None):
+  precision_config_proto = ""
+  if precision_config:
+    precision_config_proto = precision_config.SerializeToString()
+  return gen_xla_ops.xla_dot(
+      lhs,
+      rhs,
+      dimension_numbers=dimension_numbers.SerializeToString(),
+      precision_config=precision_config_proto,
+      name=name)
+
+
+def dynamic_slice(x, starts, sizes, name=None):
+  # TODO(phawkins): the Slice operator lowers to DynamicSlice if `starts` is not
+  # a compile-time constant. This doesn't exactly mimic the semantics of dynamic
+  # slice if the slice is out of bounds.
+  return array_ops.slice(x, starts, sizes, name=name)
 
-# TODO(phawkins): provide wrappers for all XLA operators.
 
 dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
 
+# TODO(phawkins): generalize tf.pad to support interior padding, and then remove
+# the XLA-specific pad operator.
+pad = gen_xla_ops.xla_pad
+
+
+def random_normal(mu, sigma, dims, name=None):
+  mu = ops.convert_to_tensor(mu)
+  return random_ops.random_normal(
+      dims, mean=mu, stddev=sigma, dtype=mu.dtype, name=name)
+
+
+def random_uniform(minval, maxval, dims, name=None):
+  minval = ops.convert_to_tensor(minval)
+  return random_ops.random_uniform(
+      dims, minval, maxval, dtype=minval.dtype, name=name)
+
+
+recv = gen_xla_ops.xla_recv
+reduce = gen_xla_ops.xla_reduce
+
 
 def reduce_window(operand,
                   init,
@@ -61,22 +349,38 @@ def reduce_window(operand,
   """
   window_strides = window_strides or [1] * len(window_dimensions)
   padding = padding or [(0, 0)] * len(window_dimensions)
-  padding_low = [x for (x, _) in padding]
-  padding_high = [y for (_, y) in padding]
   return gen_xla_ops.xla_reduce_window(
-      operand,
-      init,
-      reducer,
-      window_dimensions,
-      window_strides,
-      padding_low,
-      padding_high,
+      input=operand,
+      init_value=init,
+      window_dimensions=window_dimensions,
+      window_strides=window_strides,
+      padding=padding,
+      computation=reducer,
       name=name)
 
 
-recv = gen_xla_ops.xla_recv
+def reshape(x, new_sizes, dimensions=None, name=None):
+  if dimensions is not None:
+    x = array_ops.transpose(x, dimensions)
+  x = array_ops.reshape(x, new_sizes, name=name)
+  return x
+
+
+def select(condition, x, y, name=None):
+  return array_ops.where(condition, x, y, name)
+
+
+select_and_scatter = gen_xla_ops.xla_select_and_scatter
 send = gen_xla_ops.xla_send
 
-sort = gen_xla_ops.xla_sort
 
+def slice(x, start_dims, limit_dims, strides):
+  spec = [
+      _slice(start, limit, stride)
+      for (start, limit, stride) in zip(start_dims, limit_dims, strides)
+  ]
+  return x[tuple(spec)]
+
+
+sort = gen_xla_ops.xla_sort
 while_loop = gen_xla_ops.xla_while
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc
new file mode 100644
index 0000000000..32ba6df2e6
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc
@@ -0,0 +1,130 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/resource_operation_table.h"
+#include "absl/algorithm/container.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+/*static*/ StringPiece XlaResourceOpInfo::XlaResourceOpKindToString(
+    XlaResourceOpKind op_kind) {
+  switch (op_kind) {
+    case XlaResourceOpKind::kRead:
+      return "Read";
+    case XlaResourceOpKind::kWrite:
+      return "Write";
+    case XlaResourceOpKind::kReadWrite:
+      return "Modify";
+  }
+}
+
+static gtl::FlatMap<StringPiece, XlaResourceOpInfo>* CreateResourceOpInfoMap() {
+  gtl::FlatMap<StringPiece, XlaResourceOpInfo>* result =
+      new gtl::FlatMap<StringPiece, XlaResourceOpInfo>;
+
+  auto add = [&](StringPiece op, XlaResourceOpKind op_kind,
+                 XlaResourceKind resource_kind) {
+    auto insert_result =
+        result->insert({op, XlaResourceOpInfo(op_kind, resource_kind)});
+    CHECK(insert_result.second);
+  };
+
+  auto kRead = XlaResourceOpKind::kRead;
+  auto kWrite = XlaResourceOpKind::kWrite;
+  auto kReadWrite = XlaResourceOpKind::kReadWrite;
+
+  auto kVariable = XlaResourceKind::kVariable;
+  auto kStack = XlaResourceKind::kStack;
+  auto kTensorArray = XlaResourceKind::kTensorArray;
+
+  // clang-format off
+  add("AssignAddVariableOp"                  , kReadWrite, kVariable);
+  add("AssignSubVariableOp"                  , kReadWrite, kVariable);
+  add("AssignVariableOp"                     , kWrite,     kVariable);
+  add("ReadVariableOp"                       , kRead,      kVariable);
+  add("ResourceApplyAdaMax"                  , kReadWrite, kVariable);
+  add("ResourceApplyAdadelta"                , kReadWrite, kVariable);
+  add("ResourceApplyAdagrad"                 , kReadWrite, kVariable);
+  add("ResourceApplyAdagradDA"               , kReadWrite, kVariable);
+  add("ResourceApplyAdam"                    , kReadWrite, kVariable);
+  add("ResourceApplyAddSign"                 , kReadWrite, kVariable);
+  add("ResourceApplyCenteredRMSProp"         , kReadWrite, kVariable);
+  add("ResourceApplyFtrl"                    , kReadWrite, kVariable);
+  add("ResourceApplyFtrlV2"                  , kReadWrite, kVariable);
+  add("ResourceApplyGradientDescent"         , kReadWrite, kVariable);
+  add("ResourceApplyMomentum"                , kReadWrite, kVariable);
+  add("ResourceApplyPowerSign"               , kReadWrite, kVariable);
+  add("ResourceApplyProximalAdagrad"         , kReadWrite, kVariable);
+  add("ResourceApplyProximalGradientDescent" , kReadWrite, kVariable);
+  add("ResourceApplyRMSProp"                 , kReadWrite, kVariable);
+  add("ResourceGather"                       , kRead,      kVariable);
+  add("ResourceScatterAdd"                   , kReadWrite, kVariable);
+  add("ResourceScatterDiv"                   , kReadWrite, kVariable);
+  add("ResourceScatterMax"                   , kReadWrite, kVariable);
+  add("ResourceScatterMin"                   , kReadWrite, kVariable);
+  add("ResourceScatterMul"                   , kReadWrite, kVariable);
+  add("ResourceScatterNdAdd"                 , kReadWrite, kVariable);
+  add("ResourceScatterNdUpdate"              , kReadWrite, kVariable);
+  add("ResourceScatterSub"                   , kReadWrite, kVariable);
+  add("ResourceScatterUpdate"                , kReadWrite, kVariable);
+  add("ResourceStridedSliceAssign"           , kReadWrite, kVariable);
+  add("VarIsInitializedOp"                   , kRead,      kVariable);
+  add("VariableShape"                        , kRead,      kVariable);
+
+  add("StackV2"                              , kWrite,     kStack);
+  add("StackCloseV2"                         , kRead,      kStack);
+  add("StackPopV2"                           , kReadWrite, kStack);
+  add("StackPushV2"                          , kReadWrite, kStack);
+
+  add("TensorArrayV3"                        , kWrite,     kTensorArray);
+  add("TensorArrayConcatV3"                  , kRead,      kTensorArray);
+  add("TensorArrayGatherV3"                  , kRead,      kTensorArray);
+  add("TensorArrayScatterV3"                 , kWrite,     kTensorArray);
+  add("TensorArrayGradV3"                    , kRead,      kTensorArray);
+  add("TensorArrayCloseV3"                   , kRead,      kTensorArray);
+  add("TensorArrayReadV3"                    , kRead,      kTensorArray);
+  add("TensorArraySizeV3"                    , kRead,      kTensorArray);
+  add("TensorArraySplitV3"                   , kWrite,     kTensorArray);
+  add("TensorArrayWriteV3"                   , kWrite,     kTensorArray);
+  // clang-format on
+
+  return result;
+}
+
+static const gtl::FlatMap<StringPiece, XlaResourceOpInfo>&
+GetStaticResourceOpInfoMap() {
+  static gtl::FlatMap<StringPiece, XlaResourceOpInfo>* op_info_map =
+      CreateResourceOpInfoMap();
+  return *op_info_map;
+}
+
+const XlaResourceOpInfo* GetResourceOpInfoForOp(StringPiece op) {
+  const gtl::FlatMap<StringPiece, XlaResourceOpInfo>& op_infos =
+      GetStaticResourceOpInfoMap();
+  auto it = op_infos.find(op);
+  return it == op_infos.end() ? nullptr : &it->second;
+}
+
+namespace resource_op_table_internal {
+std::vector<StringPiece> GetKnownResourceOps() {
+  std::vector<StringPiece> result;
+  for (const auto& p : GetStaticResourceOpInfoMap()) {
+    result.push_back(p.first);
+  }
+  absl::c_sort(result);
+  return result;
+}
+}  // namespace resource_op_table_internal
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.h b/tensorflow/compiler/tf2xla/resource_operation_table.h
new file mode 100644
index 0000000000..7f627a64c6
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/resource_operation_table.h
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_
+#define TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/logging.h"
+
+// Exposes information about the resource operations supported by tf2xla in a
+// structured form.
+
+namespace tensorflow {
+enum class XlaResourceOpKind {
+  kRead,      // Only reads from resources.
+  kWrite,     // Only writes to resources.
+  kReadWrite  // Reads from and writes to resources.
+};
+
+enum class XlaResourceKind {
+  kVariable,    // Operates on resource variables.
+  kStack,       // Operates on stacks.
+  kTensorArray  // Operates on tensor arrays.
+};
+
+class XlaResourceOpInfo {
+ public:
+  explicit XlaResourceOpInfo(XlaResourceOpKind op_kind,
+                             XlaResourceKind resource_kind)
+      : op_kind_(op_kind), resource_kind_(resource_kind) {}
+
+  XlaResourceOpKind kind() const { return op_kind_; }
+  XlaResourceKind resource_kind() const { return resource_kind_; }
+
+  static StringPiece XlaResourceOpKindToString(XlaResourceOpKind op_kind);
+
+ private:
+  XlaResourceOpKind op_kind_;
+  XlaResourceKind resource_kind_;
+};
+
+// Returns a XlaResourceOpInfo describing `op` if it is a resource operation
+// supported by tf2xla, otherwise returns null (i.e. if this returns null then
+// `op` is either not a resource operation or is unsupported by XLA).
+const XlaResourceOpInfo* GetResourceOpInfoForOp(StringPiece op);
+
+namespace resource_op_table_internal {
+// NB! Implementation detail exposed for unit testing, do not use.
+//
+// Returns the set of resource operations known by this module.
+std::vector<StringPiece> GetKnownResourceOps();
+}  // namespace resource_op_table_internal
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_RESOURCE_OPERATION_TABLE_H_
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table_test.cc b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc
new file mode 100644
index 0000000000..0343f80de9
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/resource_operation_table_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/resource_operation_table.h"
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+bool IsResourceArgDef(const OpDef::ArgDef& arg_def) {
+  return arg_def.type() == DT_RESOURCE;
+}
+
+bool HasResourceInputOrOutput(const OpDef& op_def) {
+  return absl::c_any_of(op_def.input_arg(), IsResourceArgDef) ||
+         absl::c_any_of(op_def.output_arg(), IsResourceArgDef);
+}
+
+TEST(ResourceOperationTableTest, HaveAllResourceOps) {
+  gtl::FlatMap<string, bool> known_resource_ops;
+  for (StringPiece known_resource_op :
+       resource_op_table_internal::GetKnownResourceOps()) {
+    ASSERT_TRUE(
+        known_resource_ops.insert({string(known_resource_op), false}).second);
+  }
+
+  std::vector<string> xla_op_names = XlaOpRegistry::GetAllRegisteredOps();
+  for (const string& xla_op_name : xla_op_names) {
+    const OpDef* op_def;
+    TF_ASSERT_OK(OpRegistry::Global()->LookUpOpDef(xla_op_name, &op_def));
+    if (HasResourceInputOrOutput(*op_def)) {
+      EXPECT_EQ(known_resource_ops.count(xla_op_name), 1)
+          << "Unknown resource op " << xla_op_name;
+      known_resource_ops[xla_op_name] = true;
+    }
+  }
+
+  std::vector<string> unnecessary_resource_ops;
+  for (const auto& pair : known_resource_ops) {
+    if (!pair.second) {
+      unnecessary_resource_ops.push_back(pair.first);
+    }
+  }
+
+  EXPECT_TRUE(unnecessary_resource_ops.empty())
+      << "Stale resource ops:\n"
+      << absl::StrJoin(unnecessary_resource_ops, "\n");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc
index 5759c72af3..2d7eb8b915 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.cc
+++ b/tensorflow/compiler/tf2xla/sharding_util.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 
+#include "absl/strings/match.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -27,10 +27,10 @@ const char kShardingAttribute[] = "_XlaSharding";
 }  // namespace
 
 namespace {
-xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-GetShardingFromNodeDef(const NodeDef& node_def) {
+xla::StatusOr<absl::optional<xla::OpSharding>> GetShardingFromNodeDef(
+    const NodeDef& node_def) {
   if (!HasNodeAttr(node_def, kShardingAttribute)) {
-    return tensorflow::gtl::optional<xla::OpSharding>();
+    return absl::optional<xla::OpSharding>();
   }
   string value;
   xla::OpSharding sharding;
@@ -40,7 +40,7 @@ GetShardingFromNodeDef(const NodeDef& node_def) {
         "Experimental _XlaSharding attribute was not a valid encoded "
         "xla::OpSharding proto.");
   }
-  return tensorflow::gtl::optional<xla::OpSharding>(sharding);
+  return absl::optional<xla::OpSharding>(sharding);
 }
 
 Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
@@ -50,12 +50,11 @@ Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
 }
 }  // namespace
 
-xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(
+xla::StatusOr<absl::optional<xla::OpSharding>> ParseShardingFromDevice(
     const string& device_name, int num_cores_per_replica,
-    tensorflow::gtl::optional<xla::OpSharding> explicit_sharding) {
+    absl::optional<xla::OpSharding> explicit_sharding) {
   if (device_name.empty()) {
-    return tensorflow::gtl::optional<xla::OpSharding>();
+    return absl::optional<xla::OpSharding>();
   }
   DeviceNameUtils::ParsedName parsed_device;
   if (!DeviceNameUtils::ParseFullName(device_name, &parsed_device)) {
@@ -66,34 +65,34 @@ ParseShardingFromDevice(
   if (explicit_sharding.has_value()) {
     return explicit_sharding;
   } else if (!parsed_device.has_type || !parsed_device.has_id ||
-             !str_util::StrContains(parsed_device.type,
-                                    kDeviceSuffixReplicatedCore)) {
-    return tensorflow::gtl::optional<xla::OpSharding>();
+             !absl::StrContains(parsed_device.type,
+                                kDeviceSuffixReplicatedCore)) {
+    return absl::optional<xla::OpSharding>();
   } else {
     const int core = parsed_device.id;
     if (core < 0 || core >= num_cores_per_replica) {
       return CoreOutOfRangeError(core, num_cores_per_replica);
     }
-    return tensorflow::gtl::optional<xla::OpSharding>(
+    return absl::optional<xla::OpSharding>(
         xla::sharding_builder::AssignDevice(core));
   }
 }
 
-xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica) {
+xla::StatusOr<absl::optional<xla::OpSharding>> ParseShardingFromDevice(
+    const NodeDef& node_def, int num_cores_per_replica) {
   const string& device_name = node_def.device();
-  TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
+  TF_ASSIGN_OR_RETURN(absl::optional<xla::OpSharding> sharding,
                       GetShardingFromNodeDef(node_def));
   return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
 }
 
-xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const Node& node, int num_cores_per_replica) {
+xla::StatusOr<absl::optional<xla::OpSharding>> ParseShardingFromDevice(
+    const Node& node, int num_cores_per_replica) {
   string device_name = node.assigned_device_name();
   if (device_name.empty()) {
     device_name = node.requested_device();
   }
-  TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
+  TF_ASSIGN_OR_RETURN(absl::optional<xla::OpSharding> sharding,
                       GetShardingFromNodeDef(node.def()));
   return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
 }
diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h
index b1c817bdcc..ab67d4f154 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.h
+++ b/tensorflow/compiler/tf2xla/sharding_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_
-#define TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_
+#ifndef TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_
 
 #include <string>
 
@@ -33,19 +33,18 @@ namespace tensorflow {
 // - explicit_sharding if explicit_sharding.has_value()
 // - a non-value if there is no assigned core or
 // - a sharding set as per xla::sharding_builder::AssignDevice.
-xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const string& device_name, int num_cores_per_replica,
-                        tensorflow::gtl::optional<xla::OpSharding>
-                            explicit_sharding = tensorflow::gtl::nullopt);
+xla::StatusOr<absl::optional<xla::OpSharding>> ParseShardingFromDevice(
+    const string& device_name, int num_cores_per_replica,
+    absl::optional<xla::OpSharding> explicit_sharding = absl::nullopt);
 
-xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const Node& node, int num_cores_per_replica);
+xla::StatusOr<absl::optional<xla::OpSharding>> ParseShardingFromDevice(
+    const Node& node, int num_cores_per_replica);
 
-xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica);
+xla::StatusOr<absl::optional<xla::OpSharding>> ParseShardingFromDevice(
+    const NodeDef& node_def, int num_cores_per_replica);
 
 void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/sharding_util_test.cc b/tensorflow/compiler/tf2xla/sharding_util_test.cc
index bff5978237..dcb7e212b7 100644
--- a/tensorflow/compiler/tf2xla/sharding_util_test.cc
+++ b/tensorflow/compiler/tf2xla/sharding_util_test.cc
@@ -23,7 +23,7 @@ TEST(CoreUtilTest, ParseShardingFromDevice) {
   Graph graph(OpRegistry::Global());
 
   auto core_from_sharding =
-      [](tensorflow::gtl::optional<xla::OpSharding> sharding) -> int64 {
+      [](absl::optional<xla::OpSharding> sharding) -> int64 {
     if (sharding.has_value() &&
         sharding.value().type() ==
             xla::OpSharding::Type::OpSharding_Type_MAXIMAL) {
diff --git a/tensorflow/compiler/tf2xla/str_util.cc b/tensorflow/compiler/tf2xla/str_util.cc
deleted file mode 100644
index 2b0834fe7b..0000000000
--- a/tensorflow/compiler/tf2xla/str_util.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/str_util.h"
-
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace tensorflow {
-namespace str_util {
-
-static void ReplaceAll(string* text, StringPiece from, StringPiece to) {
-  size_t pos = 0;
-  while ((pos = text->find(from.data(), pos, from.size())) != string::npos) {
-    text->replace(pos, from.size(), to.data(), to.size());
-    pos += to.size();
-    if (from.empty()) {
-      pos++;  // Match at the beginning of the text and after every byte
-    }
-  }
-}
-
-void ReplaceAllPairs(string* text,
-                     const std::vector<std::pair<string, string>>& replace) {
-  for (const std::pair<string, string>& from_to : replace) {
-    ReplaceAll(text, from_to.first, from_to.second);
-  }
-}
-
-}  // namespace str_util
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/str_util.h b/tensorflow/compiler/tf2xla/str_util.h
deleted file mode 100644
index 51f25009d7..0000000000
--- a/tensorflow/compiler/tf2xla/str_util.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// String utilities that are esoteric enough that they don't belong in
-// third_party/tensorflow/core/lib/strings/str_util.h, but are still generally
-// useful under xla.
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_STR_UTIL_H_
-#define TENSORFLOW_COMPILER_TF2XLA_STR_UTIL_H_
-
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/core/lib/core/stringpiece.h"
-
-namespace tensorflow {
-namespace str_util {
-
-// Replace all non-overlapping occurrences of the given (from,to) pairs in-place
-// in text.  If from is empty, it matches at the beginning of the text and after
-// every byte.  Each (from,to) replacement pair is processed in the order it is
-// given.
-void ReplaceAllPairs(string* text,
-                     const std::vector<std::pair<string, string>>& replace);
-
-}  // namespace str_util
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_STR_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/str_util_test.cc b/tensorflow/compiler/tf2xla/str_util_test.cc
deleted file mode 100644
index 8817f6902a..0000000000
--- a/tensorflow/compiler/tf2xla/str_util_test.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/str_util.h"
-
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace str_util {
-
-class ReplaceAllPairsTest : public ::testing::Test {
- protected:
-  void ExpectReplaceAllPairs(
-      string text, const std::vector<std::pair<string, string>>& replace,
-      StringPiece want) {
-    ReplaceAllPairs(&text, replace);
-    EXPECT_EQ(text, want);
-  }
-};
-
-TEST_F(ReplaceAllPairsTest, Simple) {
-  ExpectReplaceAllPairs("", {}, "");
-  ExpectReplaceAllPairs("", {{"", ""}}, "");
-  ExpectReplaceAllPairs("", {{"", "X"}}, "X");
-  ExpectReplaceAllPairs("", {{"", "XYZ"}}, "XYZ");
-  ExpectReplaceAllPairs("", {{"", "XYZ"}, {"", "_"}}, "_X_Y_Z_");
-  ExpectReplaceAllPairs("", {{"", "XYZ"}, {"", "_"}, {"_Y_", "a"}}, "_XaZ_");
-  ExpectReplaceAllPairs("banana", {}, "banana");
-  ExpectReplaceAllPairs("banana", {{"", ""}}, "banana");
-  ExpectReplaceAllPairs("banana", {{"", "_"}}, "_b_a_n_a_n_a_");
-  ExpectReplaceAllPairs("banana", {{"", "__"}}, "__b__a__n__a__n__a__");
-  ExpectReplaceAllPairs("banana", {{"a", "a"}}, "banana");
-  ExpectReplaceAllPairs("banana", {{"a", ""}}, "bnn");
-  ExpectReplaceAllPairs("banana", {{"a", "X"}}, "bXnXnX");
-  ExpectReplaceAllPairs("banana", {{"a", "XX"}}, "bXXnXXnXX");
-  ExpectReplaceAllPairs("banana", {{"a", "XX"}, {"XnX", "z"}}, "bXzzX");
-  ExpectReplaceAllPairs("a{{foo}}b{{bar}}c{{foo}}",
-                        {{"{{foo}}", "0"}, {"{{bar}}", "123456789"}},
-                        "a0b123456789c0");
-}
-
-}  // namespace str_util
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index 48568c825b..f34af2d67d 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
@@ -40,7 +41,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -197,8 +197,8 @@ Status RewriteAndPruneGraph(
   if (!missing_feeds.empty() || !missing_fetches.empty()) {
     return errors::Aborted(
         "Post graph-pruning",
-        ", missing feeds: ", str_util::Join(missing_feeds, ", "),
-        ", missing fetches: ", str_util::Join(missing_fetches, ", "));
+        ", missing feeds: ", absl::StrJoin(missing_feeds, ", "),
+        ", missing fetches: ", absl::StrJoin(missing_fetches, ", "));
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
index 7aca889a26..567d212b5e 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
@@ -20,11 +20,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
@@ -54,10 +54,10 @@ void PrintSupportedOps(const string& device, const string& regen_run) {
       }
       std::sort(types.begin(), types.end());
       constraints.push_back("`" + constraint.name() + "={" +
-                            str_util::Join(types, ",") + "}`");
+                            absl::StrJoin(types, ",") + "}`");
     }
     std::cout << "`" << kdef->op() << "` | "
-              << str_util::Join(constraints, "<br>") << std::endl;
+              << absl::StrJoin(constraints, "<br>") << std::endl;
   }
 
   std::cout << "\nTo regenerate this table, run:\n\n```shell\n"
@@ -76,7 +76,7 @@ void SupportedOpsMain(int argc, char** argv, const char* regen_run) {
       {"device", &device,
        "Name of the compilation device for which to print supported ops, "
        "one of: " +
-           str_util::Join(device_names, ",")},
+           absl::StrJoin(device_names, ",")},
   };
   string usage = Flags::Usage(argv[0], flag_list);
   bool parsed_flags_ok = Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 0e07485d18..ebdf2fd741 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <set>
 #include <unordered_map>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -268,7 +268,7 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) {
     if (edge->IsControlEdge()) continue;
     const Node* possible_match = out_edges ? edge->dst() : edge->src();
     TF_ASSIGN_OR_RETURN(
-        tensorflow::gtl::optional<xla::OpSharding> sharding,
+        absl::optional<xla::OpSharding> sharding,
         ParseShardingFromDevice(
             *possible_match,
             /*num_cores_per_replica=*/std::numeric_limits<int32>::max()));
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index ae51446204..2b1f724dc7 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 
+#include "absl/strings/match.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/data_flow_ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
@@ -25,16 +26,15 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace {
 
-void ExpectErrorContains(const Status& status, StringPiece str) {
+void ExpectErrorContains(const Status& status, absl::string_view str) {
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), str))
       << "expected error: " << status.error_message() << " to contain: " << str;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index e89f473328..d98237bd5c 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -103,7 +103,7 @@ void XlaCompilationDevice::Compute(OpKernel* op_kernel,
   auto sharding_parse_result = ParseShardingFromDevice(
       op_kernel->def(), std::numeric_limits<int>::max());
   OP_REQUIRES_OK(context, sharding_parse_result.status());
-  tensorflow::gtl::optional<xla::OpSharding> op_sharding =
+  absl::optional<xla::OpSharding> op_sharding =
       sharding_parse_result.ValueOrDie();
 
   // If no sharding metadata is found, XLA is free to use whatever device it
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 226c89bcf1..eabfc6b6e2 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
 #include "tensorflow/compiler/tf2xla/graph_compiler.h"
@@ -310,7 +311,7 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
   // unique_ptr so we can capture the cleanup status in the end.
   xla_context->Ref();
   Status status;
-  auto step_container = xla::MakeUnique<ScopedStepContainer>(
+  auto step_container = absl::make_unique<ScopedStepContainer>(
       step_id, [&status, device](const string& name) {
         status = device->resource_manager()->Cleanup(name);
       });
@@ -413,7 +414,7 @@ Status BuildComputation(
 
       // Request that the value be returned on a specific core.
       xla::XlaScopedShardingAssignment assign_sharding(
-          builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
+          builder, core == -1 ? absl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
 
       xla::XlaOp handle;
@@ -464,8 +465,6 @@ Status XlaCompiler::BuildArguments(
   // XLA computation as runtime parameters.
   input_mapping->clear();
   input_mapping->reserve(args.size());
-  std::vector<int> resources;
-  resources.reserve(args.size());
 
   // Fills in constant arguments, and computes non-constant argument order.
   for (std::vector<XlaCompiler::Argument>::size_type i = 0; i < args.size();
@@ -484,8 +483,9 @@ Status XlaCompiler::BuildArguments(
             /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource));
         arg_expression.set_resource(resource);
         if (arg.initialized) {
-          resources.push_back(i);
+          input_mapping->push_back(i);
         }
+
         break;
       case XlaCompiler::Argument::kParameter: {
         input_mapping->push_back(i);
@@ -499,10 +499,6 @@ Status XlaCompiler::BuildArguments(
     }
   }
 
-  // Append parameters containing variable values after the other runtime
-  // parameters.
-  input_mapping->insert(input_mapping->end(), resources.begin(),
-                        resources.end());
   if (input_mapping->empty()) {
     return Status::OK();
   }
@@ -570,7 +566,7 @@ Status XlaCompiler::BuildArguments(
     for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
       const int core = (*arg_cores)[input_mapping->at(i)];
       xla::XlaScopedShardingAssignment assign_sharding(
-          builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
+          builder, core == -1 ? absl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
       arg_handles[i] = xla::GetTupleElement(tuple, i);
     }
@@ -578,7 +574,7 @@ Status XlaCompiler::BuildArguments(
     for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
       const int core = (*arg_cores)[input_mapping->at(i)];
       xla::XlaScopedShardingAssignment assign_sharding(
-          builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
+          builder, core == -1 ? absl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
       arg_handles[i] = xla::Parameter(builder, i, (*input_shapes)[i],
                                       strings::StrCat("arg", i));
@@ -791,14 +787,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   VLOG(2) << "XLA output shape: "
           << xla::ShapeUtil::HumanString(result->xla_output_shape);
 
-  // Copy the host transfer metadata to the result.
-  for (const auto& send : host_compute_sends_) {
-    *result->host_compute_metadata.add_device_to_host() = send.second;
-  }
-  for (const auto& recv : host_compute_recvs_) {
-    *result->host_compute_metadata.add_host_to_device() = recv.second;
-  }
-
   // Tensorflow expects a major-to-minor order of results.
   xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape);
 
@@ -816,6 +804,30 @@ Status XlaCompiler::GetChannelHandle(const string& key,
   return Status::OK();
 }
 
+Status XlaCompiler::GetHostToDeviceChannelHandle(const string& key,
+                                                 xla::ChannelHandle* channel) {
+  auto result = channels_.emplace(key, xla::ChannelHandle());
+  if (result.second) {
+    TF_ASSIGN_OR_RETURN(result.first->second,
+                        client()->CreateHostToDeviceChannelHandle());
+  }
+  *channel = result.first->second;
+  VLOG(1) << "Host to device channel: " << key << " " << channel->DebugString();
+  return Status::OK();
+}
+
+Status XlaCompiler::GetDeviceToHostChannelHandle(const string& key,
+                                                 xla::ChannelHandle* channel) {
+  auto result = channels_.emplace(key, xla::ChannelHandle());
+  if (result.second) {
+    TF_ASSIGN_OR_RETURN(result.first->second,
+                        client()->CreateDeviceToHostChannelHandle());
+  }
+  *channel = result.first->second;
+  VLOG(1) << "Device to host channel: " << key << " " << channel->DebugString();
+  return Status::OK();
+}
+
 namespace {
 
 void SetTransfer(const string& key, gtl::ArraySlice<DataType> types,
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 25332c8d8e..da1ae02f32 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -212,9 +212,9 @@ class XlaCompiler {
 
   struct CompilationResult {
     // Vector that maps from the parameters of the XLA computation to their
-    // original argument positions. To handle compile-time constant inputs and
-    // resources, the parameters to the XLA computation may be a subset of the
-    // original arguments, and are not necessarily in the same order.)
+    // original argument positions. To handle compile-time constant inputs, the
+    // parameters to the XLA computation may be a subset of the original
+    // arguments. The relative ordering of parameters are maintained.
     std::vector<int> input_mapping;
 
     // Input shapes of the computation. If we are flattening inputs, these are
@@ -332,6 +332,16 @@ class XlaCompiler {
   // same XlaCompiler.
   Status GetChannelHandle(const string& key, xla::ChannelHandle* channel);
 
+  // Retrieves the host-to-device channel handle associated with `key`.
+  // Allocates a new channel handle if none exists.
+  Status GetHostToDeviceChannelHandle(const string& key,
+                                      xla::ChannelHandle* channel);
+
+  // Retrieves the device-to-host channel handle associated with `key`.
+  // Allocates a new channel handle if none exists.
+  Status GetDeviceToHostChannelHandle(const string& key,
+                                      xla::ChannelHandle* channel);
+
   // Sets the shapes and types for the device to host transfer associated with
   // 'key'.
   Status SetDeviceToHostMetadata(const string& key,
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index be00ed8813..740f6dc25c 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "absl/strings/match.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/data_flow_ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
@@ -38,7 +39,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -280,6 +280,54 @@ TEST_F(XlaCompilerTest, OutOfOrderGraph) {
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(*param0_literal, *actual_literal));
 }
 
+// Tests that the compiler doesn't reorder the parameters.
+TEST_F(XlaCompilerTest, MixedOrderArguments) {
+  for (bool swap_order : {false, true}) {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto var =
+        ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, swap_order ? 0 : 1);
+    auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, swap_order ? 1 : 0);
+    // Adds an identity op around the resource to make sure identity ops
+    // propagate resources correctly.
+    auto identity = ops::Identity(scope.WithOpName("VIdentity"), var);
+    auto write = ops::AssignAddVariableOp(scope, identity, a);
+    auto read = ops::ReadVariableOp(
+        scope.WithControlDependencies(std::vector<Operation>{write}), var,
+        DT_INT32);
+    auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+    auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
+    std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+    TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+    // Builds a description of the arguments.
+    std::vector<XlaCompiler::Argument> args(2);
+    args[0].kind = XlaCompiler::Argument::kParameter;
+    args[0].type = DT_INT32;
+    args[0].shape = TensorShape({2});
+    args[1].kind = XlaCompiler::Argument::kResource;
+    args[1].resource_kind = XlaResource::kVariable;
+    args[1].initialized = true;
+    args[1].type = DT_INT32;
+    args[1].shape = TensorShape({2});
+
+    if (swap_order) {
+      // Even after swapping arguments, the compiler should maintain the new
+      // ordering of parameters.
+      std::swap(args[0], args[1]);
+    }
+    // Compiles the graph.
+    XlaCompiler compiler(DefaultOptions());
+
+    XlaCompiler::CompileOptions compile_options;
+    compile_options.always_return_tuple = false;
+    XlaCompiler::CompilationResult result;
+    TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph),
+                                       args, &result));
+
+    EXPECT_THAT(result.input_mapping, ::testing::ElementsAre(0, 1));
+  }
+}
+
 TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
   // Builds a graph that adds reshapes a tensor, but with the shape not
   // statically known.
@@ -309,10 +357,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
                             std::move(graph), args, &result);
   EXPECT_FALSE(status.ok());
   EXPECT_TRUE(
-      str_util::StrContains(status.error_message(), "depends on a parameter"))
+      absl::StrContains(status.error_message(), "depends on a parameter"))
       << status.error_message();
   EXPECT_TRUE(
-      str_util::StrContains(status.error_message(), "[[{{node C}} = Reshape"))
+      absl::StrContains(status.error_message(), "[[{{node C}} = Reshape"))
       << status.error_message();
 }
 
@@ -727,8 +775,7 @@ TEST_F(XlaCompilerTest, UndefinedFunctionFails) {
       compiler.CompileFunction(XlaCompiler::CompileOptions(), name_attr,
                                /*args=*/{}, &result);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
-                                    "is not defined."))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), "is not defined."))
       << status.error_message();
 }
 
@@ -807,12 +854,10 @@ TEST_F(XlaCompilerTest, LocalFunctionWithWrongArgumentsFail) {
 
   ASSERT_FALSE(status.ok());
   // Flib lookup failure.
-  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
-                                    "is not defined."))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), "is not defined."))
       << status.error_message();
   // Local flib lookup failure.
-  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
-                                    "Attr T is not found"))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), "Attr T is not found"))
       << status.error_message();
 }
 
@@ -821,7 +866,10 @@ TEST_F(XlaCompilerTest, Variables) {
   Scope scope = Scope::NewRootScope().ExitOnError();
   auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
   auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
-  auto write = ops::AssignAddVariableOp(scope, var, a);
+  // Adds an identity op around the resource to make sure identity ops propagate
+  // resources correctly.
+  auto identity = ops::Identity(scope.WithOpName("VIdentity"), var);
+  auto write = ops::AssignAddVariableOp(scope, identity, a);
   auto read = ops::ReadVariableOp(
       scope.WithControlDependencies(std::vector<Operation>{write}), var,
       DT_INT32);
@@ -1075,9 +1123,9 @@ TEST_F(XlaCompilerTest, FunctionWithInvalidOp) {
   status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "fill",
                                  std::move(graph), args, &result);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(str_util::StrContains(status.error_message(), "InvalidOp"))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), "InvalidOp"))
       << status.error_message();
-  EXPECT_TRUE(str_util::StrContains(status.error_message(), "{{node fill_fn}}"))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), "{{node fill_fn}}"))
       << status.error_message();
 }
 
@@ -1100,10 +1148,10 @@ TEST_F(XlaCompilerTest, NodeWithInvalidDataType) {
   status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "invalid_type",
                                  std::move(graph), args, &result);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(str_util::StrContains(status.error_message(),
-                                    "is not in the list of allowed values"))
+  EXPECT_TRUE(absl::StrContains(status.error_message(),
+                                "is not in the list of allowed values"))
       << status.error_message();
-  EXPECT_TRUE(str_util::StrContains(status.error_message(), "{{node Shape}}"))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), "{{node Shape}}"))
       << status.error_message();
 }
 
@@ -1127,9 +1175,9 @@ TEST_F(XlaCompilerTest, SingleOpWithoutInputs) {
                                    std::move(graph_copy), args, &result);
     ASSERT_FALSE(status.ok());
     EXPECT_TRUE(
-        str_util::StrContains(status.error_message(),
-                              "The following nodes are unreachable "
-                              "from the source in the graph: {{node NoOp}}"))
+        absl::StrContains(status.error_message(),
+                          "The following nodes are unreachable "
+                          "from the source in the graph: {{node NoOp}}"))
         << status.error_message();
   }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 82028c8b9c..9e8f5f2a1a 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -99,6 +99,25 @@ Status XlaOpKernelContext::ConstantInput(int index,
       index, context_->input(index).shape().dim_sizes(), constant_literal);
 }
 
+static xla::StatusOr<int> InputIndex(XlaOpKernelContext* context,
+                                     StringPiece name) {
+  int start, stop;
+  TF_RETURN_IF_ERROR(context->op_kernel().InputRange(name, &start, &stop));
+  if (stop != start + 1) {
+    return errors::InvalidArgument("OpKernel used list-valued input name '",
+                                   name,
+                                   "' when single-valued input was "
+                                   "expected");
+  }
+  return start;
+}
+
+Status XlaOpKernelContext::ConstantInput(StringPiece name,
+                                         xla::Literal* constant_literal) {
+  TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name));
+  return ConstantInput(index, constant_literal);
+}
+
 Status XlaOpKernelContext::ConstantInputReshaped(
     int index, gtl::ArraySlice<int64> new_dims,
     xla::Literal* constant_literal) {
@@ -246,6 +265,12 @@ Status XlaOpKernelContext::ConstantInputAsIntScalar(int index, int64* out) {
   return LiteralToInt64Scalar(literal, out);
 }
 
+Status XlaOpKernelContext::ConstantInputAsIntScalar(StringPiece name,
+                                                    int64* out) {
+  TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name));
+  return ConstantInputAsIntScalar(index, out);
+}
+
 Status XlaOpKernelContext::ConstantInputAsFloatScalar(int index, double* out) {
   xla::Literal literal;
   TF_RETURN_IF_ERROR(ConstantInput(index, &literal));
@@ -280,6 +305,20 @@ Status XlaOpKernelContext::ConstantInputAsIntVector(int index,
   return LiteralToInt64Vector(literal, out);
 }
 
+Status XlaOpKernelContext::ConstantInputAsIntVector(StringPiece name,
+                                                    std::vector<int64>* out) {
+  TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name));
+  return ConstantInputAsIntVector(index, out);
+}
+
+Status XlaOpKernelContext::ConstantInputReshapedToIntVector(
+    int index, std::vector<int64>* out) {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInputReshaped(
+      index, {InputShape(index).num_elements()}, &literal));
+  return LiteralToInt64Vector(literal, out);
+}
+
 Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index,
                                                        xla::Literal* out) {
   xla::Literal literal;
@@ -305,6 +344,12 @@ Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index,
   }
 }
 
+Status XlaOpKernelContext::ConstantInputAsInt64Literal(StringPiece name,
+                                                       xla::Literal* out) {
+  TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name));
+  return ConstantInputAsInt64Literal(index, out);
+}
+
 // TODO(phawkins): validate that the dimensions form a valid shape, fail
 // gracefully if they do not.
 Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape) {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index ac9dfe3369..3e26ba4f01 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -106,6 +106,7 @@ class XlaOpKernelContext {
   // expression cannot be evaluated, e.g., because it depends on unbound
   // parameters, returns a non-OK status.
   Status ConstantInput(int index, xla::Literal* constant_literal);
+  Status ConstantInput(StringPiece name, xla::Literal* constant_literal);
 
   // Evaluates input `index`, reshapes it to `new_shape` if new_shape !=
   // InputShape(index), and stores it in `*constant_literal`. If the input
@@ -117,15 +118,22 @@ class XlaOpKernelContext {
 
   // Converts a constant scalar int32 or int64 tensor into an int64.
   Status ConstantInputAsIntScalar(int index, int64* out);
+  Status ConstantInputAsIntScalar(StringPiece name, int64* out);
 
   // Converts a constant scalar float32 or float64 tensor into a float64.
   Status ConstantInputAsFloatScalar(int index, double* out);
 
   // Converts a constant 1D int32 or int64 tensor into a vector of int64s.
   Status ConstantInputAsIntVector(int index, std::vector<int64>* out);
+  Status ConstantInputAsIntVector(StringPiece name, std::vector<int64>* out);
+
+  // Reshapes and converts a constant int32 or int64 tensor into a vector of
+  // int64s.
+  Status ConstantInputReshapedToIntVector(int index, std::vector<int64>* out);
 
   // Converts a constant int32 or int64 Tensor into an xla int64 Literal.
   Status ConstantInputAsInt64Literal(int index, xla::Literal* out);
+  Status ConstantInputAsInt64Literal(StringPiece name, xla::Literal* out);
 
   // Converts a constant 1D int32 or int64 tensor into a TensorShape.
   Status ConstantInputAsShape(int index, TensorShape* shape);
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 46785bc1f0..e25c7e8c9e 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -325,6 +325,17 @@ std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
   return kernels;
 }
 
+/*static*/ std::vector<string> XlaOpRegistry::GetAllRegisteredOps() {
+  std::vector<string> ops;
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+  for (const auto& pair : registry.ops_) {
+    ops.push_back(pair.first);
+  }
+  std::sort(ops.begin(), ops.end());
+  return ops;
+}
+
 /* static */ const std::unordered_set<string>*
 XlaOpRegistry::CompileTimeConstantInputs(const string& op) {
   XlaOpRegistry& registry = Instance();
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index fc14834ca6..6ce0e2580b 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -128,6 +128,9 @@ class XlaOpRegistry {
       const string& compilation_device_name,
       bool include_compilation_only_kernels);
 
+  // Returns all operations for which there are XLA kernels on any device.
+  static std::vector<string> GetAllRegisteredOps();
+
   // Returns the set of compile-time constant inputs to 'op'. Returns nullptr
   // if the op is not registered.
   static const std::unordered_set<string>* CompileTimeConstantInputs(
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index fdf13bb18c..26bd1ac4f7 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -113,6 +113,7 @@ cc_library(
         ":statusor",
         ":types",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -161,7 +162,6 @@ cc_library(
         "iterator_util.h",
         "map_util.h",
         "overflow_util.h",
-        "ptr_util.h",
         "util.h",
     ],
     visibility = ["//visibility:public"],
@@ -172,7 +172,9 @@ cc_library(
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ptr_util",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -210,6 +212,7 @@ tf_cc_test(
         ":test",
         ":util",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -236,10 +239,12 @@ cc_library(
         ":types",
         ":util",
         ":xla_data_proto",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -256,6 +261,7 @@ tf_cc_test(
         ":xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -297,6 +303,8 @@ cc_library(
         ":util",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -315,6 +323,8 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -335,6 +345,8 @@ cc_library(
         ":util",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -353,6 +365,7 @@ cc_library(
         ":literal_util",
         ":util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -364,6 +377,7 @@ cc_library(
     deps = [
         ":util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -373,8 +387,8 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":types",
-        "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -385,6 +399,7 @@ cc_library(
         ":status",
         ":types",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -405,8 +420,9 @@ cc_library(
     deps = [
         ":array",
         ":types",
-        ":util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -451,6 +467,7 @@ cc_library(
         ":array2d",
         ":types",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -489,6 +506,7 @@ cc_library(
         ":util",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -503,6 +521,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -521,6 +540,8 @@ cc_library(
         ":xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -551,6 +572,7 @@ cc_library(
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -576,10 +598,11 @@ cc_library(
     deps = [
         ":shape_util",
         ":status_macros",
-        ":util",
         ":xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -593,6 +616,7 @@ tf_cc_test(
         ":xla_data_proto",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -619,6 +643,7 @@ cc_library(
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -642,6 +667,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:shape_inference",
         "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -660,6 +686,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -672,6 +699,7 @@ cc_library(
         ":shape_util",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:inlined_vector",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index 2d5d078aa7..c8e483712e 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -27,12 +27,12 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -507,9 +507,7 @@ class Array {
         }
       }
 
-      pieces.push_back(
-          tensorflow::strings::AlphaNum(values_[calculate_index(index)])
-              .data());
+      pieces.push_back(absl::StrCat(values_[calculate_index(index)]));
 
       // Emit comma if it isn't the last element
       if (index.back() != sizes_.back() - 1) {
@@ -527,7 +525,7 @@ class Array {
         }
       }
     } while (next_index(&index));
-    return tensorflow::str_util::Join(pieces, "");
+    return absl::StrJoin(pieces, "");
   }
 
  private:
diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h
index a17e81f448..782c966b4c 100644
--- a/tensorflow/compiler/xla/array2d.h
+++ b/tensorflow/compiler/xla/array2d.h
@@ -24,12 +24,11 @@ limitations under the License.
 #include <random>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/bits.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -101,7 +100,7 @@ class Array2D : public Array<T> {
 template <typename NativeT = float>
 std::unique_ptr<Array2D<NativeT>> MakeLinspaceArray2D(double from, double to,
                                                       int64 n1, int64 n2) {
-  auto array = MakeUnique<Array2D<NativeT>>(n1, n2);
+  auto array = absl::make_unique<Array2D<NativeT>>(n1, n2);
   int64 count = n1 * n2;
   NativeT step =
       static_cast<NativeT>((count > 1) ? (to - from) / (count - 1) : 0);
diff --git a/tensorflow/compiler/xla/array4d.h b/tensorflow/compiler/xla/array4d.h
index a75fffc605..14e7bf1814 100644
--- a/tensorflow/compiler/xla/array4d.h
+++ b/tensorflow/compiler/xla/array4d.h
@@ -26,12 +26,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index ad3fcee05b..9ad8ee2014 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -71,12 +71,13 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -90,6 +91,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -104,7 +107,6 @@ cc_library(
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
@@ -117,6 +119,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
         "@llvm//:support",
     ],
 )
@@ -130,11 +133,11 @@ cc_library(
         ":xla_computation",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
         "@llvm//:support",
     ],
 )
@@ -159,6 +162,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -186,6 +190,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_proto",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -211,6 +216,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:shape_inference",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index d0ce5e8a6a..1fdf8f6260 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -18,15 +18,15 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -89,7 +89,7 @@ StatusOr<std::unique_ptr<GlobalData>> Client::TransferToServer(
         "TransferToServer request");
   }
 
-  return MakeUnique<GlobalData>(stub_, response.data());
+  return absl::make_unique<GlobalData>(stub_, response.data());
 }
 
 Status Client::TransferToInfeed(const LiteralSlice& literal, int64 replica_id,
@@ -248,7 +248,7 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
     }
   }
 
-  return MakeUnique<GlobalData>(stub_, response.output());
+  return absl::make_unique<GlobalData>(stub_, response.output());
 }
 
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
@@ -278,7 +278,7 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
   std::vector<std::unique_ptr<GlobalData>> outputs;
   for (size_t i = 0; i < computations.size(); ++i) {
     outputs.push_back(
-        MakeUnique<GlobalData>(stub_, response.responses(i).output()));
+        absl::make_unique<GlobalData>(stub_, response.responses(i).output()));
     if (computations[i].execution_profile != nullptr) {
       *computations[i].execution_profile = response.responses(i).profile();
     }
@@ -340,7 +340,7 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::DeconstructTuple(
 
   std::vector<std::unique_ptr<GlobalData>> handles;
   for (auto& handle : response.element_handles()) {
-    handles.push_back(MakeUnique<GlobalData>(stub_, handle));
+    handles.push_back(absl::make_unique<GlobalData>(stub_, handle));
   }
   return std::move(handles);
 }
@@ -369,7 +369,7 @@ StatusOr<ComputationStats> Client::GetComputationStats(
 StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
     const XlaComputation& computation) {
   TF_ASSIGN_OR_RETURN(const auto& result, computation.GetProgramShape());
-  return MakeUnique<ProgramShape>(result);
+  return absl::make_unique<ProgramShape>(result);
 }
 
 StatusOr<Shape> Client::GetShape(const GlobalData& data) {
@@ -400,7 +400,7 @@ StatusOr<string> Client::ExecutionStatsAsString(
     int64 nanoseconds = profile.compute_time_ns();
     int64 cycle_count = profile.compute_cycle_count();
     double gflops = total_flops / nanoseconds;
-    return tensorflow::strings::StrCat(
+    return absl::StrCat(
         "[Execution Statistics] flop count: ", computation_stats.flop_count(),
         ", transcendental count: ", computation_stats.transcendental_count(),
         ", compute execution time: ", nanoseconds, " nsec",
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index 803a9e4009..27b7fa7b29 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client_library.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -94,10 +95,10 @@ ClientLibrary::~ClientLibrary() = default;
   service_options.set_intra_op_parallelism_threads(
       options.intra_op_parallelism_threads());
 
-  auto instance = MakeUnique<LocalInstance>();
+  auto instance = absl::make_unique<LocalInstance>();
   TF_ASSIGN_OR_RETURN(instance->service,
                       LocalService::NewService(service_options));
-  instance->client = MakeUnique<LocalClient>(instance->service.get());
+  instance->client = absl::make_unique<LocalClient>(instance->service.get());
   LocalClient* cl = instance->client.get();
 
   client_library.local_instances_.insert(
@@ -134,10 +135,11 @@ ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) {
     return it->second->client.get();
   }
 
-  auto instance = MakeUnique<CompileOnlyInstance>();
+  auto instance = absl::make_unique<CompileOnlyInstance>();
   TF_ASSIGN_OR_RETURN(instance->service,
                       CompileOnlyService::NewService(platform));
-  instance->client = MakeUnique<CompileOnlyClient>(instance->service.get());
+  instance->client =
+      absl::make_unique<CompileOnlyClient>(instance->service.get());
   CompileOnlyClient* cl = instance->client.get();
 
   client_library.compile_only_instances_.insert(
diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc
index 5c9abad4c3..040344c9a6 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.cc
+++ b/tensorflow/compiler/xla/client/compile_only_client.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/compile_only_client.h"
 
+#include "absl/memory/memory.h"
 #include "llvm/ADT/Triple.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
 namespace xla {
@@ -41,7 +41,7 @@ CompileOnlyClient::CompileAheadOfTime(
                                                metadata);
 }
 
-int64 CompileOnlyClient::PointerSizeForTriple(tensorflow::StringPiece triple) {
+int64 CompileOnlyClient::PointerSizeForTriple(absl::string_view triple) {
   llvm::Triple llvm_triple(
       llvm::Triple::normalize(llvm::StringRef(triple.data(), triple.size())));
   if (llvm_triple.isArch64Bit()) {
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
index a551edeab0..d0c83cbfcc 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.h
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -57,7 +57,7 @@ class CompileOnlyClient : public Client {
       std::unique_ptr<AotCompilationMetadata>* metadata = nullptr);
 
   // Returns the size of a pointer in bytes for a given triple.
-  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
+  static int64 PointerSizeForTriple(absl::string_view triple);
 
  private:
   CompileOnlyService* compiler_service_;
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 7dee41f6a0..5a73408db5 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -71,41 +71,41 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph(
   return *this;
 }
 
-const tensorflow::gtl::optional<string>&
-ExecutableBuildOptions::generate_hlo_graph() const {
+const absl::optional<string>& ExecutableBuildOptions::generate_hlo_graph()
+    const {
   return generate_hlo_graph_;
 }
 
 ExecutableBuildOptions& ExecutableBuildOptions::set_dump_optimized_hlo_proto_to(
-    tensorflow::StringPiece dirpath) {
-  dump_optimized_hlo_proto_to_ = dirpath.ToString();
+    absl::string_view dirpath) {
+  dump_optimized_hlo_proto_to_ = string(dirpath);
   return *this;
 }
 
-const tensorflow::gtl::optional<string>&
+const absl::optional<string>&
 ExecutableBuildOptions::dump_optimized_hlo_proto_to() const {
   return dump_optimized_hlo_proto_to_;
 }
 
 ExecutableBuildOptions&
 ExecutableBuildOptions::set_dump_unoptimized_hlo_proto_to(
-    tensorflow::StringPiece dirpath) {
-  dump_unoptimized_hlo_proto_to_ = dirpath.ToString();
+    absl::string_view dirpath) {
+  dump_unoptimized_hlo_proto_to_ = string(dirpath);
   return *this;
 }
 
-const tensorflow::gtl::optional<string>&
+const absl::optional<string>&
 ExecutableBuildOptions::dump_unoptimized_hlo_proto_to() const {
   return dump_unoptimized_hlo_proto_to_;
 }
 
 ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to(
-    tensorflow::StringPiece dirpath) {
-  dump_per_pass_hlo_proto_to_ = dirpath.ToString();
+    absl::string_view dirpath) {
+  dump_per_pass_hlo_proto_to_ = string(dirpath);
   return *this;
 }
 
-const tensorflow::gtl::optional<string>&
+const absl::optional<string>&
 ExecutableBuildOptions::dump_per_pass_hlo_proto_to() const {
   return dump_per_pass_hlo_proto_to_;
 }
@@ -115,7 +115,7 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_hlo_profile(bool enabled) {
   return *this;
 }
 
-tensorflow::gtl::optional<bool> ExecutableBuildOptions::hlo_profile() const {
+absl::optional<bool> ExecutableBuildOptions::hlo_profile() const {
   return hlo_profile_;
 }
 
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 9dc9be4423..888d2f28eb 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -16,11 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
 
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
 
@@ -57,34 +57,33 @@ class ExecutableBuildOptions {
 
   // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions).
   ExecutableBuildOptions& set_generate_hlo_graph(string regex);
-  const tensorflow::gtl::optional<string>& generate_hlo_graph() const;
+  const absl::optional<string>& generate_hlo_graph() const;
 
   // If set, specifies a dirpath to dump the end-of-optimization-pipeline HLO
   // protobuf to (as in DebugOptions).
   ExecutableBuildOptions& set_dump_optimized_hlo_proto_to(
-      tensorflow::StringPiece dirpath);
-  const tensorflow::gtl::optional<string>& dump_optimized_hlo_proto_to() const;
+      absl::string_view dirpath);
+  const absl::optional<string>& dump_optimized_hlo_proto_to() const;
 
   // If set, specifies a dirpath to dump the start-of-optimization-pipeline HLO
   // protobuf to (as in DebugOptions).
   ExecutableBuildOptions& set_dump_unoptimized_hlo_proto_to(
-      tensorflow::StringPiece dirpath);
-  const tensorflow::gtl::optional<string>& dump_unoptimized_hlo_proto_to()
-      const;
+      absl::string_view dirpath);
+  const absl::optional<string>& dump_unoptimized_hlo_proto_to() const;
 
   // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs
   // to (as in DebugOptions).
   ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to(
-      tensorflow::StringPiece dirpath);
-  const tensorflow::gtl::optional<string>& dump_per_pass_hlo_proto_to() const;
+      absl::string_view dirpath);
+  const absl::optional<string>& dump_per_pass_hlo_proto_to() const;
 
   // If true, specifies that we should record an HLO profile during execution
   // and log it after execution (as in DebugOptions). If nullopt the default is
   // used.
   ExecutableBuildOptions& set_hlo_profile(bool enabled);
-  tensorflow::gtl::optional<bool> hlo_profile() const;
+  absl::optional<bool> hlo_profile() const;
 
-  void add_disabled_hlo_pass(tensorflow::StringPiece pass_name) {
+  void add_disabled_hlo_pass(absl::string_view pass_name) {
     disabled_hlo_passes_.push_back(std::string(pass_name));
   }
   const tensorflow::gtl::ArraySlice<std::string> disabled_hlo_passes() const {
@@ -96,14 +95,14 @@ class ExecutableBuildOptions {
   string ToString() const;
 
  private:
-  tensorflow::gtl::optional<bool> hlo_profile_;
+  absl::optional<bool> hlo_profile_;
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
-  tensorflow::gtl::optional<string> generate_hlo_graph_;
-  tensorflow::gtl::optional<string> dump_optimized_hlo_proto_to_;
-  tensorflow::gtl::optional<string> dump_unoptimized_hlo_proto_to_;
-  tensorflow::gtl::optional<string> dump_per_pass_hlo_proto_to_;
+  absl::optional<string> generate_hlo_graph_;
+  absl::optional<string> dump_optimized_hlo_proto_to_;
+  absl::optional<string> dump_unoptimized_hlo_proto_to_;
+  absl::optional<string> dump_per_pass_hlo_proto_to_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
   std::vector<std::string> disabled_hlo_passes_;
 };
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index a2f32ab97e..8736f18dcf 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -31,7 +31,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -65,6 +65,17 @@ xla_test(
 )
 
 cc_library(
+    name = "conv_grad_size_util",
+    srcs = ["conv_grad_size_util.cc"],
+    hdrs = ["conv_grad_size_util.h"],
+    deps = [
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
     name = "math",
     srcs = ["math.cc"],
     hdrs = ["math.h"],
@@ -128,9 +139,9 @@ cc_library(
     deps = [
         ":arithmetic",
         ":constants",
-        "//tensorflow/compiler/tf2xla/lib:util",
+        ":conv_grad_size_util",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:inlined_vector",
     ],
 )
 
@@ -142,6 +153,7 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:inlined_vector",
     ],
 )
 
@@ -209,5 +221,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index 9225b1acd6..e86c10f030 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
@@ -24,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace {
@@ -39,7 +39,7 @@ XlaComputation CreateScalarComputation(const string& name, PrimitiveType type,
     b = builder->CreateSubBuilder(name);
   } else {
     b = builder->CreateSubBuilder(
-        tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type)));
+        absl::StrCat(name, "_", PrimitiveType_Name(type)));
   }
 
   const Shape scalar = ShapeUtil::MakeShape(type, {});
diff --git a/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc
new file mode 100644
index 0000000000..a4c50a5491
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/conv_grad_size_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+StatusOr<SpatialDimensionOutputSizeAndPadding> GetWindowedOutputSize(
+    int64 input_size, int64 filter_size, int64 dilation_rate, int64 stride,
+    Padding padding_type) {
+  if (stride <= 0) {
+    return tensorflow::errors::InvalidArgument("Stride must be > 0, but got ",
+                                               stride);
+  }
+  if (dilation_rate < 1) {
+    return tensorflow::errors::InvalidArgument(
+        "Dilation rate must be >= 1, but got ", dilation_rate);
+  }
+
+  int64 effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  SpatialDimensionOutputSizeAndPadding dim;
+  switch (padding_type) {
+    case Padding::kValid:
+      dim.output_size = (input_size - effective_filter_size + stride) / stride;
+      dim.pad_before = dim.pad_after = 0;
+      break;
+    case Padding::kSame:
+      dim.output_size = (input_size + stride - 1) / stride;
+      const int64 padding_needed =
+          std::max(int64{0}, (dim.output_size - 1) * stride +
+                                 effective_filter_size - input_size);
+      // For odd values of total padding, add more padding on the "after" side
+      // of the given dimension.
+      dim.pad_before = padding_needed / 2;
+      dim.pad_after = padding_needed - dim.pad_before;
+      break;
+  }
+  if (dim.output_size < 0) {
+    return tensorflow::errors::InvalidArgument(
+        "Computed output size would be negative: ", dim.output_size,
+        " [input_size: ", input_size,
+        ", effective_filter_size: ", effective_filter_size,
+        ", stride: ", stride, "]");
+  }
+  return dim;
+}
+
+}  // namespace
+
+StatusOr<SpatialDimensionOutputSizeAndPadding>
+ConvGradExtractAndVerifyDimension(int64 input_size, int64 filter_size,
+                                  int64 output_size, int64 dilation,
+                                  int64 stride, Padding padding) {
+  TF_ASSIGN_OR_RETURN(SpatialDimensionOutputSizeAndPadding output_dim,
+                      GetWindowedOutputSize(input_size, filter_size, dilation,
+                                            stride, padding));
+  if (output_size != output_dim.output_size) {
+    return tensorflow::errors::InvalidArgument(
+        "Size of out_backprop doesn't match computed: ", "actual = ",
+        output_size, ", computed = ", output_dim.output_size,
+        " input: ", input_size, " filter: ", filter_size,
+        " output: ", output_size, " stride: ", stride, " dilation: ", dilation);
+  }
+
+  SpatialDimensionOutputSizeAndPadding dim;
+  int64 effective_filter_size = (filter_size - 1) * dilation + 1;
+  dim.output_size = (output_dim.output_size - 1) * stride + 1;
+  const auto padded_out_size = input_size + effective_filter_size - 1;
+  dim.pad_before = effective_filter_size - 1 - output_dim.pad_before;
+  dim.pad_after = padded_out_size - dim.output_size - dim.pad_before;
+  VLOG(2) << "expanded_out = " << dim.output_size
+          << ", effective_filter_size = " << effective_filter_size
+          << ", padded_out = " << padded_out_size
+          << ", pad_before = " << dim.pad_before
+          << ", pad_after = " << dim.pad_after << ", dilation = " << dilation
+          << ", strides = " << stride;
+  return dim;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/conv_grad_size_util.h b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.h
new file mode 100644
index 0000000000..c18087ce6b
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.h
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONV_GRAD_SIZE_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONV_GRAD_SIZE_UTIL_H_
+
+#include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// Information about a single spatial dimension for a convolution gradients and
+// windowed operations.
+struct SpatialDimensionOutputSizeAndPadding {
+  // Effective size of the operation output (potentially expanded).
+  int64 output_size;
+  // Number of padding elements to be added before/after this dimension of
+  // the input when computing the input gradient.
+  int64 pad_before;
+  int64 pad_after;
+};
+
+// Verifies that the dimensions all match, and computes the size and padding of
+// a spatial dimension for convolution gradient operations.
+StatusOr<SpatialDimensionOutputSizeAndPadding>
+ConvGradExtractAndVerifyDimension(int64 input_size, int64 filter_size,
+                                  int64 output_size, int64 dilation,
+                                  int64 stride, Padding padding);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONV_GRAD_SIZE_UTIL_H_
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index 0221de7672..e569610b85 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -207,7 +207,11 @@ XlaOp Lgamma(XlaOp input) {
 
   XlaOp log_y = log_sqrt_two_pi + (z + one_half) * log_t - t + Log(x);
 
-  XlaOp reflection = log_pi - Log(Sin(pi * input)) - log_y;
+  // If z = a + 0j, the analytic continuation of log reduces to taking the
+  // absolute value of the real part.
+  // Re(log(z)) = Re(log|z| + arg(z)j)
+  //            = log|a|
+  XlaOp reflection = log_pi - Log(Abs(Sin(pi * input))) - log_y;
   XlaOp result = Select(need_to_reflect, reflection, log_y);
   return result;
 }
diff --git a/tensorflow/compiler/xla/client/lib/pooling.cc b/tensorflow/compiler/xla/client/lib/pooling.cc
index 7199269a6c..3ae9ae36f6 100644
--- a/tensorflow/compiler/xla/client/lib/pooling.cc
+++ b/tensorflow/compiler/xla/client/lib/pooling.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/pooling.h"
-#include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/conv_grad_size_util.h"
 
 namespace xla {
 
@@ -90,10 +90,8 @@ XlaOp ComputeSums(XlaOp operand, XlaOp init_value,
 // Creates a padding configuration out of spatial padding values.
 PaddingConfig MakeSpatialPaddingConfig(
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> spatial_padding,
-    tensorflow::gtl::ArraySlice<int64> kernel_size,
-    tensorflow::gtl::ArraySlice<int64> stride,
+    int num_spatial_dims, tensorflow::gtl::ArraySlice<int64> stride,
     const TensorFormat& data_format) {
-  const int num_spatial_dims = kernel_size.size() - 2;
   PaddingConfig padding_config;
   for (int i = 0; i < 2 + num_spatial_dims; ++i) {
     padding_config.add_dimensions();
@@ -109,6 +107,30 @@ PaddingConfig MakeSpatialPaddingConfig(
   return padding_config;
 }
 
+XlaOp AvgPoolDivideByCount(
+    XlaOp pooled, tensorflow::gtl::ArraySlice<int64> input_size,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    PrimitiveType dtype, const TensorFormat& data_format,
+    bool counts_include_padding) {
+  if (counts_include_padding) {
+    // If counts include padding, all windows have the same number of elements
+    // contributing to each average. Divide by the window size everywhere to get
+    // the average.
+    int64 window_size =
+        std::accumulate(window_dimensions.begin(), window_dimensions.end(), 1,
+                        [](int64 a, int64 b) { return a * b; });
+    auto divisor = ConstantR0WithType(pooled.builder(), dtype, window_size);
+
+    return pooled / divisor;
+  } else {
+    return AvgPoolDivideByCountWithGeneralPadding(pooled, dtype, input_size,
+                                                  padding, window_dimensions,
+                                                  window_strides, data_format);
+  }
+}
+
 }  // namespace
 
 XlaOp MaxPool(XlaOp operand, tensorflow::gtl::ArraySlice<int64> kernel_size,
@@ -137,25 +159,16 @@ XlaOp AvgPool(XlaOp operand, tensorflow::gtl::ArraySlice<int64> kernel_size,
     auto init_value = Zero(b, dtype);
     std::vector<int64> input_size(operand_shape.dimensions().begin(),
                                   operand_shape.dimensions().end());
-    auto padding_config =
-        MakeSpatialPaddingConfig(padding, kernel_size, stride, data_format);
+    const int num_dims = kernel_size.size();
+    const int num_spatial_dims = num_dims - 2;
+    auto padding_config = MakeSpatialPaddingConfig(padding, num_spatial_dims,
+                                                   stride, data_format);
     auto padded_operand = Pad(operand, Zero(b, dtype), padding_config);
     auto pooled = ComputeSums(padded_operand, init_value, kernel_size, stride,
                               data_format);
-    if (counts_include_padding) {
-      // If counts include padding, all windows have the same number of elements
-      // contributing to each average. Divide by the window size everywhere to
-      // get the average.
-      int64 window_size =
-          std::accumulate(kernel_size.begin(), kernel_size.end(), 1,
-                          [](int64 x, int64 y) { return x * y; });
-
-      auto divisor = ConstantR0WithType(b, dtype, window_size);
-      return pooled / divisor;
-    } else {
-      return AvgPoolDivideByCountWithGeneralPadding(
-          pooled, dtype, input_size, padding, kernel_size, stride, data_format);
-    }
+    return AvgPoolDivideByCount(pooled, input_size, kernel_size, stride,
+                                padding, dtype, data_format,
+                                counts_include_padding);
   });
 }
 
@@ -180,4 +193,101 @@ std::vector<std::pair<int64, int64>> MakeSpatialPadding(
                      stride_spatial_dimensions, padding);
 }
 
+XlaOp AvgPoolGrad(
+    XlaOp out_backprop, tensorflow::gtl::ArraySlice<int64> gradients_size,
+    tensorflow::gtl::ArraySlice<int64> kernel_size,
+    tensorflow::gtl::ArraySlice<int64> stride,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> spatial_padding,
+    const TensorFormat& data_format, const bool counts_include_padding) {
+  XlaBuilder* b = out_backprop.builder();
+  return b->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    const int num_dims = kernel_size.size();
+
+    if (gradients_size.size() != num_dims) {
+      return tensorflow::errors::InvalidArgument("gradients must be ", num_dims,
+                                                 "-dimensional");
+    }
+
+    TF_ASSIGN_OR_RETURN(Shape out_backprop_xla_shape,
+                        b->GetShape(out_backprop));
+    if (out_backprop_xla_shape.dimensions().size() != num_dims) {
+      return tensorflow::errors::InvalidArgument("out_backprop must be ",
+                                                 num_dims, "-dimensional");
+    }
+
+    // We can think of average-pooling as:
+    // * a convolution with a kernel consisting entirely of 1s, where the
+    //   input feature and output feature are equal, and 0s everywhere else.
+    // * followed by dividing by the counts.
+    //
+    // This then gives us an algorithm to build the gradient:
+    // * divide out_backprop by the counts, followed by
+    // * Conv2DBackpropInput specialized for that kernel, which simplifies to
+    //   a Pad and a ReduceWindow.
+    //
+    // For an explanation of backpropagation for convolution, see the comments
+    // in third_party/tensorflow/core/kernels/conv_grad_ops.h
+
+    // TF filter shape is [ H, W, ..., inC, outC ]
+
+    // The input gradients are computed by a convolution of the output gradients
+    // and the filter, with some appropriate padding. See the comment at the top
+    // of conv_grad_ops.h for details.
+    PrimitiveType dtype = out_backprop_xla_shape.element_type();
+    auto out_backprop_div = AvgPoolDivideByCount(
+        out_backprop, gradients_size, kernel_size, stride, spatial_padding,
+        dtype, data_format, counts_include_padding);
+
+    // Pad the gradients in the spatial dimensions. We use the same padding
+    // as Conv2DBackpropInput.
+    PaddingConfig padding_config = MakeNoPaddingConfig(num_dims);
+    std::vector<int64> padded_gradients_size(gradients_size.begin(),
+                                             gradients_size.end());
+    // First, pad the output gradients the same way as the input. The additional
+    // padding will be removed as a last step before returning the input
+    // gradients.
+    const int num_spatial_dims = num_dims - 2;
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      int dim = data_format.spatial_dimension(i);
+      padded_gradients_size[dim] +=
+          (spatial_padding[i].first + spatial_padding[i].second);
+    }
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      int dim = data_format.spatial_dimension(i);
+      TF_ASSIGN_OR_RETURN(
+          SpatialDimensionOutputSizeAndPadding conv_backprop_spatial_dim,
+          ConvGradExtractAndVerifyDimension(
+              /*input_size=*/padded_gradients_size[dim],
+              /*filter_size=*/kernel_size[dim],
+              /*output_size=*/out_backprop_xla_shape.dimensions(dim),
+              /*dilation=*/1,
+              /*stride=*/stride[dim], /*padding=*/Padding::kValid));
+      auto* padding = padding_config.mutable_dimensions(dim);
+      padding->set_edge_padding_low(conv_backprop_spatial_dim.pad_before);
+      padding->set_edge_padding_high(conv_backprop_spatial_dim.pad_after);
+      padding->set_interior_padding(stride[dim] - 1);
+    }
+
+    auto zero = Zero(b, dtype);
+    auto padded_gradients = Pad(out_backprop_div, zero, padding_config);
+
+    // in_backprop = padded_gradients <conv> ones
+    std::vector<int64> ones(num_dims, 1LL);
+    auto in_backprop =
+        ReduceWindow(padded_gradients, Zero(b, dtype),
+                     CreateScalarAddComputation(dtype, b), kernel_size,
+                     /*window_strides=*/ones, Padding::kValid);
+    // The input padding doesn't contribute to the gradient, remove it.
+    std::vector<std::pair<int64, int64>> neg_spatial_padding;
+    neg_spatial_padding.reserve(spatial_padding.size());
+    for (const std::pair<int64, int64>& spatial_padding_dim : spatial_padding) {
+      neg_spatial_padding.emplace_back(-spatial_padding_dim.first,
+                                       -spatial_padding_dim.second);
+    }
+    auto remove_padding_config = MakeSpatialPaddingConfig(
+        neg_spatial_padding, num_spatial_dims, stride, data_format);
+    return Pad(in_backprop, zero, remove_padding_config);
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/pooling.h b/tensorflow/compiler/xla/client/lib/pooling.h
index 1699c585d3..291c711a00 100644
--- a/tensorflow/compiler/xla/client/lib/pooling.h
+++ b/tensorflow/compiler/xla/client/lib/pooling.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_
 
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace xla {
 
@@ -45,7 +45,7 @@ class TensorFormat {
   // The number of the dimension that represents the features.
   int feature_dimension_;
   // The dimension numbers for the spatial dimensions.
-  tensorflow::gtl::InlinedVector<int, 4> spatial_dimensions_;
+  absl::InlinedVector<int, 4> spatial_dimensions_;
 };
 
 // Computes the max pool of 'operand'.
@@ -68,6 +68,14 @@ std::vector<std::pair<int64, int64>> MakeSpatialPadding(
     tensorflow::gtl::ArraySlice<int64> stride, Padding padding,
     const TensorFormat& data_format);
 
+// Computes the average pool gradient.
+XlaOp AvgPoolGrad(
+    XlaOp out_backprop, tensorflow::gtl::ArraySlice<int64> gradients_size,
+    tensorflow::gtl::ArraySlice<int64> kernel_size,
+    tensorflow::gtl::ArraySlice<int64> stride,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> spatial_padding,
+    const TensorFormat& data_format, const bool counts_include_padding);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_
diff --git a/tensorflow/compiler/xla/client/lib/pooling_test.cc b/tensorflow/compiler/xla/client/lib/pooling_test.cc
index 4b4553b60d..1890047918 100644
--- a/tensorflow/compiler/xla/client/lib/pooling_test.cc
+++ b/tensorflow/compiler/xla/client/lib/pooling_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/pooling.h"
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -22,7 +23,7 @@ namespace xla {
 namespace {
 
 TensorFormat MakeNCHWFormat(int num_spatial_dims) {
-  tensorflow::gtl::InlinedVector<int64, 4> spatial_dimensions;
+  absl::InlinedVector<int64, 4> spatial_dimensions;
   for (int i = 0; i < num_spatial_dims; ++i) {
     spatial_dimensions.push_back(i + 2);
   }
@@ -181,5 +182,109 @@ XLA_TEST_F(PoolingTest,
                              error_spec_);
 }
 
+XLA_TEST_F(PoolingTest, AvgPool2DGradNoPadding) {
+  XlaBuilder builder(TestName());
+  for (bool counts_include_padding : {false, true}) {
+    XlaOp out_backprop = ConstantR4FromArray4D<float>(&builder, {{{{1.}}}});
+    auto data_format = MakeNCHWFormat(2);
+    auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+    auto stride = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+    AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride,
+                {{0, 0}, {0, 0}}, MakeNCHWFormat(2),
+                /*counts_include_padding=*/counts_include_padding);
+    // Without padding, counts_include_padding makes no difference.
+    ComputeAndCompareR4<float>(
+        &builder, {{{{0.25, 0.25, 0.}, {0.25, 0.25, 0.}, {0., 0., 0.}}}}, {},
+        error_spec_);
+  }
+}
+
+XLA_TEST_F(PoolingTest, AvgPool2DGradNoPaddingWithStride) {
+  XlaBuilder builder(TestName());
+  for (bool counts_include_padding : {false, true}) {
+    XlaOp out_backprop =
+        ConstantR4FromArray4D<float>(&builder, {{{{1., 1.}, {1., 1.}}}});
+    auto data_format = MakeNCHWFormat(2);
+    auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+    auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format);
+    AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride,
+                {{0, 0}, {0, 0}}, MakeNCHWFormat(2),
+                /*counts_include_padding=*/counts_include_padding);
+    // Without padding, counts_include_padding makes no difference.
+    ComputeAndCompareR4<float>(
+        &builder, {{{{0.25, 0.5, 0.25}, {0.5, 1., 0.5}, {0.25, 0.5, 0.25}}}},
+        {}, error_spec_);
+  }
+}
+
+XLA_TEST_F(PoolingTest, AvgPool2DGradWithPadding) {
+  XlaBuilder builder(TestName());
+
+  XlaOp out_backprop =
+      ConstantR4FromArray4D<float>(&builder, {{{{1., 1.}, {1., 1.}}}});
+  auto data_format = MakeNCHWFormat(2);
+  auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+  auto stride = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+  AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, {{1, 1}, {1, 1}},
+              MakeNCHWFormat(2),
+              /*counts_include_padding=*/true);
+  ComputeAndCompareR4<float>(
+      &builder,
+      {{{{0.25, 0.25, 0.25}, {0.25, 0.25, 0.25}, {0.25, 0.25, 0.25}}}}, {},
+      error_spec_);
+}
+
+XLA_TEST_F(PoolingTest, AvgPool2DGradWithPaddingCountNotIncludePadding) {
+  XlaBuilder builder(TestName());
+
+  XlaOp out_backprop =
+      ConstantR4FromArray4D<float>(&builder, {{{{1., 1.}, {1., 1.}}}});
+  auto data_format = MakeNCHWFormat(2);
+  auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+  auto stride = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+  AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, {{1, 1}, {1, 1}},
+              MakeNCHWFormat(2), false);
+  ComputeAndCompareR4<float>(
+      &builder, {{{{1., 0.5, 0.5}, {0.5, 0.25, 0.25}, {0.5, 0.25, 0.25}}}}, {},
+      error_spec_);
+}
+
+XLA_TEST_F(PoolingTest, AvgPool2DGradWithPaddingCountWithStride) {
+  XlaBuilder builder(TestName());
+
+  XlaOp out_backprop =
+      ConstantR4FromArray4D<float>(&builder, {{{{1., 1., 1., 1.},
+                                                {1., 1., 1., 1.},
+                                                {1., 1., 1., 1.},
+                                                {1., 1., 1., 1.}}}});
+  auto data_format = MakeNCHWFormat(2);
+  auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+  auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format);
+  AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, {{1, 1}, {1, 1}},
+              MakeNCHWFormat(2), true);
+  ComputeAndCompareR4<float>(&builder,
+                             {{{{1., 1., 1.}, {1., 1., 1.}, {1., 1., 1.}}}}, {},
+                             error_spec_);
+}
+
+XLA_TEST_F(PoolingTest,
+           AvgPool2DGradWithPaddingCountWithStrideNotIncludePadding) {
+  XlaBuilder builder(TestName());
+
+  XlaOp out_backprop =
+      ConstantR4FromArray4D<float>(&builder, {{{{1., 1., 1., 1.},
+                                                {1., 1., 1., 1.},
+                                                {1., 1., 1., 1.},
+                                                {1., 1., 1., 1.}}}});
+  auto data_format = MakeNCHWFormat(2);
+  auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format);
+  auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format);
+  AvgPoolGrad(out_backprop, {1, 1, 3, 3}, kernel_size, stride, {{1, 1}, {1, 1}},
+              MakeNCHWFormat(2), false);
+  ComputeAndCompareR4<float>(
+      &builder, {{{{2.25, 1.5, 2.25}, {1.5, 1., 1.5}, {2.25, 1.5, 2.25}}}}, {},
+      error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index 081fec7ad9..6861521acc 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/lib/testing.h"
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -23,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -61,8 +61,7 @@ XlaOp BuildFakeDataOpOnDevice(const Shape& shape, XlaBuilder* builder) {
 
 std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
                                                        Client* client) {
-  XlaBuilder b(
-      tensorflow::strings::StrCat("make_fake_", ShapeUtil::HumanString(shape)));
+  XlaBuilder b(absl::StrCat("make_fake_", ShapeUtil::HumanString(shape)));
   BuildFakeDataOpOnDevice(shape, &b);
   XlaComputation computation = b.Build().ConsumeValueOrDie();
 
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index cffb24e29b..1cd3e9b22f 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "llvm/ADT/Triple.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/source_map_util.h"
@@ -257,9 +257,9 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
                       local_service_->CompileExecutable(
                           computation, argument_layouts, updated_options));
-  return WrapUnique(new LocalExecutable(std::move(executable),
-                                        local_service_->mutable_backend(),
-                                        updated_options));
+  return absl::WrapUnique(new LocalExecutable(std::move(executable),
+                                              local_service_->mutable_backend(),
+                                              updated_options));
 }
 
 StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
diff --git a/tensorflow/compiler/xla/client/sharding_builder.h b/tensorflow/compiler/xla/client/sharding_builder.h
index 34763e54d9..59df3a8762 100644
--- a/tensorflow/compiler/xla/client/sharding_builder.h
+++ b/tensorflow/compiler/xla/client/sharding_builder.h
@@ -56,4 +56,4 @@ OpSharding Tuple(const ShapeTree<OpSharding>& shardings);
 }  // namespace sharding_builder
 }  // namespace xla
 
-#endif
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_SHARDING_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index b3b00e2fff..9f902d7298 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -21,19 +21,24 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/client/sharding_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace xla {
 
-using tensorflow::strings::StrCat;
+using absl::StrCat;
 
 namespace {
 
@@ -194,7 +199,6 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle,
       // TODO(b/33009255): Implmement constant folding for cross replica sum.
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
-    case HloOpcode::kHostCompute:
     case HloOpcode::kCall:
       // TODO(b/32495713): We aren't checking the to_apply computation itself,
       // so we conservatively say that computations containing the Call op
@@ -221,8 +225,7 @@ XlaComputation XlaBuilder::BuildAndNoteError() {
   auto build_status = Build();
   if (!build_status.ok()) {
     parent_builder_->ReportError(
-        AddStatus(build_status.status(),
-                  tensorflow::strings::StrCat("error from: ", name_)));
+        AddStatus(build_status.status(), absl::StrCat("error from: ", name_)));
     return {};
   }
   return build_status.ConsumeValueOrDie();
@@ -469,8 +472,8 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation,
     HloInstructionProto instr;
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
-    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
-                [](const Shape& shape) { return &shape; });
+    absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
                         computation.GetProgramShape());
     TF_ASSIGN_OR_RETURN(
@@ -622,8 +625,8 @@ XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
 
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
-    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
-                [](const Shape& shape) { return &shape; });
+    absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
         ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension));
@@ -703,8 +706,7 @@ XlaOp XlaBuilder::Collapse(const XlaOp& operand,
     TF_ASSIGN_OR_RETURN(const Shape& original_shape, GetShape(operand));
 
     VLOG(3) << "original shape: " << ShapeUtil::HumanString(original_shape);
-    VLOG(3) << "dims to collapse: "
-            << tensorflow::str_util::Join(dimensions, ",");
+    VLOG(3) << "dims to collapse: " << absl::StrJoin(dimensions, ",");
 
     std::vector<int64> new_sizes;
     for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) {
@@ -715,8 +717,7 @@ XlaOp XlaBuilder::Collapse(const XlaOp& operand,
       }
     }
 
-    VLOG(3) << "new sizes: [" << tensorflow::str_util::Join(new_sizes, ",")
-            << "]";
+    VLOG(3) << "new sizes: [" << absl::StrJoin(new_sizes, ",") << "]";
 
     return Reshape(operand, new_sizes);
   });
@@ -749,8 +750,8 @@ XlaOp XlaBuilder::Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements) {
     HloInstructionProto instr;
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements));
-    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
-                [](const Shape& shape) { return &shape; });
+    absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
                         ShapeInference::InferVariadicOpShape(
                             HloOpcode::kTuple, operand_shape_ptrs));
@@ -807,7 +808,8 @@ XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs,
   return BinaryOp(HloOpcode::kLt, lhs, rhs, broadcast_dimensions);
 }
 
-XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs) {
+XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs,
+                      const PrecisionConfigProto* precision_config_proto) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
 
@@ -815,12 +817,14 @@ XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs) {
     dimension_numbers.add_lhs_contracting_dimensions(
         lhs_shape.dimensions_size() == 1 ? 0 : 1);
     dimension_numbers.add_rhs_contracting_dimensions(0);
-    return DotGeneral(lhs, rhs, dimension_numbers);
+    return DotGeneral(lhs, rhs, dimension_numbers, precision_config_proto);
   });
 }
 
-XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                             const DotDimensionNumbers& dimension_numbers) {
+XlaOp XlaBuilder::DotGeneral(
+    const XlaOp& lhs, const XlaOp& rhs,
+    const DotDimensionNumbers& dimension_numbers,
+    const PrecisionConfigProto* precision_config_proto) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
@@ -829,6 +833,9 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
                         ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
                                                         dimension_numbers));
     *instr.mutable_dot_dimension_numbers() = dimension_numbers;
+    if (precision_config_proto != nullptr) {
+      *instr.mutable_precision_config() = *precision_config_proto;
+    }
     return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs});
   });
 }
@@ -882,24 +889,31 @@ Status XlaBuilder::VerifyConvolution(
 
 XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
-                       Padding padding) {
+                       Padding padding, int64 feature_group_count,
+                       const PrecisionConfigProto* precision_config_proto) {
   return ConvWithGeneralDimensions(
       lhs, rhs, window_strides, padding,
-      CreateDefaultConvDimensionNumbers(window_strides.size()));
+      CreateDefaultConvDimensionNumbers(window_strides.size()),
+      feature_group_count, precision_config_proto);
 }
 
 XlaOp XlaBuilder::ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    int64 feature_group_count,
+    const PrecisionConfigProto* precision_config_proto) {
   return ConvGeneral(lhs, rhs, window_strides, padding,
-                     CreateDefaultConvDimensionNumbers(window_strides.size()));
+                     CreateDefaultConvDimensionNumbers(window_strides.size()),
+                     feature_group_count, precision_config_proto);
 }
 
 XlaOp XlaBuilder::ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count,
+    const PrecisionConfigProto* precision_config_proto) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
@@ -926,7 +940,8 @@ XlaOp XlaBuilder::ConvWithGeneralDimensions(
     return ConvGeneral(lhs, rhs, window_strides,
                        MakePadding(base_area_dimensions, window_dimensions,
                                    window_strides, padding),
-                       dimension_numbers);
+                       dimension_numbers, feature_group_count,
+                       precision_config_proto);
   });
 }
 
@@ -934,9 +949,12 @@ XlaOp XlaBuilder::ConvGeneral(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count,
+    const PrecisionConfigProto* precision_config_proto) {
   return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {},
-                            dimension_numbers);
+                            dimension_numbers, feature_group_count,
+                            precision_config_proto);
 }
 
 XlaOp XlaBuilder::ConvGeneralDilated(
@@ -945,7 +963,9 @@ XlaOp XlaBuilder::ConvGeneralDilated(
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count,
+    const PrecisionConfigProto* precision_config_proto) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
@@ -964,12 +984,17 @@ XlaOp XlaBuilder::ConvGeneralDilated(
                         MakeWindow(window_dimensions, window_strides, padding,
                                    lhs_dilation, rhs_dilation));
 
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, instr.window(),
-                                           dimension_numbers));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferConvolveShape(
+                            lhs_shape, rhs_shape, instr.window(),
+                            dimension_numbers, feature_group_count));
 
     *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
+    instr.set_feature_group_count(feature_group_count);
+
+    if (precision_config_proto != nullptr) {
+      *instr.mutable_precision_config() = *precision_config_proto;
+    }
 
     return AddInstruction(std::move(instr), HloOpcode::kConvolution,
                           {lhs, rhs});
@@ -987,7 +1012,7 @@ StatusOr<Window> XlaBuilder::MakeWindow(
       return Status::OK();
     } else {
       return InvalidArgument(
-          "%s", tensorflow::strings::StrCat(
+          "%s", absl::StrCat(
                     "Window has different number of window dimensions than of ",
                     x_name,
                     "\nNumber of window dimensions: ", window_dimensions.size(),
@@ -1073,6 +1098,23 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
           "Replicated sharding is not yet supported for infeeds");
     }
 
+    // Infeed takes a single token operand. Generate the token to pass to the
+    // infeed.
+    XlaOp token;
+    auto make_token = [&]() {
+      HloInstructionProto token_instr;
+      *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+      return AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {});
+    };
+    if (sharding()) {
+      // Arbitrarily assign token to device 0.
+      OpSharding sharding = sharding_builder::AssignDevice(0);
+      XlaScopedShardingAssignment scoped_sharding(this, sharding);
+      TF_ASSIGN_OR_RETURN(token, make_token());
+    } else {
+      TF_ASSIGN_OR_RETURN(token, make_token());
+    }
+
     // The sharding is set by the client according to the data tuple shape.
     // However, the shape of the infeed instruction is a tuple containing the
     // data and a token. For tuple sharding type, the sharding must be changed
@@ -1088,11 +1130,11 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
           sharding_builder::AssignDevice(0);
       XlaScopedShardingAssignment scoped_sharding(this,
                                                   infeed_instruction_sharding);
-      TF_ASSIGN_OR_RETURN(
-          infeed, AddInstruction(std::move(instr), HloOpcode::kInfeed, {}));
+      TF_ASSIGN_OR_RETURN(infeed, AddInstruction(std::move(instr),
+                                                 HloOpcode::kInfeed, {token}));
     } else {
-      TF_ASSIGN_OR_RETURN(
-          infeed, AddInstruction(std::move(instr), HloOpcode::kInfeed, {}));
+      TF_ASSIGN_OR_RETURN(infeed, AddInstruction(std::move(instr),
+                                                 HloOpcode::kInfeed, {token}));
     }
 
     // The infeed instruction produces a tuple of the infed data and a token
@@ -1158,8 +1200,15 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
 
     instr.set_outfeed_config(outfeed_config);
 
+    // Outfeed takes a token as its second operand. Generate the token to pass
+    // to the outfeed.
+    HloInstructionProto token_instr;
+    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr),
+                                                    HloOpcode::kAfterAll, {}));
+
     TF_RETURN_IF_ERROR(
-        AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand})
+        AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand, token})
             .status());
 
     // The outfeed instruction produces a token. However, existing users expect
@@ -1233,7 +1282,7 @@ XlaOp XlaBuilder::CustomCall(const string& call_target_name,
                              const Shape& shape) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    if (tensorflow::str_util::StartsWith(call_target_name, "$")) {
+    if (absl::StartsWith(call_target_name, "$")) {
       return InvalidArgument(
           "Invalid custom_call_target \"%s\": Call targets that start with '$' "
           "are reserved for internal use.",
@@ -1245,18 +1294,6 @@ XlaOp XlaBuilder::CustomCall(const string& call_target_name,
   });
 }
 
-XlaOp XlaBuilder::HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
-                              const string& channel_name,
-                              int64 cost_estimate_ns, const Shape& shape) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    HloInstructionProto instr;
-    *instr.mutable_shape() = shape;
-    instr.set_channel_name(channel_name);
-    instr.set_cost_estimate_ns(cost_estimate_ns);
-    return AddInstruction(std::move(instr), HloOpcode::kHostCompute, operands);
-  });
-}
-
 XlaOp XlaBuilder::Complex(
     const XlaOp& real, const XlaOp& imag,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
@@ -1431,7 +1468,7 @@ XlaOp XlaBuilder::Rev(const XlaOp& operand,
   });
 }
 
-XlaOp XlaBuilder::Sort(XlaOp keys, tensorflow::gtl::optional<XlaOp> values,
+XlaOp XlaBuilder::Sort(XlaOp keys, absl::optional<XlaOp> values,
                        int64 dimension) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
@@ -1509,8 +1546,8 @@ XlaOp XlaBuilder::Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
     HloInstructionProto instr;
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
-    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
-                [](const Shape& shape) { return &shape; });
+    absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
                         computation.GetProgramShape());
     TF_ASSIGN_OR_RETURN(
@@ -1600,27 +1637,27 @@ XlaOp XlaBuilder::While(const XlaComputation& condition,
   });
 }
 
-XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices,
+XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& start_indices,
                          const GatherDimensionNumbers& dimension_numbers,
-                         tensorflow::gtl::ArraySlice<int64> window_bounds) {
+                         tensorflow::gtl::ArraySlice<int64> slice_sizes) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
 
     TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input));
-    TF_ASSIGN_OR_RETURN(const Shape& gather_indices_shape,
-                        GetShape(gather_indices));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
-        ShapeInference::InferGatherShape(input_shape, gather_indices_shape,
-                                         dimension_numbers, window_bounds));
+        ShapeInference::InferGatherShape(input_shape, start_indices_shape,
+                                         dimension_numbers, slice_sizes));
 
     *instr.mutable_gather_dimension_numbers() = dimension_numbers;
-    for (int64 bound : window_bounds) {
-      instr.add_gather_window_bounds(bound);
+    for (int64 bound : slice_sizes) {
+      instr.add_gather_slice_sizes(bound);
     }
 
     return AddInstruction(std::move(instr), HloOpcode::kGather,
-                          {input, gather_indices});
+                          {input, start_indices});
   });
 }
 
@@ -1843,7 +1880,7 @@ XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
 
 XlaOp XlaBuilder::CrossReplicaSum(
     const XlaOp& operand,
-    tensorflow::gtl::ArraySlice<int64> replica_group_ids) {
+    tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
     const Shape& scalar_shape = ShapeUtil::MakeShape(shape.element_type(), {});
@@ -1851,23 +1888,24 @@ XlaOp XlaBuilder::CrossReplicaSum(
     b->Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"),
            b->Parameter(/*parameter_number=*/1, scalar_shape, "y"));
     TF_ASSIGN_OR_RETURN(auto computation, b->Build());
-    return CrossReplicaSum(operand, computation, replica_group_ids,
-                           /*channel_id=*/tensorflow::gtl::nullopt);
+    return CrossReplicaSum(operand, computation, replica_groups,
+                           /*channel_id=*/absl::nullopt);
   });
 }
 
 XlaOp XlaBuilder::CrossReplicaSum(
     const XlaOp& operand, const XlaComputation& computation,
-    tensorflow::gtl::ArraySlice<int64> replica_group_ids,
-    const tensorflow::gtl::optional<ChannelHandle>& channel_id) {
+    tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups,
+    const absl::optional<ChannelHandle>& channel_id) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
         ShapeInference::InferCrossReplicaSumShape({&operand_shape}));
-    for (int64 replica_group_id : replica_group_ids) {
-      instr.add_replica_group_ids(replica_group_id);
+
+    for (const ReplicaGroup& group : replica_groups) {
+      *instr.add_replica_groups() = group;
     }
 
     if (channel_id.has_value()) {
@@ -1914,8 +1952,8 @@ XlaOp XlaBuilder::AllToAll(const XlaOp& operand, int64 split_dimension,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(auto slice_shapes, this->GetOperandShapes(slices));
     std::vector<const Shape*> slice_shape_ptrs;
-    c_transform(slice_shapes, std::back_inserter(slice_shape_ptrs),
-                [](const Shape& shape) { return &shape; });
+    absl::c_transform(slice_shapes, std::back_inserter(slice_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
         ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs));
@@ -2265,7 +2303,7 @@ StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
 
 std::unique_ptr<XlaBuilder> XlaBuilder::CreateSubBuilder(
     const string& computation_name) {
-  auto sub_builder = MakeUnique<XlaBuilder>(computation_name);
+  auto sub_builder = absl::make_unique<XlaBuilder>(computation_name);
   sub_builder->parent_builder_ = this;
   sub_builder->die_immediately_on_error_ = this->die_immediately_on_error_;
   return sub_builder;
@@ -2528,42 +2566,57 @@ XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
   return lhs.builder()->Le(lhs, rhs, broadcast_dimensions);
 }
 
-XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs) {
-  return lhs.builder()->Dot(lhs, rhs);
+XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
+          const PrecisionConfigProto* precision_config_proto) {
+  return lhs.builder()->Dot(lhs, rhs, precision_config_proto);
 }
 
 XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                 const DotDimensionNumbers& dimension_numbers) {
-  return lhs.builder()->DotGeneral(lhs, rhs, dimension_numbers);
+                 const DotDimensionNumbers& dimension_numbers,
+                 const PrecisionConfigProto* precision_config_proto) {
+  return lhs.builder()->DotGeneral(lhs, rhs, dimension_numbers,
+                                   precision_config_proto);
 }
 
 XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
-  return lhs.builder()->Conv(lhs, rhs, window_strides, padding);
+           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+           int64 feature_group_count,
+           const PrecisionConfigProto* precision_config_proto) {
+  return lhs.builder()->Conv(lhs, rhs, window_strides, padding,
+                             feature_group_count, precision_config_proto);
 }
 
 XlaOp ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    int64 feature_group_count,
+    const PrecisionConfigProto* precision_config_proto) {
   return lhs.builder()->ConvWithGeneralPadding(lhs, rhs, window_strides,
-                                               padding);
+                                               padding, feature_group_count,
+                                               precision_config_proto);
 }
 
 XlaOp ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  return lhs.builder()->ConvWithGeneralDimensions(lhs, rhs, window_strides,
-                                                  padding, dimension_numbers);
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count,
+    const PrecisionConfigProto* precision_config_proto) {
+  return lhs.builder()->ConvWithGeneralDimensions(
+      lhs, rhs, window_strides, padding, dimension_numbers, feature_group_count,
+      precision_config_proto);
 }
 
 XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                   tensorflow::gtl::ArraySlice<int64> window_strides,
                   tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-                  const ConvolutionDimensionNumbers& dimension_numbers) {
+                  const ConvolutionDimensionNumbers& dimension_numbers,
+                  int64 feature_group_count,
+                  const PrecisionConfigProto* precision_config_proto) {
   return lhs.builder()->ConvGeneral(lhs, rhs, window_strides, padding,
-                                    dimension_numbers);
+                                    dimension_numbers, feature_group_count,
+                                    precision_config_proto);
 }
 
 XlaOp ConvGeneralDilated(
@@ -2572,10 +2625,12 @@ XlaOp ConvGeneralDilated(
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  return lhs.builder()->ConvGeneralDilated(lhs, rhs, window_strides, padding,
-                                           lhs_dilation, rhs_dilation,
-                                           dimension_numbers);
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count,
+    const PrecisionConfigProto* precision_config_proto) {
+  return lhs.builder()->ConvGeneralDilated(
+      lhs, rhs, window_strides, padding, lhs_dilation, rhs_dilation,
+      dimension_numbers, feature_group_count, precision_config_proto);
 }
 
 XlaOp Fft(const XlaOp& operand, FftType fft_type,
@@ -2603,13 +2658,6 @@ XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
   return builder->CustomCall(call_target_name, operands, shape);
 }
 
-XlaOp HostCompute(XlaBuilder* builder,
-                  tensorflow::gtl::ArraySlice<XlaOp> operands,
-                  const string& channel_name, int64 cost_estimate_ns,
-                  const Shape& shape) {
-  return builder->HostCompute(operands, channel_name, cost_estimate_ns, shape);
-}
-
 XlaOp Complex(const XlaOp& real, const XlaOp& imag,
               tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
   return real.builder()->Complex(real, imag, broadcast_dimensions);
@@ -2719,17 +2767,17 @@ XlaOp ReduceWindowWithGeneralPadding(
       padding);
 }
 
-XlaOp CrossReplicaSum(const XlaOp& operand,
-                      tensorflow::gtl::ArraySlice<int64> replica_group_ids) {
-  return operand.builder()->CrossReplicaSum(operand, replica_group_ids);
+XlaOp CrossReplicaSum(
+    const XlaOp& operand,
+    tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups) {
+  return operand.builder()->CrossReplicaSum(operand, replica_groups);
 }
 
-XlaOp CrossReplicaSum(
-    const XlaOp& operand, const XlaComputation& computation,
-    tensorflow::gtl::ArraySlice<int64> replica_group_ids,
-    const tensorflow::gtl::optional<ChannelHandle>& channel_id) {
+XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation,
+                      tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups,
+                      const absl::optional<ChannelHandle>& channel_id) {
   return operand.builder()->CrossReplicaSum(operand, computation,
-                                            replica_group_ids, channel_id);
+                                            replica_groups, channel_id);
 }
 
 XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
@@ -2824,8 +2872,7 @@ XlaOp Rev(const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions) {
   return operand.builder()->Rev(operand, dimensions);
 }
 
-XlaOp Sort(XlaOp keys, tensorflow::gtl::optional<XlaOp> values,
-           int64 dimension) {
+XlaOp Sort(XlaOp keys, absl::optional<XlaOp> values, int64 dimension) {
   return keys.builder()->Sort(keys, std::move(values), dimension);
 }
 
@@ -2868,11 +2915,11 @@ XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                                             mantissa_bits);
 }
 
-XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
              const GatherDimensionNumbers& dimension_numbers,
-             tensorflow::gtl::ArraySlice<int64> window_bounds) {
-  return input.builder()->Gather(input, gather_indices, dimension_numbers,
-                                 window_bounds);
+             tensorflow::gtl::ArraySlice<int64> slice_sizes) {
+  return input.builder()->Gather(input, start_indices, dimension_numbers,
+                                 slice_sizes);
 }
 
 XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 9403d7ca8d..baa2ae5184 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <type_traits>
 #include <utility>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
@@ -154,12 +154,10 @@ class XlaBuilder {
 
   // Clears the sharding. Ops will be sharded according to the default placement
   // policy.
-  void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
+  void ClearSharding() { sharding_ = absl::nullopt; }
 
   // Returns the OpSharding that will be attached to all instructions.
-  const tensorflow::gtl::optional<OpSharding>& sharding() const {
-    return sharding_;
-  }
+  const absl::optional<OpSharding>& sharding() const { return sharding_; }
 
   // Sets the builder to a mode where it will die immediately when an error is
   // encountered, rather than producing it in a deferred fashion when Build() is
@@ -503,31 +501,39 @@ class XlaBuilder {
            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
 
   // Enqueues a dot instruction onto the computation.
-  XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+  XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
+            const PrecisionConfigProto* precision_config_proto = nullptr);
 
   // Enqueues a general dot instruction onto the computation.
-  XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                   const DotDimensionNumbers& dimension_numbers);
+  XlaOp DotGeneral(
+      const XlaOp& lhs, const XlaOp& rhs,
+      const DotDimensionNumbers& dimension_numbers,
+      const PrecisionConfigProto* precision_config_proto = nullptr);
 
   // Enqueues a convolution instruction onto the computation, which uses the
   // default convolution dimension numbers.
   XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-             tensorflow::gtl::ArraySlice<int64> window_strides,
-             Padding padding);
+             tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+             int64 feature_group_count = 1,
+             const PrecisionConfigProto* precision_config_proto = nullptr);
 
   // Enqueues a convolution instruction onto the computation, with the caller
   // provided padding configuration in the format returned by MakePadding().
   XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      int64 feature_group_count = 1,
+      const PrecisionConfigProto* precision_config_proto = nullptr);
 
   // Enqueues a convolution instruction onto the computation, with the caller
   // provided dimension numbers configuration.
   XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1,
+      const PrecisionConfigProto* precision_config_proto = nullptr);
 
   // Enqueues a convolution instruction onto the computation, with the caller
   // provided padding configuration as well as the dimension numbers.
@@ -535,7 +541,9 @@ class XlaBuilder {
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1,
+      const PrecisionConfigProto* precision_config_proto = nullptr);
 
   // Enqueues a convolution instruction onto the computation, with the caller
   // provided padding configuration, dilation factors and dimension numbers.
@@ -545,7 +553,9 @@ class XlaBuilder {
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
       tensorflow::gtl::ArraySlice<int64> lhs_dilation,
       tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1,
+      const PrecisionConfigProto* precision_config_proto = nullptr);
 
   // Enqueues an FFT instruction onto the computation, of the given type and
   // with the given FFT length.
@@ -582,16 +592,6 @@ class XlaBuilder {
                    tensorflow::gtl::ArraySlice<XlaOp> operands,
                    const Shape& shape);
 
-  // Enqueues a pseudo-op to represent host-side computation data-dependencies.
-  // During code generation, host send and receive operations will be generated
-  // to transfer |operands| to the host and a single result of |shape| back to
-  // the device.  Host send/recv operations are emitted using |channel_name|.
-  // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
-  // instruction scheduling.
-  XlaOp HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
-                    const string& channel_name, int64 cost_estimate_ns,
-                    const Shape& shape);
-
   // The following methods enqueue element-wise binary arithmetic operations
   // onto the computation. The shapes of the operands have to match unless one
   // of the operands is a scalar, or an explicit broadcast dimension is given
@@ -685,7 +685,7 @@ class XlaBuilder {
   // sum for each subgroup.
   XlaOp CrossReplicaSum(
       const XlaOp& operand,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
+      tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups = {});
 
   // Enqueues an operation that do an AllReduce of the operand cross cores. Here
   // AllReduce means doing a reduction on the input operand cross cores and then
@@ -694,10 +694,11 @@ class XlaBuilder {
   // scalars, e.g., add, min, or max. The way that AllReduce is applied is
   // configured by:
   //
-  // - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
-  // replicas belong to one group. Allreduce will be applied within subgroups.
-  // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
-  // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
+  // - `replica_groups`: each ReplicaGroup contains a list of replica id. If
+  // empty, all replicas belong to one group. Allreduce will be applied within
+  // subgroups. For example, we have 4 replicas, then
+  // replica_groups={{0,2},{1,3}} means, replica 0 and 2 are in subgroup 0,
+  // replica 1 and 3 are in subgroup 1.
   //
   // - `channel_id`: for Allreduce nodes from different modules, if they have
   // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will
@@ -706,13 +707,10 @@ class XlaBuilder {
   // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
   XlaOp CrossReplicaSum(
       const XlaOp& operand, const XlaComputation& computation,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
-      const tensorflow::gtl::optional<ChannelHandle>& channel_id =
-          tensorflow::gtl::nullopt);
+      tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups = {},
+      const absl::optional<ChannelHandle>& channel_id = absl::nullopt);
 
   // Enqueues an operation that do an Alltoall of the operand cross cores.
-  //
-  // TODO(b/110096724): This is NOT YET ready to use.
   XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
                  int64 concat_dimension, int64 split_count,
                  const std::vector<ReplicaGroup>& replica_groups);
@@ -837,8 +835,7 @@ class XlaBuilder {
   // * The result is a tuple that consists of a sorted tensor of keys (along the
   // provided dimension, as above) as the first element, and a tensor with their
   // corresponding values as the second element.
-  XlaOp Sort(XlaOp keys,
-             tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
+  XlaOp Sort(XlaOp keys, absl::optional<XlaOp> values = absl::nullopt,
              int64 dimension = -1);
 
   // Enqueues a clamp instruction onto the computation.
@@ -873,9 +870,9 @@ class XlaBuilder {
                         const int mantissa_bits);
 
   // Enqueues a Gather node onto the computation.
-  XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+  XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
                const GatherDimensionNumbers& dimension_numbers,
-               tensorflow::gtl::ArraySlice<int64> window_bounds);
+               tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
   // Enqueues a Scatter node onto the computation.
   XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
@@ -1045,7 +1042,7 @@ class XlaBuilder {
 
   // Sharding for this operator. This is structured as a "model"-like operation,
   // in order to simplify client code, similar to metadata_.
-  tensorflow::gtl::optional<OpSharding> sharding_;
+  absl::optional<OpSharding> sharding_;
 
   // Mode bit that indicates whether to die when a first error is encountered.
   bool die_immediately_on_error_ = false;
@@ -1156,32 +1153,43 @@ class XlaBuilder {
                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
   friend XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+  friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
+                   const PrecisionConfigProto* precision_config_proto);
   friend XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                          const DotDimensionNumbers& dimension_numbers);
+                          const DotDimensionNumbers& dimension_number,
+                          const PrecisionConfigProto* precision_config_proto);
   friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
                     tensorflow::gtl::ArraySlice<int64> window_strides,
-                    Padding padding);
+                    Padding padding, int64 feature_group_count,
+                    const PrecisionConfigProto* precision_config_proto);
   friend XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      int64 feature_group_count,
+      const PrecisionConfigProto* precision_config_proto);
   friend XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count,
+      const PrecisionConfigProto* precision_config_proto);
   friend XlaOp ConvGeneral(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count,
+      const PrecisionConfigProto* precision_config_proto);
   friend XlaOp ConvGeneralDilated(
       const XlaOp& lhs, const XlaOp& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
       tensorflow::gtl::ArraySlice<int64> lhs_dilation,
       tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count,
+      const PrecisionConfigProto* precision_config_proto);
   friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
                    tensorflow::gtl::ArraySlice<int64> fft_length);
   friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
@@ -1193,10 +1201,6 @@ class XlaBuilder {
   friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
                           tensorflow::gtl::ArraySlice<XlaOp> operands,
                           const Shape& shape);
-  friend XlaOp HostCompute(XlaBuilder* builder,
-                           tensorflow::gtl::ArraySlice<XlaOp> operands,
-                           const string& channel_name, int64 cost_estimate_ns,
-                           const Shape& shape);
   friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
                        tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
   friend XlaOp Conj(const XlaOp& operand);
@@ -1248,11 +1252,11 @@ class XlaBuilder {
       tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
   friend XlaOp CrossReplicaSum(
       const XlaOp& operand,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids);
+      tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups);
   friend XlaOp CrossReplicaSum(
       const XlaOp& operand, const XlaComputation& computation,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids,
-      const tensorflow::gtl::optional<ChannelHandle>& channel_id);
+      tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups,
+      const absl::optional<ChannelHandle>& channel_id);
   friend XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
                         int64 concat_dimension, int64 split_count,
                         const std::vector<ReplicaGroup>& replica_groups);
@@ -1301,8 +1305,7 @@ class XlaBuilder {
                          tensorflow::gtl::ArraySlice<int64> permutation);
   friend XlaOp Rev(const XlaOp& operand,
                    tensorflow::gtl::ArraySlice<int64> dimensions);
-  friend XlaOp Sort(XlaOp keys, tensorflow::gtl::optional<XlaOp> values,
-                    int64 dimension);
+  friend XlaOp Sort(XlaOp keys, absl::optional<XlaOp> values, int64 dimension);
   friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
   friend XlaOp Map(XlaBuilder* builder,
                    tensorflow::gtl::ArraySlice<XlaOp> operands,
@@ -1320,9 +1323,9 @@ class XlaBuilder {
                            const XlaComputation& false_computation);
   friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                                const int mantissa_bits);
-  friend XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+  friend XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
                       const GatherDimensionNumbers& dimension_numbers,
-                      tensorflow::gtl::ArraySlice<int64> window_bounds);
+                      tensorflow::gtl::ArraySlice<int64> slice_sizes);
   friend XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
                        const XlaOp& updates,
                        const XlaComputation& update_computation,
@@ -1365,7 +1368,7 @@ class XlaBuilder {
 class XlaScopedShardingAssignment {
  public:
   XlaScopedShardingAssignment(xla::XlaBuilder* builder,
-                              tensorflow::gtl::optional<OpSharding> sharding)
+                              absl::optional<OpSharding> sharding)
       : builder_(builder), prev_sharding_(builder->sharding()) {
     SetSharding(sharding);
   }
@@ -1377,7 +1380,7 @@ class XlaScopedShardingAssignment {
   ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); }
 
  private:
-  void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
+  void SetSharding(const absl::optional<OpSharding>& sharding) {
     if (sharding.has_value()) {
       builder_->SetSharding(sharding.value());
     } else {
@@ -1386,7 +1389,7 @@ class XlaScopedShardingAssignment {
   }
 
   xla::XlaBuilder* const builder_;
-  tensorflow::gtl::optional<OpSharding> prev_sharding_;
+  absl::optional<OpSharding> prev_sharding_;
 };
 
 // Free functions for building XlaOps. The intention is that these will
@@ -1637,37 +1640,47 @@ XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
 
 // Enqueues a dot instruction onto the computation.
-XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
+          const PrecisionConfigProto* precision_config_proto = nullptr);
 
 // Enqueues a general dot instruction onto the computation.
 XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                 const DotDimensionNumbers& dimension_numbers);
+                 const DotDimensionNumbers& dimension_numbers,
+                 const PrecisionConfigProto* precision_config_proto = nullptr);
 
 // Enqueues a convolution instruction onto the computation, which uses the
 // default convolution dimension numbers.
 XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
+           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+           int64 feature_group_count = 1,
+           const PrecisionConfigProto* precision_config_proto = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration in the format returned by MakePadding().
 XlaOp ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    int64 feature_group_count = 1,
+    const PrecisionConfigProto* precision_config_proto = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided dimension numbers configuration.
 XlaOp ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ConvolutionDimensionNumbers& dimension_numbers);
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count = 1,
+    const PrecisionConfigProto* precision_config_proto = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration as well as the dimension numbers.
 XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                   tensorflow::gtl::ArraySlice<int64> window_strides,
                   tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-                  const ConvolutionDimensionNumbers& dimension_numbers);
+                  const ConvolutionDimensionNumbers& dimension_numbers,
+                  int64 feature_group_count = 1,
+                  const PrecisionConfigProto* precision_config_proto = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
 // provided padding configuration, dilation factors and dimension numbers.
@@ -1677,7 +1690,9 @@ XlaOp ConvGeneralDilated(
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-    const ConvolutionDimensionNumbers& dimension_numbers);
+    const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count = 1,
+    const PrecisionConfigProto* precision_config_proto = nullptr);
 
 // Enqueues an FFT instruction onto the computation, of the given type and
 // with the given FFT length.
@@ -1724,17 +1739,6 @@ XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
                  tensorflow::gtl::ArraySlice<XlaOp> operands,
                  const Shape& shape);
 
-// Enqueues a pseudo-op to represent host-side computation data-dependencies.
-// During code generation, host send and receive operations will be generated
-// to transfer |operands| to the host and a single result of |shape| back to
-// the device.  Host send/recv operations are emitted using |channel_name|.
-// Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
-// instruction scheduling.
-XlaOp HostCompute(XlaBuilder* builder,
-                  tensorflow::gtl::ArraySlice<XlaOp> operands,
-                  const string& channel_name, int64 cost_estimate_ns,
-                  const Shape& shape);
-
 // The following methods enqueue element-wise binary arithmetic operations
 // onto the computation. The shapes of the operands have to match unless one
 // of the operands is a scalar, or an explicit broadcast dimension is given
@@ -1828,7 +1832,7 @@ XlaOp ReduceWindowWithGeneralPadding(
 // sum for each subgroup.
 XlaOp CrossReplicaSum(
     const XlaOp& operand,
-    tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
+    tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups = {});
 
 // Enqueues an operation that do an AllReduce of the operand cross cores. Here
 // AllReduce means doing a reduction on the input operand cross cores and then
@@ -1837,24 +1841,22 @@ XlaOp CrossReplicaSum(
 // scalars, e.g., add, min, or max. The way that AllReduce is applied is
 // configured by:
 //
-// - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
-// replicas belong to one group. Allreduce will be applied within subgroups.
-// For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
-// replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
+// - `replica_groups`: each ReplicaGroup contains a list of replica id. If
+// empty, all replicas belong to one group. Allreduce will be applied within
+// subgroups. For example, we have 4 replicas, then replica_groups={{0,2},{1,3}}
+// means, replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
 //
 // - `channel_id`: for Allreduce nodes from different modules, if they have the
 // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
 // applied cross modules.
 //
 // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
-XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation,
-                      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
-                      const tensorflow::gtl::optional<ChannelHandle>&
-                          channel_id = tensorflow::gtl::nullopt);
+XlaOp CrossReplicaSum(
+    const XlaOp& operand, const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<ReplicaGroup> replica_groups = {},
+    const absl::optional<ChannelHandle>& channel_id = absl::nullopt);
 
 // Enqueues an operation that do an Alltoall of the operand cross cores.
-//
-// TODO(b/110096724): This is NOT YET ready to use.
 XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
                int64 concat_dimension, int64 split_count,
                const std::vector<ReplicaGroup>& replica_groups = {});
@@ -1975,8 +1977,7 @@ XlaOp Rev(const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions);
 // * The result is a tuple that consists of a sorted tensor of keys (along the
 // provided dimension, as above) as the first element, and a tensor with their
 // corresponding values as the second element.
-XlaOp Sort(XlaOp keys,
-           tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
+XlaOp Sort(XlaOp keys, absl::optional<XlaOp> values = absl::nullopt,
            int64 dimension = -1);
 
 // Enqueues a clamp instruction onto the computation.
@@ -2011,9 +2012,9 @@ XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                       const int mantissa_bits);
 
 // Enqueues a Gather node onto the computation.
-XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
              const GatherDimensionNumbers& dimension_numbers,
-             tensorflow::gtl::ArraySlice<int64> window_bounds);
+             tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
 // Enqueues a Scatter node onto the computation.
 XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
diff --git a/tensorflow/compiler/xla/client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_computation.cc
index 3543d41fc2..22c9e83bb2 100644
--- a/tensorflow/compiler/xla/client/xla_computation.cc
+++ b/tensorflow/compiler/xla/client/xla_computation.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <utility>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -32,7 +32,7 @@ StatusOr<std::unique_ptr<HloSnapshot>> XlaComputation::Snapshot() const {
   if (IsNull()) {
     return InvalidArgument("Computation is invalid.");
   }
-  auto session = MakeUnique<HloSnapshot>();
+  auto session = absl::make_unique<HloSnapshot>();
   *session->mutable_hlo()->mutable_hlo_module() = proto_;
   return std::move(session);
 }
diff --git a/tensorflow/compiler/xla/device_util.h b/tensorflow/compiler/xla/device_util.h
index 1a51fdee68..6d51126d88 100644
--- a/tensorflow/compiler/xla/device_util.h
+++ b/tensorflow/compiler/xla/device_util.h
@@ -21,8 +21,8 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
@@ -30,8 +30,8 @@ namespace xla {
 // Returns a string that represents the device in terms of platform and ordinal;
 // e.g. the first CUDA device will be "cuda:0"
 string DeviceIdentifier(se::StreamExecutor* stream_exec) {
-  return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":",
-                                     stream_exec->device_ordinal());
+  return absl::StrCat(stream_exec->platform()->Name(), ":",
+                      stream_exec->device_ordinal());
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index ffd1fb79e9..693dcb3a3e 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <algorithm>
 #include <string>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -36,7 +36,7 @@ namespace xla {
     DCHECK_GE(multi_index[i], 0);
     DCHECK_LT(multi_index[i], shape.dimensions(i))
         << "indexing beyond extent in dimension " << i << ":"
-        << "\n\tindex: " << tensorflow::str_util::Join(multi_index, ",")
+        << "\n\tindex: " << absl::StrJoin(multi_index, ",")
         << "\n\tshape: " << ShapeUtil::HumanString(shape);
   }
 
diff --git a/tensorflow/compiler/xla/iterator_util.h b/tensorflow/compiler/xla/iterator_util.h
index a8bb8c7a7e..3a3ee21e76 100644
--- a/tensorflow/compiler/xla/iterator_util.h
+++ b/tensorflow/compiler/xla/iterator_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_ITERATOR_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_ITERATOR_UTIL_H_
 
 #include <iterator>
 #include <utility>
@@ -95,4 +95,4 @@ UnwrappingIterator<NestedIter> MakeUnwrappingIterator(NestedIter iter) {
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_ITERATOR_UTIL_H_
diff --git a/tensorflow/compiler/xla/iterator_util_test.cc b/tensorflow/compiler/xla/iterator_util_test.cc
index 7bc3189507..ec8b66df2d 100644
--- a/tensorflow/compiler/xla/iterator_util_test.cc
+++ b/tensorflow/compiler/xla/iterator_util_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <list>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/test.h"
 
 namespace xla {
@@ -27,7 +27,7 @@ namespace {
 TEST(UnwrappingIteratorTest, Simple) {
   std::vector<std::unique_ptr<int>> v;
   for (int i = 0; i < 3; ++i) {
-    v.push_back(MakeUnique<int>(i));
+    v.push_back(absl::make_unique<int>(i));
   }
   int i = 0;
   for (auto iter = MakeUnwrappingIterator(v.begin());
@@ -51,7 +51,7 @@ TEST(UnwrappingIteratorTest, PostincrementOperator) {
 TEST(UnwrappingIteratorTest, StdFind) {
   std::list<std::unique_ptr<int>> l;
   for (int i = 0; i < 3; ++i) {
-    l.push_back(MakeUnique<int>(i));
+    l.push_back(absl::make_unique<int>(i));
   }
   EXPECT_EQ(l.begin()->get(),
             *std::find(MakeUnwrappingIterator(l.begin()),
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index b72d190d54..61c26434b1 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -31,8 +33,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -211,7 +211,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
           "layout minor_to_major field contains %d elements, "
           "but shape is rank %lld: {%s}; shape: %s",
           layout.minor_to_major_size(), ShapeUtil::Rank(shape),
-          tensorflow::str_util::Join(layout.minor_to_major(), ", ").c_str(),
+          absl::StrJoin(layout.minor_to_major(), ", ").c_str(),
           shape.ShortDebugString().c_str());
     }
 
@@ -403,12 +403,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 
 /* static */ string LayoutUtil::HumanString(const Layout& layout) {
   if (IsSparse(layout)) {
-    return tensorflow::strings::StrCat("sparse{", layout.max_sparse_elements(),
-                                       "}");
+    return absl::StrCat("sparse{", layout.max_sparse_elements(), "}");
   }
   CHECK(IsDense(layout));
-  return tensorflow::strings::StrCat(
-      "{", tensorflow::str_util::Join(layout.minor_to_major(), ","), "}");
+  return absl::StrCat("{", absl::StrJoin(layout.minor_to_major(), ","), "}");
 }
 
 namespace {
diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD
index 89353448e2..989035896b 100644
--- a/tensorflow/compiler/xla/legacy_flags/BUILD
+++ b/tensorflow/compiler/xla/legacy_flags/BUILD
@@ -56,6 +56,7 @@ cc_library(
             "//tensorflow/compiler/xla/service:hlo",
             "//tensorflow/core:framework_internal",
             "//tensorflow/core:lib",
+            "@com_google_absl//absl/strings",
         ],
 )
 
@@ -73,5 +74,6 @@ tf_cc_test(
             "//tensorflow/core:framework_internal",
             "//tensorflow/core:lib",
             "//tensorflow/core:test",
+            "@com_google_absl//absl/strings",
         ],
 )
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index 1bf8948ef6..0d3136b0cc 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
 #include <vector>
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h"
 #include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace legacy_flags {
@@ -87,7 +87,7 @@ void AllocateFlags() {
   // Custom "sub-parser" lambda for xla_disable_hlo_passes.
   auto setter_for_xla_disable_hlo_passes = [](string comma_separated_values) {
     std::vector<string> disabled_passes =
-        tensorflow::str_util::Split(comma_separated_values, ',');
+        absl::StrSplit(comma_separated_values, ',');
     for (const auto& passname : disabled_passes) {
       flag_values->add_xla_disable_hlo_passes(passname);
     }
@@ -316,6 +316,13 @@ void AllocateFlags() {
                        bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn),
                        flag_values->xla_cpu_use_mkl_dnn(),
                        "Generate calls to MKL-DNN in the CPU backend."),
+      tensorflow::Flag(
+          "xla_gpu_crash_on_verification_failures",
+          bool_setter_for(
+              &DebugOptions::set_xla_gpu_crash_on_verification_failures),
+          flag_values->xla_gpu_crash_on_verification_failures(),
+          "Crashes the program on extra verification failures, e.g. cuDNN "
+          "cross checking failures"),
   });
   ParseFlagsFromEnv(*flag_objects);
 }
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
index e9cf435d83..acda438395 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
@@ -17,9 +17,10 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
 
 #include <vector>
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
@@ -30,7 +31,7 @@ template <typename T>
 void parse_xla_backend_extra_options(T* extra_options_map,
                                      string comma_separated_values) {
   std::vector<string> extra_options_parts =
-      tensorflow::str_util::Split(comma_separated_values, ',');
+      absl::StrSplit(comma_separated_values, ',');
 
   // The flag contains a comma-separated list of options; some options
   // have arguments following "=", some don't.
@@ -59,8 +60,7 @@ void parse_xla_backend_extra_options(T* extra_options_map,
 inline bool parse_xla_reduce_precision_option(
     HloReducePrecisionOptions* options, string option_string) {
   // Split off "LOCATION" from remainder of string.
-  std::vector<string> eq_split =
-      tensorflow::str_util::Split(option_string, '=');
+  std::vector<string> eq_split = absl::StrSplit(option_string, '=');
   if (eq_split.size() != 2) {
     return false;
   }
@@ -80,26 +80,25 @@ inline bool parse_xla_reduce_precision_option(
   }
 
   // Split off "E,M" from remainder of string.
-  std::vector<string> colon_split =
-      tensorflow::str_util::Split(eq_split[1], ':');
+  std::vector<string> colon_split = absl::StrSplit(eq_split[1], ':');
   if (colon_split.size() != 2) {
     return false;
   }
 
   // Split E and M, and parse.
   std::vector<int32> bitsizes;
-  if (!tensorflow::str_util::SplitAndParseAsInts(colon_split[0], ',',
-                                                 &bitsizes) ||
-      bitsizes.size() != 2) {
-    return false;
+  for (const auto& s : absl::StrSplit(colon_split[0], ',')) {
+    bitsizes.emplace_back();
+    if (!absl::SimpleAtoi(s, &bitsizes.back())) {
+      return false;
+    }
   }
   options->set_exponent_bits(bitsizes[0]);
   options->set_mantissa_bits(bitsizes[1]);
 
   // Split off OPS comma-separated list from remainder of string, if the
   // remainder exists.
-  std::vector<string> semicolon_split =
-      tensorflow::str_util::Split(colon_split[1], ';');
+  std::vector<string> semicolon_split = absl::StrSplit(colon_split[1], ';');
   if (semicolon_split.size() > 2) {
     return false;
   }
@@ -113,8 +112,7 @@ inline bool parse_xla_reduce_precision_option(
       options->add_opcodes_to_suffix(i);
     }
   } else {
-    std::vector<string> opcodes =
-        tensorflow::str_util::Split(opcode_string, ',');
+    std::vector<string> opcodes = absl::StrSplit(opcode_string, ',');
     for (const string& opcode : opcodes) {
       bool found = false;
       for (int i = 0; i < HloOpcodeCount(); i++) {
@@ -132,8 +130,7 @@ inline bool parse_xla_reduce_precision_option(
 
   // Process the NAMES string, if it exists.
   if (semicolon_split.size() == 2) {
-    std::vector<string> opnames =
-        tensorflow::str_util::Split(semicolon_split[1], ',');
+    std::vector<string> opnames = absl::StrSplit(semicolon_split[1], ',');
     for (const string& opname : opnames) {
       if (opname.length() > 0) {
         options->add_opname_substrings_to_suffix(opname);
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc
index 0ed788a967..6f197aec53 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 36e472568e..0c0b619d50 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -22,6 +22,9 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -30,19 +33,16 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-using tensorflow::strings::Printf;
-using tensorflow::strings::StrCat;
-
 namespace xla {
-
 namespace {
 
+using absl::StrCat;
+using tensorflow::strings::Printf;
+
 constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
 // Converts between little and big endian.
@@ -134,7 +134,7 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) {
 
 Literal::Literal(const Shape& shape, bool allocate_arrays)
     : MutableLiteralBase() {
-  shape_ = MakeUnique<Shape>(shape);
+  shape_ = absl::make_unique<Shape>(shape);
   CHECK(LayoutUtil::HasLayout(*shape_));
   root_piece_ = new Piece();
   root_piece_->set_subshape(shape_.get());
@@ -175,7 +175,7 @@ Literal& Literal::operator=(Literal&& other) {
 }
 
 std::unique_ptr<Literal> LiteralBase::CreateFromShape(const Shape& shape) {
-  auto literal = MakeUnique<Literal>(shape);
+  auto literal = absl::make_unique<Literal>(shape);
   literal->root_piece_->ForEachMutableSubpiece(
       [&](const ShapeIndex& index, Piece* piece) {
         if (ShapeUtil::IsArray(piece->subshape())) {
@@ -289,7 +289,7 @@ MutableLiteralBase::CreateFromProto(const LiteralProto& proto) {
     return InvalidArgument("LiteralProto has no layout");
   }
 
-  auto literal = MakeUnique<Literal>(proto.shape());
+  auto literal = absl::make_unique<Literal>(proto.shape());
 
   TF_RETURN_IF_ERROR(literal->root_piece_->ForEachMutableSubpieceWithStatus(
       [&](const ShapeIndex& index, Piece* piece) {
@@ -479,7 +479,7 @@ Status Literal::MoveFrom(Literal&& src_literal,
         dest_piece.set_sparse_indices(src_piece.sparse_indices());
       });
 
-  src_literal.shape_ = MakeUnique<Shape>(ShapeUtil::MakeNil());
+  src_literal.shape_ = absl::make_unique<Shape>(ShapeUtil::MakeNil());
   delete src_literal.root_piece_;
   src_literal.root_piece_ = new LiteralBase::Piece();
   src_literal.root_piece_->set_subshape(src_literal.shape_.get());
@@ -566,7 +566,7 @@ std::unique_ptr<Literal> LiteralBase::Relayout(
   Shape* subshape = ShapeUtil::GetMutableSubshape(&new_shape, shape_index);
   TF_CHECK_OK(LayoutUtil::ValidateLayoutForShape(new_layout, *subshape));
   *subshape->mutable_layout() = new_layout;
-  auto result = MakeUnique<Literal>(new_shape);
+  auto result = absl::make_unique<Literal>(new_shape);
   TF_CHECK_OK(result->CopyFrom(*this));
   return result;
 }
@@ -602,7 +602,7 @@ StatusOr<std::unique_ptr<Literal>> LiteralBase::Broadcast(
                  result_shape.dimensions(dimensions[i]));
   }
 
-  std::unique_ptr<Literal> result = MakeUnique<Literal>(result_shape);
+  std::unique_ptr<Literal> result = absl::make_unique<Literal>(result_shape);
 
   // scratch_source_index is temporary storage space for the computed index into
   // the input literal.  We put it here to avoid allocating an std::vector in
@@ -691,7 +691,7 @@ std::unique_ptr<Literal> LiteralBase::Transpose(
   for (auto index : LayoutUtil::MinorToMajor(shape())) {
     layout->add_minor_to_major(inverse_permutation[index]);
   }
-  auto new_literal = MakeUnique<Literal>(permuted_shape);
+  auto new_literal = absl::make_unique<Literal>(permuted_shape);
   DCHECK_EQ(ShapeUtil::ByteSizeOf(new_literal->shape()),
             ShapeUtil::ByteSizeOf(shape()));
   std::memcpy(new_literal->untyped_data(), untyped_data(), size_bytes());
@@ -702,7 +702,7 @@ template <typename NativeT>
 std::unique_ptr<Literal> LiteralBase::SliceInternal(
     const Shape& result_shape,
     tensorflow::gtl::ArraySlice<int64> start_indices) const {
-  auto result_literal = MakeUnique<Literal>(result_shape);
+  auto result_literal = absl::make_unique<Literal>(result_shape);
   DimensionVector new_indices(ShapeUtil::Rank(result_shape));
   result_literal->EachCell<NativeT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT /*value*/) {
@@ -756,7 +756,7 @@ Literal LiteralBase::Clone() const {
 }
 
 std::unique_ptr<Literal> LiteralBase::CloneToUnique() const {
-  auto result = MakeUnique<Literal>(shape());
+  auto result = absl::make_unique<Literal>(shape());
   TF_CHECK_OK(result->CopyFrom(*this));
   return result;
 }
@@ -1029,9 +1029,9 @@ void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
       element_index.push_back(i);
       std::vector<string> element_pieces;
       ToStringHelper(literal, element_index, print_layout, &element_pieces);
-      tuple_pieces.push_back(tensorflow::str_util::Join(element_pieces, ""));
+      tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
     }
-    pieces->push_back(tensorflow::str_util::Join(tuple_pieces, ",\n"));
+    pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
     pieces->push_back("\n)");
     return;
   }
@@ -1055,8 +1055,7 @@ void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
         pieces->push_back(": ");
       } else {
         pieces->push_back("[");
-        pieces->push_back(
-            tensorflow::str_util::Join(literal.GetSparseIndex(i), ", "));
+        pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", "));
         pieces->push_back("]: ");
       }
       pieces->push_back(literal.GetSparseElementAsString(i));
@@ -1182,7 +1181,7 @@ string LiteralBase::ToString(bool print_layout) const {
   std::vector<string> pieces;
   CHECK(LayoutUtil::HasLayout(this->shape()));
   ToStringHelper(*this, {}, print_layout, &pieces);
-  return tensorflow::str_util::Join(pieces, "");
+  return absl::StrJoin(pieces, "");
 }
 
 void LiteralBase::EachCellAsString(
@@ -1203,7 +1202,7 @@ template <typename NativeSrcT, typename NativeDestT, typename ConverterType>
 std::unique_ptr<Literal> ConvertBetweenNativeTypesWithConverter(
     const LiteralBase& src_literal, const ConverterType& converter) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
-  auto result_literal = MakeUnique<Literal>(ShapeUtil::ChangeElementType(
+  auto result_literal = absl::make_unique<Literal>(ShapeUtil::ChangeElementType(
       src_literal.shape(),
       primitive_util::NativeToPrimitiveType<NativeDestT>()));
   auto src_data = src_literal.data<NativeSrcT>();
@@ -1249,7 +1248,7 @@ BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
 template <PrimitiveType primitive_src_type>
 std::unique_ptr<Literal> ConvertToC64(const LiteralBase& src_literal) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
-  auto result_literal = MakeUnique<Literal>(
+  auto result_literal = absl::make_unique<Literal>(
       ShapeUtil::ChangeElementType(src_literal.shape(), C64));
   using NativeSrcT =
       typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type;
@@ -1396,7 +1395,7 @@ StatusOr<std::unique_ptr<Literal>> LiteralBase::ConvertToShape(
         element.ConvertToShape(ShapeUtil::GetSubshape(dest_shape, {i})));
     elements.push_back(std::move(*new_element));
   }
-  auto converted = MakeUnique<Literal>();
+  auto converted = absl::make_unique<Literal>();
   *converted = MutableLiteralBase::MoveIntoTuple(&elements);
   return std::move(converted);
 }
@@ -1435,6 +1434,12 @@ bool LiteralBase::Piece::EqualElementsInternal(
 bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const {
   DCHECK(ShapeUtil::Compatible(subshape(), other.subshape()));
 
+  if (ShapeUtil::Equal(subshape(), other.subshape()) &&
+      LayoutUtil::IsDenseArray(subshape())) {
+    CHECK_EQ(size_bytes(), other.size_bytes());
+    return memcmp(buffer(), other.buffer(), size_bytes()) == 0;
+  }
+
   std::vector<int64> multi_index;
   switch (subshape().element_type()) {
     case PRED:
@@ -1956,7 +1961,7 @@ MutableLiteralBase::~MutableLiteralBase() {}
 MutableBorrowingLiteral::MutableBorrowingLiteral(
     const MutableBorrowingLiteral& literal)
     : MutableLiteralBase() {
-  shape_ = MakeUnique<Shape>(literal.shape());
+  shape_ = absl::make_unique<Shape>(literal.shape());
   CHECK(LayoutUtil::HasLayout(*shape_));
 
   root_piece_ = new Piece();
@@ -1967,7 +1972,7 @@ MutableBorrowingLiteral::MutableBorrowingLiteral(
 
 MutableBorrowingLiteral& MutableBorrowingLiteral::operator=(
     const MutableBorrowingLiteral& literal) {
-  shape_ = MakeUnique<Shape>(literal.shape());
+  shape_ = absl::make_unique<Shape>(literal.shape());
   CHECK(LayoutUtil::HasLayout(*shape_));
 
   root_piece_ = new Piece();
@@ -1981,7 +1986,7 @@ MutableBorrowingLiteral& MutableBorrowingLiteral::operator=(
 MutableBorrowingLiteral::MutableBorrowingLiteral(
     const MutableLiteralBase& literal)
     : MutableLiteralBase() {
-  shape_ = MakeUnique<Shape>(literal.shape());
+  shape_ = absl::make_unique<Shape>(literal.shape());
   CHECK(LayoutUtil::HasLayout(*shape_));
 
   root_piece_ = new Piece();
@@ -1992,7 +1997,7 @@ MutableBorrowingLiteral::MutableBorrowingLiteral(
 
 MutableBorrowingLiteral::MutableBorrowingLiteral(MutableLiteralBase* literal)
     : MutableLiteralBase() {
-  shape_ = MakeUnique<Shape>(literal->shape());
+  shape_ = absl::make_unique<Shape>(literal->shape());
   CHECK(LayoutUtil::HasLayout(*shape_));
 
   root_piece_ = new Piece();
@@ -2004,7 +2009,7 @@ MutableBorrowingLiteral::MutableBorrowingLiteral(MutableLiteralBase* literal)
 MutableBorrowingLiteral::MutableBorrowingLiteral(
     MutableBorrowingLiteral literal, const ShapeIndex& view_root)
     : MutableLiteralBase() {
-  shape_ = MakeUnique<Shape>(literal.piece(view_root).subshape());
+  shape_ = absl::make_unique<Shape>(literal.piece(view_root).subshape());
   CHECK(LayoutUtil::HasLayout(*shape_));
 
   root_piece_ = new Piece();
@@ -2016,7 +2021,7 @@ MutableBorrowingLiteral::MutableBorrowingLiteral(
 MutableBorrowingLiteral::MutableBorrowingLiteral(const char* src_buf_ptr,
                                                  const Shape& shape)
     : MutableLiteralBase() {
-  shape_ = MakeUnique<Shape>(shape);
+  shape_ = absl::make_unique<Shape>(shape);
   CHECK(LayoutUtil::HasLayout(*shape_));
   CHECK(!ShapeUtil::IsTuple(*shape_));
 
@@ -2061,7 +2066,7 @@ void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) {
 }
 
 BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
-    : LiteralBase(), shape_(MakeUnique<Shape>(shape)) {
+    : LiteralBase(), shape_(absl::make_unique<Shape>(shape)) {
   CHECK(ShapeUtil::IsArray(*shape_));
   CHECK(LayoutUtil::HasLayout(*shape_));
 
@@ -2072,7 +2077,7 @@ BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
 
 BorrowingLiteral::BorrowingLiteral(
     tensorflow::gtl::ArraySlice<const char*> src_buf_ptrs, const Shape& shape)
-    : LiteralBase(), shape_(MakeUnique<Shape>(shape)) {
+    : LiteralBase(), shape_(absl::make_unique<Shape>(shape)) {
   CHECK(ShapeUtil::IsTuple(*shape_));
   CHECK(!ShapeUtil::IsNestedTuple(*shape_));
   CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(*shape_));
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index 92c0f903cb..aad435ed5b 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -25,13 +25,14 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/sparse_index_array.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -40,7 +41,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/bitmap.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -312,7 +312,7 @@ class LiteralBase {
   // Note: It's an antipattern to use this method then immediately call
   // MutableLiteralBase::Populate on the result (since that results in zero
   // initialization, then reinitialization. Conside if a call to
-  // MakeUnique<Literal>(shape), followed by the call to
+  // absl::make_unique<Literal>(shape), followed by the call to
   // MutableLiteralBase::Populate can be used instead.
   static std::unique_ptr<Literal> CreateFromShape(const Shape& shape);
 
@@ -1154,8 +1154,8 @@ std::unique_ptr<Literal> LiteralBase::Replicate(int64 times) const {
   for (int64 bound : shape().dimensions()) {
     bounds.push_back(bound);
   }
-  auto literal =
-      MakeUnique<Literal>(ShapeUtil::MakeShape(shape().element_type(), bounds));
+  auto literal = absl::make_unique<Literal>(
+      ShapeUtil::MakeShape(shape().element_type(), bounds));
   int64 elements = ShapeUtil::ElementsIn(literal->shape());
   if (elements == 0) {
     return literal;
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index 94993cc874..67a69c2403 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -19,16 +19,16 @@ limitations under the License.
 #include <cmath>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/casts.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 
+using absl::StrAppend;
+using absl::StrCat;
 using tensorflow::strings::Appendf;
 using tensorflow::strings::Printf;
-using tensorflow::strings::StrAppend;
-using tensorflow::strings::StrCat;
 
 namespace xla {
 namespace literal_comparison {
@@ -38,7 +38,8 @@ namespace {
 // between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
 // -- on miscompare, a nice error message is given in the AssertionFailure.
 template <typename FloatT, typename UnsignedT>
-Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
+Status CompareFloatsBitwiseEqual(
+    FloatT lhs, FloatT rhs, tensorflow::gtl::ArraySlice<int64> multi_index) {
   auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
   auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
   auto lhs_double = static_cast<double>(lhs);
@@ -46,9 +47,10 @@ Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
   if (ulhs != urhs) {
     return InvalidArgument(
         "floating values are not bitwise-equal; and equality testing "
-        "was requested: %s=%g=%a vs %s=%g=%a",
-        StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double, lhs_double,
-        StrCat(tensorflow::strings::Hex(urhs)).c_str(), rhs_double, rhs_double);
+        "was requested: %s=%g=%a vs %s=%g=%a at array index %s",
+        StrCat(absl::Hex(ulhs)).c_str(), lhs_double, lhs_double,
+        StrCat(absl::Hex(urhs)).c_str(), rhs_double, rhs_double,
+        LiteralUtil::MultiIndexAsString(multi_index).c_str());
   }
   return Status::OK();
 }
@@ -57,39 +59,49 @@ Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
 // bitwise helper above (this is the un-specialized fallback, to just use the
 // default gunit implementation).
 template <typename NativeT>
-Status CompareEqual(NativeT lhs, NativeT rhs) {
+Status CompareEqual(NativeT lhs, NativeT rhs,
+                    tensorflow::gtl::ArraySlice<int64> multi_index) {
   if (lhs == rhs) {
     return Status::OK();
   }
-  return InvalidArgument("Expected equality of these values:\n  %s\n  %s",
-                         StrCat(lhs).c_str(), StrCat(rhs).c_str());
+  return InvalidArgument(
+      "first mismatch at array index %s:\n  expected value: %s\n  actual "
+      "value:   %s",
+      LiteralUtil::MultiIndexAsString(multi_index).c_str(), StrCat(lhs).c_str(),
+      StrCat(rhs).c_str());
 }
 
 // Specializations for floating types that do bitwise comparisons when equality
 // comparison is requested.
 template <>
-Status CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs) {
-  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs);
+Status CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs,
+                              tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<Eigen::half>(Eigen::half lhs, Eigen::half rhs) {
-  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs);
+Status CompareEqual<Eigen::half>(
+    Eigen::half lhs, Eigen::half rhs,
+    tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<float>(float lhs, float rhs) {
-  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs);
+Status CompareEqual<float>(float lhs, float rhs,
+                           tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<double>(double lhs, double rhs) {
-  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs);
+Status CompareEqual<double>(double lhs, double rhs,
+                            tensorflow::gtl::ArraySlice<int64> multi_index) {
+  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<complex64>(complex64 lhs, complex64 rhs) {
-  auto res = CompareEqual<float>(lhs.real(), rhs.real());
+Status CompareEqual<complex64>(complex64 lhs, complex64 rhs,
+                               tensorflow::gtl::ArraySlice<int64> multi_index) {
+  auto res = CompareEqual<float>(lhs.real(), rhs.real(), multi_index);
   if (!res.ok()) {
     return res;
   }
-  return CompareEqual<float>(lhs.imag(), rhs.imag());
+  return CompareEqual<float>(lhs.imag(), rhs.imag(), multi_index);
 }
 
 // A recursive function which iterates through every index of expected and
@@ -102,13 +114,14 @@ Status Equal(LiteralSlice expected, LiteralSlice actual,
   if (dimension == expected.shape().dimensions_size()) {
     NativeT expected_value = expected.Get<NativeT>(multi_index);
     NativeT actual_value = actual.Get<NativeT>(multi_index);
-    return CompareEqual<NativeT>(expected_value, actual_value);
+    return CompareEqual<NativeT>(expected_value, actual_value, multi_index);
   }
 
   Status result;
   for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
     multi_index[dimension] = i;
-    result.Update(Equal<NativeT>(expected, actual, multi_index, dimension + 1));
+    TF_RETURN_IF_ERROR(
+        Equal<NativeT>(expected, actual, multi_index, dimension + 1));
   }
   return result;
 }
@@ -240,11 +253,6 @@ class NearComparator {
 
   // Runs the comparison between expected and actual literals.
   Status Run() {
-    VLOG(1) << "expected:";
-    XLA_VLOG_LINES(1, ToStringTruncated(expected_));
-    VLOG(1) << "actual:";
-    XLA_VLOG_LINES(1, ToStringTruncated(actual_));
-
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
     TF_RETURN_IF_ERROR(EqualShapes(expected_.shape(), actual_.shape()));
@@ -528,6 +536,62 @@ constexpr std::array<float, 7> NearComparator<NativeT>::kAbsValueBucketBounds;
 template <typename NativeT>
 constexpr std::array<float, 5> NearComparator<NativeT>::kErrorBucketBounds;
 
+Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) {
+  TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape()));
+  std::vector<int64> multi_index(expected.shape().dimensions_size(), 0);
+  Status result;
+  switch (expected.shape().element_type()) {
+    case PRED:
+      result = Equal<bool>(expected, actual, &multi_index, 0);
+      break;
+    case U8:
+      result = Equal<uint8>(expected, actual, &multi_index, 0);
+      break;
+    case S32:
+      result = Equal<int32>(expected, actual, &multi_index, 0);
+      break;
+    case S64:
+      result = Equal<int64>(expected, actual, &multi_index, 0);
+      break;
+    case U32:
+      result = Equal<uint32>(expected, actual, &multi_index, 0);
+      break;
+    case U64:
+      result = Equal<uint64>(expected, actual, &multi_index, 0);
+      break;
+    case BF16:
+      result = Equal<bfloat16>(expected, actual, &multi_index, 0);
+      break;
+    case F16:
+      result = Equal<half>(expected, actual, &multi_index, 0);
+      break;
+    case F32:
+      result = Equal<float>(expected, actual, &multi_index, 0);
+      break;
+    case F64:
+      result = Equal<double>(expected, actual, &multi_index, 0);
+      break;
+    case C64:
+      result = Equal<complex64>(expected, actual, &multi_index, 0);
+      break;
+    case TUPLE: {
+      for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
+        result.Update(EqualHelper(LiteralSlice(expected, {i}),
+                                  LiteralSlice(actual, {i})));
+      }
+      break;
+    }
+    case TOKEN:
+      // Tokens have no on-device representation and are trivially equal.
+      return Status::OK();
+    default:
+      LOG(FATAL) << "Unsupported primitive type: "
+                 << PrimitiveType_Name(expected.shape().element_type());
+  }
+
+  return result;
+}
+
 // Helper function for comparing two literals for nearness. Handles tuple-shapes
 // via recursion. shape_index is the ShapeIndex of expected (or actual)
 // currently being compared.
@@ -544,17 +608,18 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
       const auto actual_element = LiteralSlice(actual, {i});
       ShapeIndex element_index = shape_index;
       element_index.push_back(i);
-      Status res =
+      Status element_result =
           NearHelper(expected_element, actual_element, error, detailed_message,
                      miscompare_callback, element_index);
-      if (!res.ok()) {
-        string err_message = Printf("\nArray at shape index %s%s",
-                                    element_index.ToString().c_str(),
-                                    res.error_message().c_str());
+      if (!element_result.ok()) {
+        element_result = InvalidArgument(
+            "Array at shape index %s, %s", element_index.ToString().c_str(),
+            element_result.error_message().c_str());
         if (return_status.ok()) {
-          return_status = res;
+          return_status = element_result;
         } else {
-          return_status = AppendStatus(return_status, res.error_message());
+          return_status =
+              AppendStatus(return_status, element_result.error_message());
         }
       }
     }
@@ -600,8 +665,8 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
     }
   }
 
-  // Non-floating point literal.
-  return literal_comparison::Equal(expected, actual);
+  // Non-floating point, non-tuple literal.
+  return EqualHelper(expected, actual);
 }
 
 }  // namespace
@@ -657,83 +722,44 @@ Status EqualShapes(const Shape& expected, const Shape& actual) {
   return Status::OK();
 }
 
+namespace {
+
+// If result is an error, extend the error message with the expected and actual
+// literals.
+Status EmitLiteralsInErrorMessage(const Status& result,
+                                  const LiteralSlice& expected,
+                                  const LiteralSlice& actual) {
+  if (result.ok()) {
+    return result;
+  }
+  return InvalidArgument("%s\n\nExpected literal:\n%s\n\nActual literal:\n%s",
+                         result.error_message().c_str(),
+                         ToStringTruncated(expected).c_str(),
+                         ToStringTruncated(actual).c_str());
+}
+
+}  // namespace
+
 Status Equal(const LiteralSlice& expected, const LiteralSlice& actual) {
   VLOG(1) << "expected:";
   XLA_VLOG_LINES(1, expected.ToString());
   VLOG(1) << "actual:";
   XLA_VLOG_LINES(1, actual.ToString());
-
-  TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape()));
-  std::vector<int64> multi_index(expected.shape().dimensions_size(), 0);
-  Status result;
-  switch (expected.shape().element_type()) {
-    case PRED:
-      result = Equal<bool>(expected, actual, &multi_index, 0);
-      break;
-    case U8:
-      result = Equal<uint8>(expected, actual, &multi_index, 0);
-      break;
-    case S32:
-      result = Equal<int32>(expected, actual, &multi_index, 0);
-      break;
-    case S64:
-      result = Equal<int64>(expected, actual, &multi_index, 0);
-      break;
-    case U32:
-      result = Equal<uint32>(expected, actual, &multi_index, 0);
-      break;
-    case U64:
-      result = Equal<uint64>(expected, actual, &multi_index, 0);
-      break;
-    case BF16:
-      result = Equal<bfloat16>(expected, actual, &multi_index, 0);
-      break;
-    case F16:
-      result = Equal<half>(expected, actual, &multi_index, 0);
-      break;
-    case F32:
-      result = Equal<float>(expected, actual, &multi_index, 0);
-      break;
-    case F64:
-      result = Equal<double>(expected, actual, &multi_index, 0);
-      break;
-    case C64:
-      result = Equal<complex64>(expected, actual, &multi_index, 0);
-      break;
-    case TUPLE: {
-      for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-        result.Update(
-            Equal(LiteralSlice(expected, {i}), LiteralSlice(actual, {i})));
-      }
-      break;
-    }
-    case TOKEN:
-      // Tokens have no on-device representation and are trivially equal.
-      return Status::OK();
-    default:
-      LOG(FATAL)
-          << "Unsupported primitive type in LiteralTestUtil::ExpectEqual: "
-          << PrimitiveType_Name(expected.shape().element_type());
-  }
-
-  if (result.ok()) {
-    return Status::OK();
-  }
-
-  return AppendStatus(result,
-                      tensorflow::strings::Printf(
-                          "\nat index: %s\nexpected: %s\nactual:   %s",
-                          LiteralUtil::MultiIndexAsString(multi_index).c_str(),
-                          ToStringTruncated(expected).c_str(),
-                          ToStringTruncated(actual).c_str()));
+  Status result = EqualHelper(expected, actual);
+  return EmitLiteralsInErrorMessage(result, expected, actual);
 }
 
 Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
             const ErrorSpec& error, bool detailed_message,
             const MiscompareCallback& miscompare_callback) {
-  return NearHelper(expected, actual, error, detailed_message,
-                    miscompare_callback,
-                    /*shape_index=*/{});
+  VLOG(1) << "Expected literal:";
+  XLA_VLOG_LINES(1, expected.ToString());
+  VLOG(1) << "Actual literal:";
+  XLA_VLOG_LINES(1, actual.ToString());
+  Status result =
+      NearHelper(expected, actual, error, detailed_message, miscompare_callback,
+                 /*shape_index=*/{});
+  return EmitLiteralsInErrorMessage(result, expected, actual);
 }
 
 string ToStringTruncated(const LiteralSlice& literal) {
diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc
index e8f919950f..aef87e46d8 100644
--- a/tensorflow/compiler/xla/literal_test.cc
+++ b/tensorflow/compiler/xla/literal_test.cc
@@ -17,6 +17,9 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
@@ -355,15 +358,15 @@ TEST_F(LiteralUtilTest, TokenEquality) {
 
 TEST_F(LiteralUtilTest, DifferentLayoutEquality) {
   // Test equality with literals which have different layouts.
-  auto colmajor =
-      MakeUnique<Literal>(ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1}));
+  auto colmajor = absl::make_unique<Literal>(
+      ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1}));
   colmajor->Set<float>({0, 0}, 1.0);
   colmajor->Set<float>({0, 1}, 2.0);
   colmajor->Set<float>({1, 0}, 3.0);
   colmajor->Set<float>({1, 1}, 4.0);
 
-  auto rowmajor =
-      MakeUnique<Literal>(ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0}));
+  auto rowmajor = absl::make_unique<Literal>(
+      ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0}));
   rowmajor->Set<float>({0, 0}, 1.0);
   rowmajor->Set<float>({0, 1}, 2.0);
   rowmajor->Set<float>({1, 0}, 3.0);
@@ -1089,7 +1092,7 @@ TEST_F(LiteralUtilTest, Populate) {
     Shape shape = ShapeUtil::MakeShapeWithLayout(
         primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
         data.layout);
-    auto literal = MakeUnique<Literal>(shape);
+    auto literal = absl::make_unique<Literal>(shape);
     auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
       // Offsets from linear index just to avoid R0 literals to be initialized
       // with zero.
@@ -1131,7 +1134,7 @@ TEST_F(LiteralUtilTest, PopulateParallel) {
     Shape shape = ShapeUtil::MakeShapeWithLayout(
         primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
         data.layout);
-    auto literal = MakeUnique<Literal>(shape);
+    auto literal = absl::make_unique<Literal>(shape);
     auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
       // Offsets from linear index just to avoid R0 literals to be initialized
       // with zero.
@@ -1323,8 +1326,8 @@ TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) {
   auto literal = LiteralUtil::CreateR0<uint32>(1234);
   Status status = literal->BitcastConvert(F64).status();
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(tensorflow::str_util::StrContains(status.error_message(),
-                                                "bit widths are different"));
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "bit widths are different"));
 }
 
 TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
@@ -1818,21 +1821,20 @@ TEST_F(LiteralUtilTest, GetSparseElementAsString) {
       "false");
   ASSERT_EQ(LiteralUtil::CreateSparse<int64>(dimensions, indices, {1, 2, 3})
                 ->GetSparseElementAsString(1),
-            tensorflow::strings::StrCat(int64{2}));
+            absl::StrCat(int64{2}));
   ASSERT_EQ(
       LiteralUtil::CreateSparse<double>(dimensions, indices, {1.0, 2.0, 3.0})
           ->GetSparseElementAsString(1),
-      tensorflow::strings::StrCat(double{2.0}));
+      absl::StrCat(double{2.0}));
   ASSERT_EQ(LiteralUtil::CreateSparse<half>(dimensions, indices,
                                             {half{1.0}, half{2.0}, half{3.0}})
                 ->GetSparseElementAsString(1),
-            tensorflow::strings::StrCat(static_cast<float>(half{2.0})));
-  ASSERT_EQ(
-      LiteralUtil::CreateSparse<complex64>(
-          dimensions, indices,
-          std::vector<complex64>{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}})
-          ->GetSparseElementAsString(1),
-      tensorflow::strings::StrCat("(", float{3.0}, ", ", float{4.0}, ")"));
+            absl::StrCat(static_cast<float>(half{2.0})));
+  ASSERT_EQ(LiteralUtil::CreateSparse<complex64>(
+                dimensions, indices,
+                std::vector<complex64>{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}})
+                ->GetSparseElementAsString(1),
+            absl::StrCat("(", float{3.0}, ", ", float{4.0}, ")"));
 }
 
 TEST_F(LiteralUtilTest, BroadcastVectorToMatrix0) {
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 5d33df7d40..95d93acfe8 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -22,6 +22,9 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -30,19 +33,16 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
 
-using tensorflow::strings::StrCat;
-
 namespace xla {
-
 namespace {
 
+using absl::StrCat;
+
 // Return a literal with all arrays of type FromNativeT converted to type
 // ToNativeT in the given literal.
 template <typename FromNativeT, typename ToNativeT>
@@ -57,7 +57,7 @@ std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
               primitive_util::NativeToPrimitiveType<ToNativeT>());
         }
       });
-  auto result = MakeUnique<Literal>(result_shape);
+  auto result = absl::make_unique<Literal>(result_shape);
 
   // Then copy over the data from 'literal' converting FromNativeT values to
   // ToNativeT values as necessary.
@@ -102,7 +102,7 @@ std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
 }
 
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateToken() {
-  return MakeUnique<Literal>(ShapeUtil::MakeTokenShape());
+  return absl::make_unique<Literal>(ShapeUtil::MakeTokenShape());
 }
 
 /* static */ Literal LiteralUtil::Zero(PrimitiveType primitive_type) {
@@ -279,15 +279,15 @@ std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
 
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateR1(
     const tensorflow::core::Bitmap& values) {
-  auto literal = MakeUnique<Literal>(
+  auto literal = absl::make_unique<Literal>(
       ShapeUtil::MakeShape(PRED, {static_cast<int64>(values.bits())}));
   literal->PopulateR1(values);
   return literal;
 }
 
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateR1U8(
-    tensorflow::StringPiece value) {
-  auto literal = MakeUnique<Literal>(
+    absl::string_view value) {
+  auto literal = absl::make_unique<Literal>(
       ShapeUtil::MakeShape(U8, {static_cast<int64>(value.size())}));
   for (int i = 0; i < value.size(); ++i) {
     literal->Set<uint8>({i}, value[i]);
@@ -312,7 +312,7 @@ std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
   CHECK_EQ(ShapeUtil::ElementsIn(literal.shape()), new_num_elements);
   CHECK_EQ(new_dimensions.size(), minor_to_major.size());
 
-  auto new_literal = MakeUnique<Literal>(
+  auto new_literal = absl::make_unique<Literal>(
       ShapeUtil::MakeShape(literal.shape().element_type(), new_dimensions));
 
   // Create a new shape with the given minor-to-major layout. This shape is used
@@ -436,7 +436,8 @@ std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
   for (const auto* element : elements) {
     element_shapes.push_back(element->shape());
   }
-  auto literal = MakeUnique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
+  auto literal =
+      absl::make_unique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
   for (int i = 0; i < elements.size(); ++i) {
     TF_CHECK_OK(literal->CopyFrom(*elements[i], /*dest_shape_index=*/{i}));
   }
@@ -449,7 +450,8 @@ std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
   for (const auto& element : elements) {
     element_shapes.push_back(element.shape());
   }
-  auto literal = MakeUnique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
+  auto literal =
+      absl::make_unique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
   for (int i = 0; i < elements.size(); ++i) {
     TF_CHECK_OK(literal->CopyFrom(elements[i], /*dest_shape_index=*/{i}));
   }
@@ -463,7 +465,8 @@ std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
   for (const auto& element : elements) {
     element_shapes.push_back(element->shape());
   }
-  auto literal = MakeUnique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
+  auto literal =
+      absl::make_unique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
   for (int64 i = 0; i < elements.size(); ++i) {
     TF_CHECK_OK(
         literal->MoveFrom(std::move(*elements[i]), /*dest_shape_index=*/{i}));
@@ -473,7 +476,7 @@ std::unique_ptr<Literal> ConvertType(LiteralSlice literal) {
 
 /* static */ string LiteralUtil::MultiIndexAsString(
     tensorflow::gtl::ArraySlice<int64> multi_index) {
-  return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}");
+  return StrCat("{", absl::StrJoin(multi_index, ","), "}");
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index e3737a9d00..3d28c070f2 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -27,6 +27,8 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
@@ -34,7 +36,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/sparse_index_array.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -43,7 +44,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/bitmap.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -187,7 +187,7 @@ class LiteralUtil {
       const Array4D<NativeT>& values, const Layout& layout);
 
   // Creates a new vector of U8s literal value from a string.
-  static std::unique_ptr<Literal> CreateR1U8(tensorflow::StringPiece value);
+  static std::unique_ptr<Literal> CreateR1U8(absl::string_view value);
 
   // Creates a linspace-populated literal with the given number of rows and
   // columns.
@@ -327,7 +327,7 @@ std::ostream& operator<<(std::ostream& out, const Literal& literal);
 
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateR0(NativeT value) {
-  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShape(
+  auto literal = absl::make_unique<Literal>(ShapeUtil::MakeShape(
       primitive_util::NativeToPrimitiveType<NativeT>(), {}));
   literal->Set({}, value);
   return literal;
@@ -336,7 +336,7 @@ template <typename NativeT>
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateR1(
     tensorflow::gtl::ArraySlice<NativeT> values) {
-  auto literal = MakeUnique<Literal>(
+  auto literal = absl::make_unique<Literal>(
       ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<NativeT>(),
                            {static_cast<int64>(values.size())}));
   literal->PopulateR1(values);
@@ -347,7 +347,7 @@ template <typename NativeT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateR2WithLayout(
     std::initializer_list<std::initializer_list<NativeT>> values,
     const Layout& layout) {
-  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShapeWithLayout(
+  auto literal = absl::make_unique<Literal>(ShapeUtil::MakeShapeWithLayout(
       primitive_util::NativeToPrimitiveType<NativeT>(),
       {static_cast<int64>(values.size()),
        static_cast<int64>(values.begin()->size())},
@@ -433,9 +433,10 @@ template <typename NativeT>
   int64 rank = dimensions.size();
   CHECK_EQ(num_elements, indices.index_count());
   CHECK_EQ(rank, indices.rank());
-  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShapeWithSparseLayout(
-      primitive_util::NativeToPrimitiveType<NativeT>(), dimensions,
-      indices.max_indices()));
+  auto literal =
+      absl::make_unique<Literal>(ShapeUtil::MakeShapeWithSparseLayout(
+          primitive_util::NativeToPrimitiveType<NativeT>(), dimensions,
+          indices.max_indices()));
   literal->PopulateSparse(indices, values, sort);
   return literal;
 }
@@ -451,7 +452,7 @@ template <typename NativeT>
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateFromArrayWithLayout(
     const Array<NativeT>& values, const Layout& layout) {
-  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShapeWithLayout(
+  auto literal = absl::make_unique<Literal>(ShapeUtil::MakeShapeWithLayout(
       primitive_util::NativeToPrimitiveType<NativeT>(), values.dimensions(),
       AsInt64Slice(layout.minor_to_major())));
   literal->PopulateFromArray(values);
@@ -571,8 +572,9 @@ template <typename NativeT>
 /* static */ std::unique_ptr<Literal>
 LiteralUtil::CreateFullWithDescendingLayout(
     tensorflow::gtl::ArraySlice<int64> dimensions, NativeT value) {
-  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShapeWithDescendingLayout(
-      primitive_util::NativeToPrimitiveType<NativeT>(), dimensions));
+  auto literal =
+      absl::make_unique<Literal>(ShapeUtil::MakeShapeWithDescendingLayout(
+          primitive_util::NativeToPrimitiveType<NativeT>(), dimensions));
   literal->PopulateWithValue(value);
   return literal;
 }
@@ -584,7 +586,7 @@ LiteralUtil::CreateRandomLiteral(
     const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator) {
   using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
   TF_RET_CHECK(shape.element_type() == type);
-  auto literal = MakeUnique<Literal>(shape);
+  auto literal = absl::make_unique<Literal>(shape);
   TF_RETURN_IF_ERROR(literal.get()->Populate<NativeT>(
       [&](tensorflow::gtl::ArraySlice<int64> indexes) {
         return generator(indexes);
diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc
index 69ef4f7a2f..2f22e02c3e 100644
--- a/tensorflow/compiler/xla/metric_table_report.cc
+++ b/tensorflow/compiler/xla/metric_table_report.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <cctype>
 #include <unordered_map>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -84,7 +85,7 @@ void MetricTableReport::WriteReportToInfoLog(double expected_metric_sum) {
     if (end_of_line == string::npos) {
       end_of_line = report.size();
     }
-    tensorflow::StringPiece line(report.data() + pos, end_of_line - pos);
+    absl::string_view line(report.data() + pos, end_of_line - pos);
 
     // TODO(b/34779244): Figure out how to do this without the verbose log-line
     // prefix. The usual way didn't compile on open source.
@@ -152,8 +153,8 @@ void MetricTableReport::AppendCategoryTable() {
     if (text.empty()) {
       text = "[no category]";
     }
-    tensorflow::strings::StrAppend(&text, " (", category.entries.size(), " ",
-                                   entry_name_, ")");
+    absl::StrAppend(&text, " (", category.entries.size(), " ", entry_name_,
+                    ")");
     AppendTableRow(text, category.metric_sum, metric_sum);
 
     // Show the top entries in the category.
@@ -177,9 +178,9 @@ void MetricTableReport::AppendCategoryTable() {
   }
   const int64 remaining_categories = categories.size() - categories_shown;
   if (remaining_categories > 0) {
-    AppendTableRow(tensorflow::strings::StrCat("... (", remaining_categories,
-                                               " more categories)"),
-                   expected_metric_sum_ - metric_sum, expected_metric_sum_);
+    AppendTableRow(
+        absl::StrCat("... (", remaining_categories, " more categories)"),
+        expected_metric_sum_ - metric_sum, expected_metric_sum_);
   }
 }
 
@@ -206,9 +207,9 @@ void MetricTableReport::AppendEntryTable() {
   }
   const int64 remaining_entries = entries_.size() - entries_shown;
   if (remaining_entries > 0) {
-    AppendTableRow(tensorflow::strings::StrCat("... (", remaining_entries,
-                                               " more ", entry_name_, ")"),
-                   expected_metric_sum_ - metric_sum, expected_metric_sum_);
+    AppendTableRow(
+        absl::StrCat("... (", remaining_entries, " more ", entry_name_, ")"),
+        expected_metric_sum_ - metric_sum, expected_metric_sum_);
   }
 }
 
@@ -241,10 +242,10 @@ double MetricTableReport::UnaccountedMetric() {
 
 string MetricTableReport::MetricString(double metric) {
   // Round to integer and stringify.
-  string s1 = tensorflow::strings::StrCat(std::llround(metric));
+  string s1 = absl::StrCat(std::llround(metric));
 
   // Code below commafies the string, e.g. "1234" becomes "1,234".
-  tensorflow::StringPiece sp1(s1);
+  absl::string_view sp1(s1);
   string output;
   // Copy leading non-digit characters unconditionally.
   // This picks up the leading sign.
diff --git a/tensorflow/compiler/xla/metric_table_report.h b/tensorflow/compiler/xla/metric_table_report.h
index 818fb1d3fe..062d8ed99b 100644
--- a/tensorflow/compiler/xla/metric_table_report.h
+++ b/tensorflow/compiler/xla/metric_table_report.h
@@ -18,9 +18,8 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 
@@ -108,7 +107,7 @@ class MetricTableReport {
   // Append all parameters to the report.
   template <typename... Args>
   void AppendLine(Args... args) {
-    tensorflow::strings::StrAppend(&report_, std::forward<Args>(args)..., "\n");
+    absl::StrAppend(&report_, std::forward<Args>(args)..., "\n");
   }
 
   // Represents a set of entries with the same category_text.
diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc
index 6b7fd10d63..012df87551 100644
--- a/tensorflow/compiler/xla/packed_literal_reader.cc
+++ b/tensorflow/compiler/xla/packed_literal_reader.cc
@@ -19,9 +19,9 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -57,14 +57,14 @@ StatusOr<std::unique_ptr<Literal>> PackedLiteralReader::Read(
         PrimitiveType_Name(shape.element_type()).c_str());
   }
 
-  auto result = MakeUnique<Literal>(literal_shape);
+  auto result = absl::make_unique<Literal>(literal_shape);
   result->PopulateWithValue(std::numeric_limits<float>::quiet_NaN());
 
   int64 elements = ShapeUtil::ElementsIn(shape);
   tensorflow::gtl::ArraySlice<float> field = result->data<float>();
   char* data = tensorflow::bit_cast<char*>(field.data());
   uint64 bytes = elements * sizeof(float);
-  tensorflow::StringPiece sp;
+  tensorflow::StringPiece sp;  // non-absl OK
   auto s = file_->Read(offset_, bytes, &sp, data);
   offset_ += sp.size();
   if (!s.ok()) {
@@ -85,7 +85,7 @@ bool PackedLiteralReader::IsExhausted() const {
   // Try to read a single byte from offset_.  If we can't, we've
   // exhausted the data.
   char single_byte[1];
-  tensorflow::StringPiece sp;
+  tensorflow::StringPiece sp;  // non-absl OK
   auto s = file_->Read(offset_, sizeof(single_byte), &sp, single_byte);
   return !s.ok();
 }
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index c8f2d65c22..2d8fe434b0 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -39,6 +39,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/python:numpy_lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -59,6 +60,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 8246f76d34..00e36c3c86 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/python/local_computation_builder.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
@@ -137,8 +137,7 @@ static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
 
 /* static */
 StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
-    const Literal& argument,
-    const tensorflow::gtl::optional<Shape>& shape_with_layout) {
+    const Literal& argument, const absl::optional<Shape>& shape_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
   StatusOr<ScopedShapedBuffer> buf = [&] {
     if (shape_with_layout) {
@@ -163,7 +162,7 @@ CompiledLocalComputation::CompiledLocalComputation(
 
 StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
     const std::vector<Literal>& arguments,
-    const std::vector<tensorflow::gtl::optional<Shape>>& shapes_with_layout) {
+    const std::vector<absl::optional<Shape>>& shapes_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
 
   VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas.";
@@ -194,7 +193,7 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
             scoped_buffers.reserve(arguments.size());
             for (int i = 0; i < arguments.size(); ++i) {
               const Literal& argument = arguments[i];
-              const tensorflow::gtl::optional<Shape>& shape_with_layout =
+              const absl::optional<Shape>& shape_with_layout =
                   shapes_with_layout[i];
 
               StatusOr<ScopedShapedBuffer> pushed;
@@ -575,6 +574,16 @@ StatusOr<bool> LocalComputationBuilder::IsConstant(const LocalOp& operand) {
   return builder_.IsConstant(operand.op());
 }
 
+LocalOp LocalComputationBuilder::Sort(const LocalOp& operand, int64 dimension) {
+  return xla::Sort(operand.op(), absl::nullopt, dimension);
+}
+
+LocalOp LocalComputationBuilder::SortKeyVal(const LocalOp& keys,
+                                            const LocalOp& values,
+                                            int64 dimension) {
+  return xla::Sort(keys.op(), values.op(), dimension);
+}
+
 StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
     const LocalOp& operand) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
@@ -640,7 +649,6 @@ _FORWARD_UNOP(Sin)
 _FORWARD_UNOP(Tanh)
 _FORWARD_UNOP(IsFinite)
 _FORWARD_UNOP(Neg)
-_FORWARD_UNOP(Sort)
 _FORWARD_UNOP(Sqrt)
 _FORWARD_UNOP(Rsqrt)
 _FORWARD_UNOP(Square)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index a568c24c63..d9543b958d 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -60,8 +60,7 @@ StatusOr<std::unique_ptr<Literal> > TransferFromOutfeedLocalReplica(
 class LocalShapedBuffer {
  public:
   static StatusOr<LocalShapedBuffer*> FromLiteral(
-      const Literal& argument,
-      const tensorflow::gtl::optional<Shape>& shape_with_layout);
+      const Literal& argument, const absl::optional<Shape>& shape_with_layout);
 
   LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
   const ScopedShapedBuffer* shaped_buffer() const;
@@ -120,7 +119,7 @@ class CompiledLocalComputation {
   // shapes_with_layout.
   StatusOr<std::unique_ptr<Literal> > Execute(
       const std::vector<Literal>& arguments,
-      const std::vector<tensorflow::gtl::optional<Shape> >& shapes_with_layout);
+      const std::vector<absl::optional<Shape> >& shapes_with_layout);
 
   LocalShapedBuffer* ExecuteWithShapedBuffers(
       tensorflow::gtl::ArraySlice<LocalShapedBuffer*> argument_handles);
@@ -301,6 +300,11 @@ class LocalComputationBuilder {
 
   StatusOr<bool> IsConstant(const LocalOp& operand);
 
+  LocalOp Sort(const LocalOp& operand, int64 dimension);
+
+  LocalOp SortKeyVal(const LocalOp& keys, const LocalOp& values,
+                     int64 dimension);
+
   StatusOr<LocalComputation*> BuildConstantSubGraph(const LocalOp& operand);
 
 #define _FORWARD(method_name, return_sig, args_sig) \
@@ -357,7 +361,6 @@ class LocalComputationBuilder {
   _FORWARD_UNOP(Tanh)
   _FORWARD_UNOP(IsFinite)
   _FORWARD_UNOP(Neg)
-  _FORWARD_UNOP(Sort)
   _FORWARD_UNOP(Sqrt)
   _FORWARD_UNOP(Rsqrt)
   _FORWARD_UNOP(Square)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 5d5a955bfe..08dccb3ee1 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -109,6 +109,7 @@ limitations under the License.
 // Must be included first
 #include "tensorflow/python/lib/core/numpy.h"
 
+#include "third_party/absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -409,10 +410,10 @@ tensorflow::ImportNumpy();
   $1 = &temp;
 }
 
-%typemap(in) const tensorflow::gtl::optional<Shape>& (
-    tensorflow::gtl::optional<Shape> temp) {
+%typemap(in) const absl::optional<Shape>& (
+    absl::optional<Shape> temp) {
   if ($input == Py_None) {
-    temp = tensorflow::gtl::nullopt;
+    temp = absl::nullopt;
     $1 = &temp;
   } else {
     StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
@@ -448,8 +449,8 @@ tensorflow::ImportNumpy();
   $1 = &temps;
 }
 
-%typemap(in) const std::vector<tensorflow::gtl::optional<Shape> >& (
-    std::vector<tensorflow::gtl::optional<Shape> > temps) {
+%typemap(in) const std::vector<absl::optional<Shape> >& (
+    std::vector<absl::optional<Shape> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
     SWIG_fail;
@@ -458,7 +459,7 @@ tensorflow::ImportNumpy();
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
     if (o == Py_None) {
-      temps.push_back(tensorflow::gtl::nullopt);
+      temps.push_back(absl::nullopt);
     } else {
       StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
       Py_DECREF(o);
@@ -896,7 +897,7 @@ tensorflow::ImportNumpy();
     if (o != Py_None) {
       StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
       if (!statusor.ok()) {
-        PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat("ExecutableBuildOptions.result_shape could not be created from Python shape value: ", statusor.status().ToString()).c_str());
+        PyErr_SetString(PyExc_TypeError, absl::StrCat("ExecutableBuildOptions.result_shape could not be created from Python shape value: ", statusor.status().ToString()).c_str());
         Py_DECREF(o);
         SWIG_fail;
       }
@@ -1011,6 +1012,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Pow;
 %unignore xla::swig::LocalComputationBuilder::Neg;
 %unignore xla::swig::LocalComputationBuilder::Sort;
+%unignore xla::swig::LocalComputationBuilder::SortKeyVal;
 %unignore xla::swig::LocalComputationBuilder::Sqrt;
 %unignore xla::swig::LocalComputationBuilder::Rsqrt;
 %unignore xla::swig::LocalComputationBuilder::Square;
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index 6f665faf61..f2f99c1745 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/python/numpy_bridge.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -191,8 +192,8 @@ StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
     PyObject* result =
         PyObject_CallMethod(o, const_cast<char*>(method.c_str()), nullptr);
     if (result == nullptr) {
-      return error(tensorflow::strings::StrCat(
-          "Failed to call method of shape object:", method));
+      return error(
+          absl::StrCat("Failed to call method of shape object:", method));
     }
     return result;
   };
@@ -281,15 +282,15 @@ StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
 
 // Helper that retrieves the member with attr_name, stringifies it if is not
 // None, and returns it as a C++ string.
-static tensorflow::gtl::optional<string> GetAttrAsString(
-    PyObject* o, const string& attr_name) {
+static absl::optional<string> GetAttrAsString(PyObject* o,
+                                              const string& attr_name) {
   if (!PyObject_HasAttrString(o, attr_name.c_str())) {
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
   PyObject* attr = PyObject_GetAttrString(o, attr_name.c_str());
   if (attr == Py_None) {
     Py_DECREF(attr);
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
   string result = PyObjectCppStr(attr);
   Py_DECREF(attr);
@@ -298,48 +299,46 @@ static tensorflow::gtl::optional<string> GetAttrAsString(
 
 // Helper that retrieves the member with attr_name, checks that it is an integer
 // if it is not None, and returns it as an int32 value.
-static tensorflow::gtl::optional<int32> GetAttrAsInt32(
-    PyObject* o, const string& attr_name) {
+static absl::optional<int32> GetAttrAsInt32(PyObject* o,
+                                            const string& attr_name) {
   if (!PyObject_HasAttrString(o, attr_name.c_str())) {
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
   PyObject* attr = PyObject_GetAttrString(o, attr_name.c_str());
   if (attr == Py_None) {
     Py_DECREF(attr);
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
   if (!CheckPyIntOrLong(attr)) {
     Py_DECREF(attr);
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
   long value = PyIntOrPyLongToLong(attr);  // NOLINT
   Py_DECREF(attr);
   if (value == -1 && PyErr_Occurred() != nullptr) {
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
   if (static_cast<int32>(value) != value) {
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
   return value;
 }
 
 StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o) {
   OpMetadata result;
-  tensorflow::gtl::optional<string> op_type = GetAttrAsString(o, "op_type");
+  absl::optional<string> op_type = GetAttrAsString(o, "op_type");
   if (op_type.has_value()) {
     result.set_op_type(op_type.value());
   }
-  tensorflow::gtl::optional<string> op_name = GetAttrAsString(o, "op_name");
+  absl::optional<string> op_name = GetAttrAsString(o, "op_name");
   if (op_name.has_value()) {
     result.set_op_name(op_name.value());
   }
-  tensorflow::gtl::optional<string> source_file =
-      GetAttrAsString(o, "source_file");
+  absl::optional<string> source_file = GetAttrAsString(o, "source_file");
   if (source_file.has_value()) {
     result.set_source_file(source_file.value());
   }
-  tensorflow::gtl::optional<int32> source_line =
-      GetAttrAsInt32(o, "source_line");
+  absl::optional<int32> source_line = GetAttrAsInt32(o, "source_line");
   if (source_line.has_value()) {
     result.set_source_line(source_line.value());
   }
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index a2c6fc344d..fa4366ff07 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -105,7 +105,6 @@ _UNARY_OPS = [
     'Square',
     'Reciprocal',
     'Neg',
-    'Sort',
     'Erf',
     'Erfc',
     'ErfInv',
@@ -1218,6 +1217,14 @@ class ComputationBuilder(object):
                                            lhs_dilation, rhs_dilation,
                                            dimension_numbers)
 
+  def Sort(self, operand, dimension=-1):
+    """Enqueues a sort operation onto the computation."""
+    return self._client.Sort(operand, dimension)
+
+  def SortKeyVal(self, keys, values, dimension=-1):
+    """Enqueues a key-value sort operation onto the computation."""
+    return self._client.SortKeyVal(keys, values, dimension)
+
 
 def _forward_methods_to_local_builder():
   """Forward remaining ComputationBuilder methods to the C API.
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index a803520876..3de7ee2bc8 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <array>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
@@ -43,7 +44,7 @@ std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
   int m = lhs.height();
   int n = rhs.width();
   int k = lhs.width();
-  auto result = MakeUnique<Array2D<T>>(m, n);
+  auto result = absl::make_unique<Array2D<T>>(m, n);
   // Because Eigen is a header-oriented library, make sure that the Eigen code
   // is the same as the code used by the CPU backend (otherwise the linker will
   // randomly pick *some* definition).
@@ -77,7 +78,8 @@ std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
 
 /* static */ std::unique_ptr<Array2D<double>> ReferenceUtil::Array2DF32ToF64(
     const Array2D<float>& input) {
-  auto result = MakeUnique<Array2D<double>>(input.height(), input.width());
+  auto result =
+      absl::make_unique<Array2D<double>>(input.height(), input.width());
   for (int64 rowno = 0; rowno < input.height(); ++rowno) {
     for (int64 colno = 0; colno < input.height(); ++colno) {
       (*result)(rowno, colno) = input(rowno, colno);
@@ -126,8 +128,8 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
       a4dlhs, a4drhs, {kernel_stride, 1}, padding, {lhs_dilation, 1},
       {rhs_dilation, 1}, dnums2d);
 
-  auto convr3 = MakeUnique<Array3D<float>>(convr4->planes(), convr4->depth(),
-                                           convr4->height());
+  auto convr3 = absl::make_unique<Array3D<float>>(
+      convr4->planes(), convr4->depth(), convr4->height());
   convr4->Each(
       [&](tensorflow::gtl::ArraySlice<int64> indices, float* value_ptr) {
         CHECK_EQ(indices[3], 0);
@@ -201,7 +203,7 @@ ReferenceUtil::ReduceWindow1DGeneric(
         window_util::StridedBound(padded_width, window[i], stride[i]);
     pad_low[i] = padding[i].first;
   }
-  auto result = MakeUnique<std::vector<float>>(window_counts[0]);
+  auto result = absl::make_unique<std::vector<float>>(window_counts[0]);
 
   // Do a full 1D reduce window.
   for (int64 i0 = 0; i0 < window_counts[0]; ++i0) {
@@ -247,7 +249,8 @@ ReferenceUtil::ReduceWindow2DGeneric(
         window_util::StridedBound(padded_width, window[i], stride[i]);
     pad_low[i] = padding[i].first;
   }
-  auto result = MakeUnique<Array2D<float>>(window_counts[0], window_counts[1]);
+  auto result =
+      absl::make_unique<Array2D<float>>(window_counts[0], window_counts[1]);
 
   // Do a full 2D reduce window.
   for (int64 i0 = 0; i0 < window_counts[0]; ++i0) {
@@ -296,8 +299,8 @@ ReferenceUtil::ReduceWindow2DGeneric(
         WindowCount(dim_lengths[i], window[i], stride[i], padding);
     pad_low[i] = padding_both[i].first;
   }
-  auto result = MakeUnique<Array3D<float>>(window_counts[0], window_counts[1],
-                                           window_counts[2]);
+  auto result = absl::make_unique<Array3D<float>>(
+      window_counts[0], window_counts[1], window_counts[2]);
 
   for (int64 i0 = 0; i0 < window_counts[0]; ++i0) {
     for (int64 i1 = 0; i1 < window_counts[1]; ++i1) {
@@ -358,8 +361,8 @@ ReferenceUtil::ReduceWindow4DGeneric(
         window_util::StridedBound(padded_width, window[i], stride[i]);
     pad_low[i] = padding[i].first;
   }
-  auto result = MakeUnique<Array4D<float>>(window_counts[0], window_counts[1],
-                                           window_counts[2], window_counts[3]);
+  auto result = absl::make_unique<Array4D<float>>(
+      window_counts[0], window_counts[1], window_counts[2], window_counts[3]);
   // Do a full 4D reduce window.
   for (int64 i0 = 0; i0 < window_counts[0]; ++i0) {
     for (int64 i1 = 0; i1 < window_counts[1]; ++i1) {
@@ -426,8 +429,8 @@ ReferenceUtil::SelectAndScatter4DGePlus(
     const tensorflow::gtl::ArraySlice<int64>& window,
     const tensorflow::gtl::ArraySlice<int64>& stride, bool same_padding) {
   Padding padding = same_padding ? Padding::kSame : Padding::kValid;
-  auto result = MakeUnique<Array4D<float>>(operand.n1(), operand.n2(),
-                                           operand.n3(), operand.n4());
+  auto result = absl::make_unique<Array4D<float>>(operand.n1(), operand.n2(),
+                                                  operand.n3(), operand.n4());
   std::vector<int64> dim_lengths{operand.n1(), operand.n2(), operand.n3(),
                                  operand.n4()};
   auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
@@ -583,10 +586,10 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
 
   CHECK_EQ(ShapeUtil::Rank(result_literal->shape()), 4);
   auto result =
-      MakeUnique<Array4D<float>>(result_literal->shape().dimensions(0),
-                                 result_literal->shape().dimensions(1),
-                                 result_literal->shape().dimensions(2),
-                                 result_literal->shape().dimensions(3));
+      absl::make_unique<Array4D<float>>(result_literal->shape().dimensions(0),
+                                        result_literal->shape().dimensions(1),
+                                        result_literal->shape().dimensions(2),
+                                        result_literal->shape().dimensions(3));
 
   result->Each([&](tensorflow::gtl::ArraySlice<int64> indices, float* value) {
     *value = result_literal->Get<float>(indices);
@@ -601,7 +604,7 @@ ReferenceUtil::ReduceToColArray2D(
     const std::function<float(float, float)>& reduce_function) {
   int64 rows = matrix.height();
   int64 cols = matrix.width();
-  auto result = MakeUnique<std::vector<float>>();
+  auto result = absl::make_unique<std::vector<float>>();
   for (int64 i = 0; i < rows; ++i) {
     float acc = init;
     for (int64 j = 0; j < cols; ++j) {
@@ -618,7 +621,7 @@ ReferenceUtil::ReduceToRowArray2D(
     const std::function<float(float, float)>& reduce_function) {
   int64 rows = matrix.height();
   int64 cols = matrix.width();
-  auto result = MakeUnique<std::vector<float>>();
+  auto result = absl::make_unique<std::vector<float>>();
   for (int64 i = 0; i < cols; ++i) {
     float acc = init;
     for (int64 j = 0; j < rows; ++j) {
@@ -674,8 +677,8 @@ ReferenceUtil::ReduceToRowArray2D(
 /* static */ std::unique_ptr<Array4D<float>> ReferenceUtil::Broadcast1DTo4D(
     const std::vector<float>& array, const std::vector<int64>& bounds,
     int64 broadcast_from_dim) {
-  auto result =
-      MakeUnique<Array4D<float>>(bounds[0], bounds[1], bounds[2], bounds[3]);
+  auto result = absl::make_unique<Array4D<float>>(bounds[0], bounds[1],
+                                                  bounds[2], bounds[3]);
   for (int64 i = 0; i < result->n1(); ++i) {
     for (int64 j = 0; j < result->n2(); ++j) {
       for (int64 k = 0; k < result->n3(); ++k) {
@@ -710,7 +713,7 @@ ReferenceUtil::ReduceToRowArray2D(
   CHECK_EQ(dims.size(), 1);
   int64 rows = dims[0] == 0 ? array.n2() : array.n1();
   int64 cols = dims[0] == 2 ? array.n2() : array.n3();
-  auto result = MakeUnique<Array2D<float>>(rows, cols);
+  auto result = absl::make_unique<Array2D<float>>(rows, cols);
   result->Fill(init);
   for (int i0 = 0; i0 < array.n1(); ++i0) {
     for (int i1 = 0; i1 < array.n2(); ++i1) {
@@ -730,7 +733,7 @@ ReferenceUtil::ReduceToRowArray2D(
     const std::function<float(float)>& map_function) {
   int64 rows = matrix.height();
   int64 cols = matrix.width();
-  auto result = MakeUnique<Array2D<float>>(rows, cols);
+  auto result = absl::make_unique<Array2D<float>>(rows, cols);
   for (int64 i = 0; i < rows; ++i) {
     for (int64 j = 0; j < cols; ++j) {
       (*result)(i, j) = map_function(matrix(i, j));
@@ -746,7 +749,7 @@ ReferenceUtil::ReduceToRowArray2D(
   CHECK_EQ(lhs.width(), rhs.width());
   int64 rows = lhs.height();
   int64 cols = rhs.width();
-  auto result = MakeUnique<Array2D<float>>(rows, cols);
+  auto result = absl::make_unique<Array2D<float>>(rows, cols);
   for (int64 i = 0; i < rows; ++i) {
     for (int64 j = 0; j < cols; ++j) {
       (*result)(i, j) = map_function(lhs(i, j), rhs(i, j));
@@ -760,7 +763,7 @@ ReferenceUtil::ReduceToRowArray2D(
     const std::function<float(float, int64, int64)>& map_function) {
   int64 rows = matrix.height();
   int64 cols = matrix.width();
-  auto result = MakeUnique<Array2D<float>>(rows, cols);
+  auto result = absl::make_unique<Array2D<float>>(rows, cols);
   for (int64 i = 0; i < rows; ++i) {
     for (int64 j = 0; j < cols; ++j) {
       (*result)(i, j) = map_function(matrix(i, j), i, j);
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 8fa6961d19..88f853a359 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -22,11 +22,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -42,7 +42,8 @@ class ReferenceUtil {
   template <typename T>
   static std::unique_ptr<Array2D<T>> TransposeArray2D(
       const Array2D<T>& operand) {
-    auto result = MakeUnique<Array2D<T>>(operand.width(), operand.height());
+    auto result =
+        absl::make_unique<Array2D<T>>(operand.width(), operand.height());
     for (int64 w = 0; w < operand.width(); ++w) {
       for (int64 h = 0; h < operand.height(); ++h) {
         (*result)(w, h) = operand(h, w);
@@ -242,7 +243,7 @@ class ReferenceUtil {
                                               const Array2D<T>& rhs,
                                               int concatenate_dimension) {
     CHECK(0 <= concatenate_dimension && concatenate_dimension < 2);
-    auto result = MakeUnique<Array2D<T>>(
+    auto result = absl::make_unique<Array2D<T>>(
         concatenate_dimension == 0 ? lhs.n1() + rhs.n1() : lhs.n1(),
         concatenate_dimension == 1 ? lhs.n2() + rhs.n2() : lhs.n2());
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
@@ -276,7 +277,8 @@ class ReferenceUtil {
         out_dims[i] = lhs_dims[i] + rhs_dims[i];
       }
     }
-    auto result = MakeUnique<Array3D<T>>(out_dims[0], out_dims[1], out_dims[2]);
+    auto result =
+        absl::make_unique<Array3D<T>>(out_dims[0], out_dims[1], out_dims[2]);
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
       for (int64 i1 = 0; i1 < result->n2(); ++i1) {
         for (int64 i2 = 0; i2 < result->n3(); ++i2) {
@@ -310,8 +312,8 @@ class ReferenceUtil {
         out_dims[i] = lhs_dims[i] + rhs_dims[i];
       }
     }
-    auto result = MakeUnique<Array4D<T>>(out_dims[0], out_dims[1], out_dims[2],
-                                         out_dims[3]);
+    auto result = absl::make_unique<Array4D<T>>(out_dims[0], out_dims[1],
+                                                out_dims[2], out_dims[3]);
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
       for (int64 i1 = 0; i1 < result->n2(); ++i1) {
         for (int64 i2 = 0; i2 < result->n3(); ++i2) {
@@ -355,9 +357,9 @@ class ReferenceUtil {
     CHECK_LE(limits[1], input.n2());
     CHECK_GE(strides[0], 1);
     CHECK_GE(strides[1], 1);
-    auto result =
-        MakeUnique<Array2D<T>>(CeilOfRatio(limits[0] - starts[0], strides[0]),
-                               CeilOfRatio(limits[1] - starts[1], strides[1]));
+    auto result = absl::make_unique<Array2D<T>>(
+        CeilOfRatio(limits[0] - starts[0], strides[0]),
+        CeilOfRatio(limits[1] - starts[1], strides[1]));
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
       for (int64 i1 = 0; i1 < result->n2(); ++i1) {
         (*result)(i0, i1) =
@@ -381,10 +383,10 @@ class ReferenceUtil {
     CHECK_GE(strides[0], 1);
     CHECK_GE(strides[1], 1);
     CHECK_GE(strides[2], 1);
-    auto result =
-        MakeUnique<Array3D<T>>(CeilOfRatio(limits[0] - starts[0], strides[0]),
-                               CeilOfRatio(limits[1] - starts[1], strides[1]),
-                               CeilOfRatio(limits[2] - starts[2], strides[2]));
+    auto result = absl::make_unique<Array3D<T>>(
+        CeilOfRatio(limits[0] - starts[0], strides[0]),
+        CeilOfRatio(limits[1] - starts[1], strides[1]),
+        CeilOfRatio(limits[2] - starts[2], strides[2]));
 
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
       for (int64 i1 = 0; i1 < result->n2(); ++i1) {
@@ -415,11 +417,11 @@ class ReferenceUtil {
     CHECK_GE(strides[1], 1);
     CHECK_GE(strides[2], 1);
     CHECK_GE(strides[3], 1);
-    auto result =
-        MakeUnique<Array4D<T>>(CeilOfRatio(limits[0] - starts[0], strides[0]),
-                               CeilOfRatio(limits[1] - starts[1], strides[1]),
-                               CeilOfRatio(limits[2] - starts[2], strides[2]),
-                               CeilOfRatio(limits[3] - starts[3], strides[3]));
+    auto result = absl::make_unique<Array4D<T>>(
+        CeilOfRatio(limits[0] - starts[0], strides[0]),
+        CeilOfRatio(limits[1] - starts[1], strides[1]),
+        CeilOfRatio(limits[2] - starts[2], strides[2]),
+        CeilOfRatio(limits[3] - starts[3], strides[3]));
     for (int64 i0 = 0; i0 < result->n1(); ++i0) {
       for (int64 i1 = 0; i1 < result->n2(); ++i1) {
         for (int64 i2 = 0; i2 < result->n3(); ++i2) {
@@ -460,8 +462,8 @@ class ReferenceUtil {
   template <typename F>
   static std::unique_ptr<Array4D<float>> MapWithIndexArray4D(
       const Array4D<float>& input, F&& map_function) {
-    auto result = MakeUnique<Array4D<float>>(input.planes(), input.depth(),
-                                             input.height(), input.width());
+    auto result = absl::make_unique<Array4D<float>>(
+        input.planes(), input.depth(), input.height(), input.width());
     for (int64 plane = 0; plane < input.planes(); ++plane) {
       for (int64 depth = 0; depth < input.depth(); ++depth) {
         for (int64 height = 0; height < input.height(); ++height) {
@@ -495,8 +497,8 @@ class ReferenceUtil {
   template <typename F>
   static std::unique_ptr<Array4D<float>> MapWithIndexArray4D(
       const Array4D<float>& lhs, const Array4D<float>& rhs, F&& map_function) {
-    auto result = MakeUnique<Array4D<float>>(lhs.planes(), lhs.depth(),
-                                             lhs.height(), lhs.width());
+    auto result = absl::make_unique<Array4D<float>>(lhs.planes(), lhs.depth(),
+                                                    lhs.height(), lhs.width());
     for (int64 plane = 0; plane < lhs.planes(); ++plane) {
       for (int64 depth = 0; depth < lhs.depth(); ++depth) {
         for (int64 height = 0; height < lhs.height(); ++height) {
@@ -530,7 +532,7 @@ class ReferenceUtil {
     int64 out1 =
         in1 + low_padding1 + high_padding1 + (in1 - 1) * interior_padding1;
 
-    auto result = MakeUnique<Array2D<NativeT>>(out0, out1);
+    auto result = absl::make_unique<Array2D<NativeT>>(out0, out1);
     result->Fill(pad);
     int64 o0 = low_padding0;
     for (int64 i0 = 0; i0 < in0; ++i0) {
@@ -669,7 +671,7 @@ class ReferenceUtil {
   static std::unique_ptr<Array2D<T1>> ApplyElementwise2D(
       F&& f, const Array2D<T1>& array1, const Array2D<Ts>&... arrays) {
     AssertSameSize2D(array1, arrays...);
-    auto result = MakeUnique<Array2D<T1>>(array1.n1(), array1.n2());
+    auto result = absl::make_unique<Array2D<T1>>(array1.n1(), array1.n2());
     for (int64 i = 0; i < array1.n1(); ++i) {
       for (int64 j = 0; j < array1.n2(); ++j) {
         (*result)(i, j) = f(array1(i, j), arrays(i, j)...);
diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc
index 8091bed499..3ec0192148 100644
--- a/tensorflow/compiler/xla/reference_util_test.cc
+++ b/tensorflow/compiler/xla/reference_util_test.cc
@@ -18,12 +18,12 @@ limitations under the License.
 #include <cmath>
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -36,7 +36,7 @@ namespace {
 class ReferenceUtilTest : public ::testing::Test {
  protected:
   ReferenceUtilTest() {
-    matrix_ = MakeUnique<Array2D<float>>(rows_, cols_);
+    matrix_ = absl::make_unique<Array2D<float>>(rows_, cols_);
     // [1.f  2.f  3.f]
     // [4.f  5.f  6.f]
     for (int64 i = 0; i < rows_; ++i) {
@@ -112,8 +112,8 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray2D) {
 }
 
 TEST_F(ReferenceUtilTest, MapArray4D) {
-  auto input = MakeUnique<Array4D<float>>(/*planes=*/2, /*depth=*/3,
-                                          /*height=*/4, /*width=*/5);
+  auto input = absl::make_unique<Array4D<float>>(/*planes=*/2, /*depth=*/3,
+                                                 /*height=*/4, /*width=*/5);
   input->FillWithMultiples(1.0f);
   auto multiply_by_two = [](float value) { return 2 * value; };
   auto result = ReferenceUtil::MapArray4D(*input, multiply_by_two);
@@ -126,8 +126,8 @@ TEST_F(ReferenceUtilTest, MapArray4D) {
 }
 
 TEST_F(ReferenceUtilTest, MapWithIndexArray4D) {
-  auto input = MakeUnique<Array4D<float>>(/*planes=*/2, /*depth=*/3,
-                                          /*height=*/4, /*width=*/5);
+  auto input = absl::make_unique<Array4D<float>>(/*planes=*/2, /*depth=*/3,
+                                                 /*height=*/4, /*width=*/5);
   input->FillWithMultiples(1.0f);
   auto subtract_index = [](float value, int64 plane, int64 depth, int64 height,
                            int64 width) {
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 7d315fa0d3..47d376c8ac 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -99,6 +99,7 @@ cc_library(
         ":bfloat16_support",
         ":hlo",
         ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
@@ -175,6 +176,8 @@ cc_library(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -237,6 +240,11 @@ cc_library(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -263,6 +271,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -311,6 +320,10 @@ cc_library(
         "//tensorflow/core:human_readable_json",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -337,7 +350,7 @@ cc_library(
     deps = [
         ":hlo",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -389,7 +402,8 @@ cc_library(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -449,6 +463,8 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -517,6 +533,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -552,6 +569,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -574,6 +592,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -615,6 +635,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
 )
@@ -647,6 +669,8 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -669,6 +693,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -719,6 +744,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -736,6 +763,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -766,6 +794,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -813,6 +842,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -831,6 +862,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -847,6 +880,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -864,6 +898,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -874,6 +910,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:types",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -908,6 +945,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -917,12 +955,14 @@ tf_cc_test(
     deps = [
         ":buffer_liveness",
         ":hlo",
+        ":hlo_dataflow_analysis",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -950,6 +990,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -977,6 +1019,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -996,6 +1039,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1031,6 +1075,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1049,6 +1094,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1065,6 +1111,8 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -1074,6 +1122,7 @@ cc_library(
     hdrs = ["hlo_module_group_util.h"],
     deps = [
         ":hlo",
+        ":hlo_casting_utils",
         ":hlo_module_group_metadata",
         ":hlo_reachability",
         "//tensorflow/compiler/xla:status",
@@ -1082,6 +1131,8 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1101,6 +1152,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -1108,17 +1160,18 @@ tf_cc_test(
     name = "hlo_scheduling_test",
     srcs = ["hlo_scheduling_test.cc"],
     deps = [
-        ":buffer_value",
         ":heap_simulator",
         ":hlo",
+        ":hlo_dce",
         ":hlo_ordering",
+        ":hlo_parser",
         ":hlo_scheduling",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -1142,6 +1195,7 @@ cc_library(
         ":hlo_pass",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -1167,6 +1221,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1181,6 +1236,9 @@ cc_library(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1198,6 +1256,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1216,6 +1275,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -1231,6 +1291,22 @@ cc_library(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
+cc_library(
+    name = "scatter_expander",
+    srcs = ["scatter_expander.cc"],
+    hdrs = ["scatter_expander.h"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_pass",
+        ":while_util",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:statusor",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -1253,6 +1329,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1275,6 +1352,10 @@ cc_library(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -1298,6 +1379,8 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1309,8 +1392,7 @@ cc_library(
         ":hlo",
         ":hlo_creation_utils",
         ":hlo_pass",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -1363,6 +1445,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1386,14 +1469,48 @@ tf_cc_test(
 )
 
 cc_library(
+    name = "convolution_feature_group_converter",
+    srcs = ["convolution_feature_group_converter.cc"],
+    hdrs = ["convolution_feature_group_converter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "convolution_feature_group_converter_test",
+    size = "small",
+    srcs = ["convolution_feature_group_converter_test.cc"],
+    deps = [
+        ":convolution_feature_group_converter",
+        ":hlo",
+        ":hlo_matchers",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+    ],
+)
+
+cc_library(
     name = "while_loop_analysis",
     srcs = ["while_loop_analysis.cc"],
     hdrs = ["while_loop_analysis.h"],
     deps = [
         ":hlo",
         ":hlo_evaluator",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -1408,6 +1525,8 @@ cc_library(
         ":while_loop_analysis",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -1421,6 +1540,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1535,6 +1655,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -1555,6 +1676,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1588,6 +1710,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1607,6 +1730,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = True,  # Contains per-platform computation placer registration
 )
@@ -1620,6 +1745,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1697,6 +1823,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1711,6 +1839,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1742,6 +1871,8 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1758,6 +1889,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1773,6 +1905,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1800,6 +1933,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1817,6 +1951,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1835,6 +1971,9 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1876,6 +2015,8 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1912,6 +2053,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1932,6 +2074,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1969,6 +2112,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1981,7 +2125,6 @@ cc_library(
         ":hlo_dataflow_analysis",
         ":logical_buffer",
         ":logical_buffer_analysis",
-        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
@@ -1989,6 +2132,9 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2039,6 +2185,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2061,6 +2209,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2129,6 +2278,8 @@ cc_library(
         ":shape_inference",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2165,13 +2316,15 @@ cc_library(
         ":hlo_scheduling",
         ":logical_buffer",
         ":tuple_points_to_analysis",
-        ":tuple_simplifier",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2211,6 +2364,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2292,6 +2446,8 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2329,6 +2485,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2345,6 +2502,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2355,6 +2513,7 @@ tf_cc_test(
         ":hlo",
         ":hlo_constant_folding",
         ":hlo_matchers",
+        ":hlo_parser",
         ":hlo_pass",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -2376,6 +2535,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2390,6 +2550,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2450,6 +2611,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2518,6 +2680,8 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
         "@llvm//:transform_utils",
     ],
@@ -2549,10 +2713,11 @@ cc_library(
         ":computation_layout",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -2565,6 +2730,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2601,8 +2767,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2636,6 +2802,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
     alwayslink = 1,
 )
@@ -2652,6 +2820,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2733,9 +2902,9 @@ cc_library(
     hdrs = ["stream_pool.h"],
     deps = [
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2833,6 +3002,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -2879,7 +3049,8 @@ cc_library(
         ":hlo_creation_utils",
         ":tuple_util",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2893,6 +3064,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -2908,6 +3080,8 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
     ],
 )
 
@@ -2935,6 +3109,7 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -2989,6 +3164,10 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -3022,6 +3201,9 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -3030,11 +3212,13 @@ tf_cc_test(
     size = "small",
     srcs = ["hlo_parser_test.cc"],
     deps = [
+        ":hlo_matchers",
         ":hlo_parser",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",  # fixdeps: keep
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -3053,6 +3237,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:regexp_internal",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 37834e1cc2..c236453fc7 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -22,6 +22,10 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -41,7 +45,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -266,7 +269,7 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   StatusOr<HloInstruction*> OptimizeDotOfConcat(HloInstruction* dot);
   StatusOr<HloInstruction*> OptimizeDotOfConcatHelper(
-      const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim,
+      const HloInstruction& dot, HloInstruction* lhs, int64 lhs_contracting_dim,
       HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped);
 
   StatusOr<HloInstruction*> OptimizeDotOfGather(HloInstruction* dot);
@@ -540,7 +543,7 @@ Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant) {
   // If a literal is all the same element replace it with a scalar broadcast.
   if (ShapeUtil::ElementsIn(constant->shape()) > 1 &&
       constant->literal().IsAllFirst()) {
-    std::unique_ptr<Literal> unique_scalar = MakeUnique<Literal>(
+    std::unique_ptr<Literal> unique_scalar = absl::make_unique<Literal>(
         LiteralUtil::GetFirstScalarLiteral(constant->literal()));
     HloInstruction* scalar = computation_->AddInstruction(
         HloInstruction::CreateConstant(std::move(unique_scalar)));
@@ -827,18 +830,18 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcat(
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * optimized_lhs_concat,
-      OptimizeDotOfConcatHelper(dot->shape(), lhs, lhs_contracting_dim, rhs,
+      OptimizeDotOfConcatHelper(*dot, lhs, lhs_contracting_dim, rhs,
                                 rhs_contracting_dim, /*swapped=*/false));
   if (optimized_lhs_concat) {
     return optimized_lhs_concat;
   }
 
-  return OptimizeDotOfConcatHelper(dot->shape(), rhs, rhs_contracting_dim, lhs,
+  return OptimizeDotOfConcatHelper(*dot, rhs, rhs_contracting_dim, lhs,
                                    lhs_contracting_dim, /*swapped=*/true);
 }
 
 StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper(
-    const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim,
+    const HloInstruction& dot, HloInstruction* lhs, int64 lhs_contracting_dim,
     HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped) {
   bool can_optimize = lhs->opcode() == HloOpcode::kConcatenate &&
                       lhs->concatenate_dimension() == lhs_contracting_dim &&
@@ -937,11 +940,12 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper(
     }
 
     auto* new_dot = computation_->AddInstruction(HloInstruction::CreateDot(
-        dot_shape, new_dot_lhs, new_dot_rhs, new_dot_dnums));
+        dot.shape(), new_dot_lhs, new_dot_rhs, new_dot_dnums));
+    new_dot->set_precision_config(dot.precision_config());
 
     if (add_result) {
       add_result = computation_->AddInstruction(HloInstruction::CreateBinary(
-          dot_shape, HloOpcode::kAdd, add_result, new_dot));
+          dot.shape(), HloOpcode::kAdd, add_result, new_dot));
     } else {
       add_result = new_dot;
     }
@@ -1040,6 +1044,7 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
   auto memoized_shape = ShapeUtil::MakeShape(F32, {m, n});
   auto* memoized_inst = computation_->AddInstruction(HloInstruction::CreateDot(
       memoized_shape, left_operand, right_operand, dnums));
+  memoized_inst->set_precision_config(dot->precision_config());
   // Get pair {start, 0} or {0, start}.
   HloInstruction* original_start_indices =
       lhs_is_dynamic_slice ? lhs->mutable_operand(1) : rhs->mutable_operand(1);
@@ -1137,6 +1142,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
         ShapeUtil::PermuteDimensions({1, 0}, dot->shape()),
         rhs->mutable_operand(0), lhs->mutable_operand(0),
         dot_dimension_numbers));
+    new_dot->set_precision_config(dot->precision_config());
     return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
   }
@@ -1705,6 +1711,10 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
         reshape, HloInstruction::CreateReshape(reshape->shape(),
                                                operand->mutable_operand(0)));
   }
+  if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) {
+    *operand->mutable_shape() = reshape->shape();
+    return ReplaceInstruction(reshape, operand);
+  }
 
   if (HloOpcode::kBroadcast == reshape->operand(0)->opcode()) {
     auto opt_dims = ReshapeLeavesDimensionsUnmodified(
@@ -1748,8 +1758,8 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
   }
 
   auto is_unstrided_slice = [](const HloInstruction* hlo) {
-    return c_all_of(hlo->slice_strides(),
-                    [](int64 stride) { return stride == 1; });
+    return absl::c_all_of(hlo->slice_strides(),
+                          [](int64 stride) { return stride == 1; });
   };
   if (slice->operand(0)->opcode() == HloOpcode::kSlice &&
       is_unstrided_slice(slice) && is_unstrided_slice(slice->operand(0))) {
@@ -1926,7 +1936,8 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
   // This should make fusion easier or use less memory bandwidth in the unfused
   // case.
   if (arg->opcode() == HloOpcode::kConcatenate &&
-      c_linear_search(reduce->dimensions(), arg->concatenate_dimension())) {
+      absl::c_linear_search(reduce->dimensions(),
+                            arg->concatenate_dimension())) {
     HloInstruction* old_reduce = nullptr;
     for (HloInstruction* operand : arg->operands()) {
       HloInstruction* new_reduce = computation_->AddInstruction(
@@ -1979,9 +1990,9 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
 
   VLOG(10) << "Considering folding Pad: " << pad->ToString()
            << "\ninto reduce-window: " << reduce_window->ToString()
-           << (convert != nullptr ? tensorflow::strings::StrCat(
-                                        "\nvia convert: ", convert->ToString())
-                                  : "");
+           << (convert != nullptr
+                   ? absl::StrCat("\nvia convert: ", convert->ToString())
+                   : "");
 
   // Do not fold interior padding into ReduceWindow since the backends do not
   // support it.
@@ -2144,6 +2155,11 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
                                            transpose->dimensions())));
   }
 
+  if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) {
+    *operand->mutable_shape() = transpose->shape();
+    return ReplaceInstruction(transpose, operand);
+  }
+
   if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) {
     ReplaceWithBitcast(transpose);
     return Status::OK();
@@ -2285,6 +2301,8 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
   dot_dimension_numbers.add_rhs_contracting_dimensions(0);
   auto dot = computation_->AddInstruction(HloInstruction::CreateDot(
       dot_output_shape, new_lhs, new_rhs, dot_dimension_numbers));
+  dot->set_precision_config(convolution->precision_config());
+
   return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot));
 }
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index c48196e861..b864c372fa 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -47,7 +47,7 @@ class AlgebraicSimplifier : public HloPassInterface {
         enable_dot_strength_reduction_(enable_dot_strength_reduction),
         enable_conv_simplification_(enable_conv_simplification) {}
   ~AlgebraicSimplifier() override = default;
-  tensorflow::StringPiece name() const override { return "algsimp"; }
+  absl::string_view name() const override { return "algsimp"; }
 
   // Run algebraic simplification on the given computation. Returns whether the
   // computation was changed.
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 862cbeeba6..bb63ea26d4 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -18,9 +18,11 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
@@ -34,13 +36,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-
-using ::testing::ElementsAre;
 
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
+
 namespace op = xla::testing::opcode_matchers;
 
 AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() {
@@ -51,7 +52,12 @@ AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() {
   return [](const Shape&, const Shape&) { return false; };
 }
 
-class AlgebraicSimplifierTest : public HloVerifiedTestBase {};
+class AlgebraicSimplifierTest : public HloVerifiedTestBase {
+ public:
+  AlgebraicSimplifierTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+};
 
 // Test that A + 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, AddZero) {
@@ -1428,6 +1434,37 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 }
 
+// Test transforming reshapes and transposes of rng.
+TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+  HloInstruction* one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+  HloInstruction* rng0 = builder.AddInstruction(
+      HloInstruction::CreateRng(ShapeUtil::MakeShape(F32, {2, 2}),
+                                RandomDistribution::RNG_UNIFORM, {zero, one}));
+
+  HloInstruction* transpose = builder.AddInstruction(
+      HloInstruction::CreateTranspose(rng0->shape(), rng0, {1, 0}));
+  Shape reshape_shape = builder
+                            .AddInstruction(HloInstruction::CreateReshape(
+                                ShapeUtil::MakeShape(F32, {4}), transpose))
+                            ->shape();
+
+  auto computation = module().AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 bitcasting_callback());
+  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
+
+  // Verify that that reshape(transpose(rng)) is replace by a single rng of the
+  // same shape as the reshape.
+  EXPECT_THAT(computation->root_instruction(), op::Rng());
+  EXPECT_TRUE(ShapeUtil::Equal(computation->root_instruction()->shape(),
+                               reshape_shape));
+}
+
 // Test transforming reshapes to bitcasts under various conditions.
 TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   HloComputation::Builder builder(TestName());
@@ -2006,7 +2043,7 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
   // Builds a convolution from <options> and runs algebraic simplification on
   // the computation. Returns a string description of the result of
   // simplification.
-  auto build_and_simplify = [&options]() -> string {
+  auto build_and_simplify = [&]() -> string {
     HloComputation::Builder b(TestName());
 
     Window window;
@@ -2112,9 +2149,8 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
         root->operand(0)->opcode() == HloOpcode::kDot) {
       auto lhs_shape = root->operand(0)->operand(0)->shape();
       auto rhs_shape = root->operand(0)->operand(1)->shape();
-      return tensorflow::strings::StrCat(
-          tensorflow::str_util::Join(lhs_shape.dimensions(), "x"), " DOT ",
-          tensorflow::str_util::Join(rhs_shape.dimensions(), "x"));
+      return absl::StrCat(absl::StrJoin(lhs_shape.dimensions(), "x"), " DOT ",
+                          absl::StrJoin(rhs_shape.dimensions(), "x"));
     }
     return "UNEXPECTED CHANGE";
   };
@@ -2629,11 +2665,10 @@ struct PadReduceWindowEffectiveBroadcastCase {
   bool should_become_broadcast;
 
   string ToTestCaseName() const {
-    return tensorflow::strings::StrCat(
-        tensorflow::str_util::Join(input_spatials, ","), ";",
-        tensorflow::str_util::Join(symmetric_pad_spatials, ","), ";",
-        tensorflow::str_util::Join(reduce_window_spatials, ","), ";", prepend_a,
-        ";", should_become_broadcast);
+    return absl::StrCat(absl::StrJoin(input_spatials, ","), ";",
+                        absl::StrJoin(symmetric_pad_spatials, ","), ";",
+                        absl::StrJoin(reduce_window_spatials, ","), ";",
+                        prepend_a, ";", should_become_broadcast);
   }
 };
 
@@ -2821,7 +2856,12 @@ struct DotOfConcatTestSpec {
 
 class DotOfConcatSimplificationTest
     : public HloVerifiedTestBase,
-      public ::testing::WithParamInterface<DotOfConcatTestSpec> {};
+      public ::testing::WithParamInterface<DotOfConcatTestSpec> {
+ public:
+  DotOfConcatSimplificationTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+};
 
 // Test that we transform
 //  dot(const, concat(A, B, C))
@@ -2994,7 +3034,12 @@ struct DotOfGatherTestSpec {
 
 class DotOfGatherSimplificationTest
     : public HloVerifiedTestBase,
-      public ::testing::WithParamInterface<DotOfGatherTestSpec> {};
+      public ::testing::WithParamInterface<DotOfGatherTestSpec> {
+ public:
+  DotOfGatherSimplificationTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+};
 
 // input: dot(DS(ctA), ctB))
 // where DS(ctA) = DS({M x K}, {s, 0}, {1, K}) and ctB = {K x N}.
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 51ebc4763b..5115a14df0 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -17,15 +17,15 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -91,8 +91,9 @@ StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
     // If ShapedBufferTy is ScopedShapedBuffer, release the ScopedShapedBuffer
     // into a regular ShapedBuffer, which is stored in
     // handle_to_shaped_buffers_.
-    handle_to_shaped_buffers_[handle].emplace_back(MakeUnique<ShapedBuffer>(
-        ReleaseIfScopedShapedBuffer(std::move(shaped_buffer))));
+    handle_to_shaped_buffers_[handle].emplace_back(
+        absl::make_unique<ShapedBuffer>(
+            ReleaseIfScopedShapedBuffer(std::move(shaped_buffer))));
   }
 
   GlobalDataHandle result;
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index d12be3e007..841d0fa85b 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
@@ -127,8 +128,8 @@ Backend::Backend(
     }
   }
   // Create a memory allocator for the valid stream executors.
-  memory_allocator_ =
-      MakeUnique<StreamExecutorMemoryAllocator>(platform, stream_executors);
+  memory_allocator_ = absl::make_unique<StreamExecutorMemoryAllocator>(
+      platform, stream_executors);
   CHECK(!stream_executors_.empty())
       << "Service found no devices for backend " << platform_->Name() << '.';
 
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index 1bc3796fa4..4a6a78daf0 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -29,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -130,7 +130,7 @@ class Backend {
 
   // Return a string identifier for the given device, eg: "GPU:3".
   string device_name(int device_ordinal) const {
-    return tensorflow::strings::StrCat(platform_->Name(), ":", device_ordinal);
+    return absl::StrCat(platform_->Name(), ":", device_ordinal);
   }
 
   // Returns true if the devices with the given ordinals are equivalent from
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.cc b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
index 2099916509..a16b85a0a5 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/batch_dot_simplification.h"
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 
@@ -63,6 +64,7 @@ BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot(
 
   TF_ASSIGN_OR_RETURN(HloInstruction * new_dot,
                       MakeDotHlo(new_lhs, new_rhs, new_dim_numbers));
+  new_dot->set_precision_config(batch_dot->precision_config());
 
   TF_ASSIGN_OR_RETURN(HloInstruction * new_dot_reshaped,
                       MakeReshapeHlo(batch_dot->shape(), new_dot));
@@ -76,7 +78,7 @@ BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot(
   return true;
 }
 
-tensorflow::StringPiece BatchDotSimplification::name() const {
+absl::string_view BatchDotSimplification::name() const {
   return "batch-dot-simplification";
 }
 
@@ -84,10 +86,10 @@ StatusOr<bool> BatchDotSimplification::Run(HloModule* module) {
   bool changed = false;
   std::vector<HloInstruction*> dot_instrs;
   for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    c_copy_if(computation->instructions(), std::back_inserter(dot_instrs),
-              [](HloInstruction* instr) {
-                return instr->opcode() == HloOpcode::kDot;
-              });
+    absl::c_copy_if(computation->instructions(), std::back_inserter(dot_instrs),
+                    [](HloInstruction* instr) {
+                      return instr->opcode() == HloOpcode::kDot;
+                    });
   }
   for (HloInstruction* dot_instr : dot_instrs) {
     TF_ASSIGN_OR_RETURN(bool elided_batch_dim_from_one,
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.h b/tensorflow/compiler/xla/service/batch_dot_simplification.h
index c0ca8d8eba..79d37f08d3 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification.h
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification.h
@@ -28,7 +28,7 @@ namespace xla {
 class BatchDotSimplification : public HloPassInterface {
  public:
   StatusOr<bool> Run(HloModule* module) override;
-  tensorflow::StringPiece name() const override;
+  absl::string_view name() const override;
 
  private:
   StatusOr<bool> ElideDegenerateBatchDimensionFromBatchDot(
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
index 38f1a5d3a6..b342acb025 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
@@ -24,7 +24,12 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class BatchDotSimplificationTest : public HloVerifiedTestBase {};
+class BatchDotSimplificationTest : public HloVerifiedTestBase {
+ public:
+  BatchDotSimplificationTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+};
 
 TEST_F(BatchDotSimplificationTest,
        ElideSingleDegenerateBatchDotDim_VectorVector) {
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index c4cd60c120..01931b2d02 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@@ -35,7 +36,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -43,7 +43,7 @@ namespace xla {
 
 namespace {
 
-using tensorflow::gtl::optional;
+using absl::optional;
 
 // BatchNormExpanderVisitor traverses the HLO computation and rewrites BatchNorm
 // operations into smaller operations.
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.h b/tensorflow/compiler/xla/service/batchnorm_expander.h
index 7ae202c583..76e32174f3 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.h
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.h
@@ -36,7 +36,7 @@ class BatchNormExpander : public HloPassInterface {
         rewrite_inference_op_(rewrite_inference_op),
         rewrite_grad_op_(rewrite_grad_op) {}
   ~BatchNormExpander() = default;
-  tensorflow::StringPiece name() const override { return "batchnorm_expander"; }
+  absl::string_view name() const override { return "batchnorm_expander"; }
 
   // Run operation expander on the given computation. Returns whether the
   // computation was changed.
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
index a725351462..aba0d9bb5b 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
@@ -32,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h
index c939838709..5dcd31b83d 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h
@@ -37,7 +37,7 @@ class BFloat16ConversionFolding : public HloPassInterface {
       : bfloat16_support_(bfloat16_support) {}
 
   ~BFloat16ConversionFolding() override = default;
-  tensorflow::StringPiece name() const override { return "bfloat16-fold"; }
+  absl::string_view name() const override { return "bfloat16-fold"; }
 
   // Run BF16 conversion folding on the given computation. Returns whether the
   // computation was changed.
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
index 7cf05ca443..6363a21c3b 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
@@ -235,8 +235,8 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
   HloInstruction* crs =
       builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
           ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b},
-          sum, /*replica_group_ids=*/{}, /*barrier=*/"",
-          /*all_reduce_id=*/tensorflow::gtl::nullopt));
+          sum, /*replica_groups=*/{}, /*barrier=*/"",
+          /*all_reduce_id=*/absl::nullopt));
   HloInstruction* gte_a = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(f32_shape, crs, 0));
   HloInstruction* gte_b = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.cc b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
index 16e99b5722..32573ed355 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -34,11 +35,6 @@ class BFloat16NormalizationVisitor : public DfsHloVisitorWithDefault {
 
   Status DefaultAction(HloInstruction* hlo) override;
 
-  // Special handling for cross-replica-sum and sort which can have a tuple
-  // output.
-  Status HandleCrossReplicaSum(HloInstruction* crs) override;
-  Status HandleSort(HloInstruction* sort) override;
-
   static bool Run(HloComputation* computation,
                   const BFloat16Support* bfloat16_support) {
     BFloat16NormalizationVisitor visitor(computation, bfloat16_support);
@@ -150,23 +146,6 @@ Status BFloat16NormalizationVisitor::ConvertCalledComputations(
   return Status::OK();
 }
 
-Status BFloat16NormalizationVisitor::HandleCrossReplicaSum(
-    HloInstruction* crs) {
-  if (!ShapeUtil::IsTuple(crs->shape())) {
-    return HandleInstruction(crs);
-  } else {
-    return HandleMultipleOutputs(crs);
-  }
-}
-
-Status BFloat16NormalizationVisitor::HandleSort(HloInstruction* sort) {
-  if (!ShapeUtil::IsTuple(sort->shape())) {
-    return HandleInstruction(sort);
-  } else {
-    return HandleMultipleOutputs(sort);
-  }
-}
-
 Status BFloat16NormalizationVisitor::HandleMultipleOutputs(
     HloInstruction* hlo) {
   std::vector<PrimitiveType> operand_types(hlo->operand_count());
@@ -380,6 +359,11 @@ Status BFloat16NormalizationVisitor::DefaultAction(HloInstruction* hlo) {
       hlo->opcode() == HloOpcode::kConditional) {
     return Status::OK();
   }
+  if ((hlo->opcode() == HloOpcode::kSort ||
+       hlo->opcode() == HloOpcode::kCrossReplicaSum) &&
+      ShapeUtil::IsTuple(hlo->shape())) {
+    return HandleMultipleOutputs(hlo);
+  }
   return HandleInstruction(hlo);
 }
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.h b/tensorflow/compiler/xla/service/bfloat16_normalization.h
index 2a60fe0af3..30b6346312 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization.h
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.h
@@ -31,7 +31,7 @@ class BFloat16Normalization : public HloPassInterface {
       : bfloat16_support_(bfloat16_support) {}
 
   ~BFloat16Normalization() override = default;
-  tensorflow::StringPiece name() const override { return "bf16-normalization"; }
+  absl::string_view name() const override { return "bf16-normalization"; }
 
   // Run BF16 normalization on the given computation. Returns whether the
   // computation was changed.
@@ -54,7 +54,7 @@ class BFloat16MixedPrecisionRemoval : public HloPassInterface {
 
   ~BFloat16MixedPrecisionRemoval() override = default;
 
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "bf16-mixed-precision-removal";
   }
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
index f9f1f64998..b08705d4c2 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -76,7 +76,8 @@ class BFloat16NormalizationTest : public HloTestBase {
     StatusOr<bool> result = normalization.Run(module);
     EXPECT_IS_OK(result.status());
 
-    HloVerifier verifier(/*allow_mixed_precision=*/true);
+    HloVerifier verifier(/*layout_sensitive=*/false,
+                         /*allow_mixed_precision=*/true);
     EXPECT_IS_OK(verifier.Run(module).status());
 
     return result.ValueOrDie();
@@ -251,8 +252,8 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
   HloInstruction* crs =
       builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
           ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}, reduction,
-          /*replica_group_ids=*/{}, /*barrier=*/"",
-          /*all_reduce_id=*/tensorflow::gtl::nullopt));
+          /*replica_groups=*/{}, /*barrier=*/"",
+          /*all_reduce_id=*/absl::nullopt));
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(bf16_shape, crs, 1));
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.h b/tensorflow/compiler/xla/service/bfloat16_propagation.h
index 02b8cad089..1ee64971ab 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.h
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.h
@@ -64,9 +64,7 @@ class BFloat16Propagation : public HloPassInterface {
 
   ~BFloat16Propagation() override = default;
 
-  tensorflow::StringPiece name() const override {
-    return "bfloat16-propagation";
-  }
+  absl::string_view name() const override { return "bfloat16-propagation"; }
 
   // Runs the pass on the given module. Returns whether the module was changed
   // (precision reductions were added).
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 118a11c8de..c8c36ae60e 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -22,8 +22,9 @@ limitations under the License.
 #include <ostream>
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_value_containers.h"
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -36,20 +37,17 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
+namespace {
 
+using absl::StrAppend;
 using ::tensorflow::gtl::FlatMap;
 using ::tensorflow::gtl::FlatSet;
 using ::tensorflow::strings::Appendf;
 using ::tensorflow::strings::HumanReadableNumBytes;
 using ::tensorflow::strings::Printf;
-using ::tensorflow::strings::StrAppend;
-
-namespace {
 
 template <typename T>
 string ColocatedBufferSetsToString(const T& container, const char* title) {
@@ -139,6 +137,7 @@ Status GatherComputationsByAllocationType(
           case HloOpcode::kMap:
           case HloOpcode::kReduce:
           case HloOpcode::kReduceWindow:
+          case HloOpcode::kScatter:
           case HloOpcode::kSelectAndScatter:
           case HloOpcode::kFusion:
             // Map/reduce etc computations are always thread-local.
@@ -235,8 +234,8 @@ size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const {
 }
 
 string BufferAllocation::Slice::ToString() const {
-  return tensorflow::strings::StrCat("{index:", index(), ", offset:", offset_,
-                                     ", size:", size_, "}");
+  return absl::StrCat("{index:", index(), ", offset:", offset_,
+                      ", size:", size_, "}");
 }
 
 BufferAllocation::Slice BufferAllocation::GetSlice(
@@ -626,7 +625,7 @@ Status BufferAssignment::ComputeSummaryStats() {
     stats_.total_allocation_bytes += allocation.size();
   }
 
-  // Only compute total fragmentation if all computations are sequential.
+  // Only compute total fragmentation if all computations have schedules.
   SequentialHloOrdering::HloModuleSequence module_sequence;
   for (const auto& computation : module_->computations()) {
     const std::vector<const HloInstruction*>* sequence =
@@ -677,9 +676,9 @@ string BufferAssignment::Stats::ToString() const {
 
 string BufferAssignment::ToString() const {
   string output;
-  tensorflow::strings::StrAppend(&output, "BufferAssignment:\n");
+  absl::StrAppend(&output, "BufferAssignment:\n");
   for (auto& allocation : allocations_) {
-    tensorflow::strings::StrAppend(&output, allocation.ToString());
+    absl::StrAppend(&output, allocation.ToString());
   }
   return output;
 }
@@ -1099,8 +1098,8 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       options.buffers_to_assign = &buffer_value_set;
       TF_ASSIGN_OR_RETURN(
           const HeapSimulator::Result result,
-          HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
-                                 MakeUnique<LazyBestFitHeap>(alignment)),
+          HeapSimulator::Run(absl::make_unique<DecreasingSizeRunsHeap>(
+                                 absl::make_unique<LazyBestFitHeap>(alignment)),
                              assignment->module(), module_sequence,
                              assignment->points_to_analysis(),
                              assignment->buffer_size_, options));
@@ -1129,11 +1128,12 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         options.buffers_to_assign = &buffer_value_set;
         TF_ASSIGN_OR_RETURN(
             const HeapSimulator::Result result,
-            HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
-                                   MakeUnique<LazyBestFitHeap>(alignment)),
-                               *computation, *instruction_sequence,
-                               assignment->points_to_analysis(),
-                               assignment->buffer_size_, options));
+            HeapSimulator::Run(
+                absl::make_unique<DecreasingSizeRunsHeap>(
+                    absl::make_unique<LazyBestFitHeap>(alignment)),
+                *computation, *instruction_sequence,
+                assignment->points_to_analysis(), assignment->buffer_size_,
+                options));
         AssignBuffersFromHeapSimulator(result, assignment,
                                        single_colored_set.first);
       }
@@ -1645,7 +1645,8 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   XLA_VLOG_LINES(3, liveness->ToString());
   XLA_VLOG_LINES(3, liveness->points_to_analysis().ToString());
 
-  // Can't use MakeUnique because BufferAssignment constructor is private.
+  // Can't use absl::make_unique because BufferAssignment constructor is
+  // private.
   std::unique_ptr<BufferAssignment> assignment(
       new BufferAssignment(module, std::move(liveness), std::move(buffer_size),
                            std::move(color_alignment)));
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index eccb146a0d..52abda16c4 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
@@ -87,7 +87,7 @@ class BufferAssignmentTest : public HloTestBase {
   std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
                                                         int64 alignment = 1) {
     return BufferAssigner::Run(
-               module, xla::MakeUnique<DependencyHloOrdering>(module),
+               module, absl::make_unique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
                [alignment](LogicalBuffer::Color) { return alignment; },
                /*allow_input_output_aliasing=*/false,
@@ -98,7 +98,7 @@ class BufferAssignmentTest : public HloTestBase {
   std::unique_ptr<BufferAssignment> RunBufferAssignmentNoBuffersForConstants(
       HloModule* module, int64 alignment = 1) {
     return BufferAssigner::Run(
-               module, xla::MakeUnique<DependencyHloOrdering>(module),
+               module, absl::make_unique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
                [alignment](LogicalBuffer::Color) { return alignment; },
                /*allow_input_output_aliasing=*/false,
@@ -109,7 +109,7 @@ class BufferAssignmentTest : public HloTestBase {
   std::unique_ptr<BufferAssignment> RunColoredBufferAssignment(
       HloModule* module, BufferLiveness::Colorer colorer, int64 alignment = 1) {
     return BufferAssigner::Run(
-               module, xla::MakeUnique<DependencyHloOrdering>(module),
+               module, absl::make_unique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
                [alignment](LogicalBuffer::Color) { return alignment; },
                /*allow_input_output_aliasing=*/false,
@@ -127,7 +127,8 @@ class BufferAssignmentTest : public HloTestBase {
                                            instruction_sequence.end());
     return BufferAssigner::Run(
                module,
-               xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
+               absl::make_unique<SequentialHloOrdering>(module,
+                                                        module_sequence),
                backend().compiler()->BufferSizeBytesFunction(),
                [alignment](LogicalBuffer::Color) { return alignment; },
                /*allow_input_output_aliasing=*/false,
@@ -1769,7 +1770,8 @@ class WhileBufferAssignmentTest : public HloTestBase {
     auto sequence =
         ScheduleComputationsInModule(*module, ByteSizeOf).ConsumeValueOrDie();
     return BufferAssigner::Run(
-               module, xla::MakeUnique<SequentialHloOrdering>(module, sequence),
+               module,
+               absl::make_unique<SequentialHloOrdering>(module, sequence),
                ByteSizeOf,
                [alignment](LogicalBuffer::Color) { return alignment; },
                /*allow_input_output_aliasing=*/false,
@@ -2083,7 +2085,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
       auto assignment,
       BufferAssigner::Run(
           module.get(),
-          xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
+          absl::make_unique<SequentialHloOrdering>(module.get(), sequence),
           backend().compiler()->BufferSizeBytesFunction(),
           [](LogicalBuffer::Color) { return 1; },
           /*allow_input_output_aliasing=*/false,
@@ -2340,7 +2342,7 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
   auto assignment =
       BufferAssigner::Run(
           module.get(),
-          xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
+          absl::make_unique<SequentialHloOrdering>(module.get(), sequence),
           ByteSizeOf, [](LogicalBuffer::Color) { return 1; },
           /*allow_input_output_aliasing=*/false,
           /*allocate_buffers_for_constants=*/true)
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index 810d597e73..8d0ac3b84a 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -89,13 +89,13 @@ string BufferLiveness::ToString() const {
     pieces.push_back(
         tensorflow::strings::Printf("  %s", buffer->ToString().c_str()));
   }
-  return tensorflow::str_util::Join(pieces, "\n");
+  return absl::StrJoin(pieces, "\n");
 }
 
 bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
                                                 const LogicalBuffer& b) const {
-  TF_CHECK_OK(points_to_analysis_->VerifyBuffer(a));
-  TF_CHECK_OK(points_to_analysis_->VerifyBuffer(b));
+  TF_DCHECK_OK(points_to_analysis_->VerifyBuffer(a));
+  TF_DCHECK_OK(points_to_analysis_->VerifyBuffer(b));
 
   if (!hlo_ordering_->ExecutesBefore(a.instruction(), b.instruction())) {
     return false;
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index 4a927b5767..26e26e316d 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -18,8 +18,9 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -119,8 +120,8 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
-      BufferLiveness::Run(module.get(),
-                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
+      BufferLiveness::Run(
+          module.get(), absl::make_unique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
@@ -167,10 +168,10 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
 
   SequentialHloOrdering::HloModuleSequence sequence;
   sequence.insert({entry, {param0, negate, param1, exp, add}});
-  auto liveness =
-      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
-                                            module.get(), sequence))
-          .ConsumeValueOrDie();
+  auto liveness = BufferLiveness::Run(module.get(),
+                                      absl::make_unique<SequentialHloOrdering>(
+                                          module.get(), sequence))
+                      .ConsumeValueOrDie();
 
   // Entry parameters interfere as if they are defined simultaneously at
   // the very beginning.
@@ -215,8 +216,8 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) {
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
-      BufferLiveness::Run(module.get(),
-                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
+      BufferLiveness::Run(
+          module.get(), absl::make_unique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
@@ -249,8 +250,8 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) {
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
-      BufferLiveness::Run(module.get(),
-                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
+      BufferLiveness::Run(
+          module.get(), absl::make_unique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
@@ -293,10 +294,10 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   SequentialHloOrdering::HloModuleSequence module_sequence;
   std::vector<const HloInstruction*> order = {param, negate, exp, add};
   module_sequence.emplace(computation, order);
-  auto liveness =
-      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
-                                            module.get(), module_sequence))
-          .ConsumeValueOrDie();
+  auto liveness = BufferLiveness::Run(module.get(),
+                                      absl::make_unique<SequentialHloOrdering>(
+                                          module.get(), module_sequence))
+                      .ConsumeValueOrDie();
 
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
@@ -342,10 +343,10 @@ TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) {
   std::vector<const HloInstruction*> order = {param,     add,  recv,
                                               recv_done, send, send_done};
   module_sequence.emplace(computation, order);
-  auto liveness =
-      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
-                                            module.get(), module_sequence))
-          .ConsumeValueOrDie();
+  auto liveness = BufferLiveness::Run(module.get(),
+                                      absl::make_unique<SequentialHloOrdering>(
+                                          module.get(), module_sequence))
+                      .ConsumeValueOrDie();
 
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, add));
   // Check the root instruction (add) buffer interferes with the recv buffer.
@@ -376,8 +377,8 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
-      BufferLiveness::Run(module.get(),
-                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
+      BufferLiveness::Run(
+          module.get(), absl::make_unique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // All buffers should be live out except the param
@@ -412,8 +413,8 @@ TEST_F(BufferLivenessTest, EmbeddedComputation) {
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
-      BufferLiveness::Run(module.get(),
-                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
+      BufferLiveness::Run(
+          module.get(), absl::make_unique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // Buffers in different computations should always interfere.
@@ -453,8 +454,8 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) {
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
-      BufferLiveness::Run(module.get(),
-                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
+      BufferLiveness::Run(
+          module.get(), absl::make_unique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // Only the element buffers of the tuple constant which are pointed to by
@@ -518,8 +519,8 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
   module->AddEmbeddedComputation(builder.Build());
 
   auto liveness =
-      BufferLiveness::Run(module.get(),
-                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
+      BufferLiveness::Run(
+          module.get(), absl::make_unique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // We compare tuple element pairs that are input/output to the computation:
@@ -580,8 +581,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
   module->AddEmbeddedComputation(builder.Build());
 
   auto liveness =
-      BufferLiveness::Run(module.get(),
-                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
+      BufferLiveness::Run(
+          module.get(), absl::make_unique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // We compare tuple element pairs that are input/output to the computation:
@@ -610,11 +611,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
 class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
  protected:
   // Builds and runs a computation (see test case computation graphs below).
-  // Runs BufferLiveness on this computation.
-  // Returns whether buffer interference is detected between tuple-shaped
-  // parameter and root instructions at tuple element 1.
-  bool Run(const bool update_uses_tuple_element1,
-           const bool fuse_gte0 = false) {
+  std::unique_ptr<HloModule> BuildModule(const bool update_uses_tuple_element1,
+                                         const bool fuse_gte0) {
     auto builder = HloComputation::Builder(TestName());
     // Create param0 Tuple.
     Shape data_shape = ShapeUtil::MakeShape(F32, {8});
@@ -645,12 +643,12 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
         builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
             data_shape, gte1, update, starts));
     // Create output tuple.
-    auto tuple_root = builder.AddInstruction(
+    builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
     auto module = CreateNewModule();
-    module->AddEntryComputation(BuildDummyComputation());
-    auto* computation = module->AddEmbeddedComputation(builder.Build());
+    module->AddEntryComputation(builder.Build());
+    auto* computation = module->entry_computation();
     // Create fusion instruction based on number of tuple element 1 users.
     if (update_uses_tuple_element1) {
       computation->CreateFusionInstruction(
@@ -666,16 +664,39 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
       computation->CreateFusionInstruction({gte0},
                                            HloInstruction::FusionKind::kLoop);
     }
+    return module;
+  }
 
+  // Returns whether buffer interference is detected between tuple-shaped
+  // parameter and root instructions at tuple element 1.
+  bool Run(const bool update_uses_tuple_element1,
+           const bool fuse_gte0 = false) {
+    auto module = BuildModule(update_uses_tuple_element1, fuse_gte0);
     // Run BufferLiveness on 'module'.
-    auto liveness =
-        BufferLiveness::Run(
-            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()))
-            .ConsumeValueOrDie();
+    auto liveness = BufferLiveness::Run(
+                        module.get(),
+                        absl::make_unique<DependencyHloOrdering>(module.get()))
+                        .ConsumeValueOrDie();
     // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
+    auto tuple_param0 = FindInstruction(module.get(), "param0");
+    auto tuple_root = module->entry_computation()->root_instruction();
     return TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1});
   }
+  bool RunWithHloDataflowAnalysis(const bool update_uses_tuple_element1,
+                                  const bool fuse_gte0 = false) {
+    auto module = BuildModule(update_uses_tuple_element1, fuse_gte0);
+    // Run BufferLiveness on 'module'.
+    auto dataflow = HloDataflowAnalysis::Run(*module).ConsumeValueOrDie();
+    auto hlo_ordering = absl::make_unique<DependencyHloOrdering>(module.get());
+    // Return whether or not buffers interference is detected between
+    // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
+    auto tuple_param0 = FindInstruction(module.get(), "param0");
+    auto tuple_root = module->entry_computation()->root_instruction();
+    return hlo_ordering->MayInterfere(
+        dataflow->GetUniqueValueAt(tuple_param0, {1}),
+        dataflow->GetUniqueValueAt(tuple_root, {1}), *dataflow);
+  }
 };
 
 // Tests that live ranges of buffers Param0[1] and Tuple[1] (which alias fusion)
@@ -693,6 +714,8 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
 //
 TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterference) {
   EXPECT_FALSE(Run(/*update_uses_tuple_element1=*/false));
+  EXPECT_FALSE(
+      RunWithHloDataflowAnalysis(/*update_uses_tuple_element1=*/false));
 }
 
 // Tests that live ranges of buffers Param0[1] and Tuple[1] (which aliases
@@ -712,6 +735,8 @@ TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterference) {
 //
 TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterferenceWithUnrelatedFusion) {
   EXPECT_FALSE(Run(/*update_uses_tuple_element1=*/false, /*fuse_gte0=*/true));
+  EXPECT_FALSE(RunWithHloDataflowAnalysis(/*update_uses_tuple_element1=*/false,
+                                          /*fuse_gte0=*/true));
 }
 
 // Tests that live ranges of buffers Param0[1] and Tuple[1] (which alias fusion)
@@ -736,6 +761,7 @@ TEST_F(FusedDynamicUpdateSliceLivenessTest, NoInterferenceWithUnrelatedFusion) {
 //
 TEST_F(FusedDynamicUpdateSliceLivenessTest, WithInterference) {
   EXPECT_TRUE(Run(/*update_uses_tuple_element1=*/true));
+  EXPECT_TRUE(RunWithHloDataflowAnalysis(/*update_uses_tuple_element1=*/true));
 }
 
 class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
@@ -780,10 +806,10 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     module->AddEntryComputation(BuildDummyComputation());
     module->AddEmbeddedComputation(builder.Build());
     // Run BufferLiveness on 'module'.
-    auto liveness =
-        BufferLiveness::Run(
-            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()))
-            .ConsumeValueOrDie();
+    auto liveness = BufferLiveness::Run(
+                        module.get(),
+                        absl::make_unique<DependencyHloOrdering>(module.get()))
+                        .ConsumeValueOrDie();
     // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
     return TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1});
diff --git a/tensorflow/compiler/xla/service/buffer_value.cc b/tensorflow/compiler/xla/service/buffer_value.cc
index 2bc556a9e2..fdf822c666 100644
--- a/tensorflow/compiler/xla/service/buffer_value.cc
+++ b/tensorflow/compiler/xla/service/buffer_value.cc
@@ -17,11 +17,10 @@ limitations under the License.
 
 #include <iosfwd>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index a23427f00c..37523a73ff 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -17,21 +17,21 @@ limitations under the License.
 
 #include <queue>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
+using absl::StrCat;
 using ::tensorflow::strings::Appendf;
-using ::tensorflow::strings::StrCat;
 
 string CallContextToString(CallContext context) {
   switch (context) {
@@ -61,6 +61,7 @@ CallContext GetInstructionCallContext(HloOpcode opcode) {
     case HloOpcode::kMap:
     case HloOpcode::kReduce:
     case HloOpcode::kReduceWindow:
+    case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
     case HloOpcode::kFusion:
       return CallContext::kParallel;
@@ -70,10 +71,10 @@ CallContext GetInstructionCallContext(HloOpcode opcode) {
 }
 
 string CallSite::ToString() const {
-  return StrCat(instruction()->name(), " calls in context ",
-                CallContextToString(context()), ": ",
-                tensorflow::str_util::Join(
-                    called_computations(), ", ",
+  return StrCat(
+      instruction()->name(), " calls in context ",
+      CallContextToString(context()), ": ",
+      absl::StrJoin(called_computations(), ", ",
                     [](string* out, const HloComputation* computation) {
                       out->append(computation->name());
                     }));
@@ -236,8 +237,8 @@ void CallGraph::SetCallContexts() {
 
 /* static */
 std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
-  // Constructor for CallGraph is private so MakeUnique can't be used.
-  auto call_graph = WrapUnique<CallGraph>(new CallGraph(module));
+  // Constructor for CallGraph is private so absl::make_unique can't be used.
+  auto call_graph = absl::WrapUnique<CallGraph>(new CallGraph(module));
 
   VLOG(2) << "Building call graph for:";
   XLA_VLOG_LINES(2, module->ToString());
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 97d3811508..3af2ab5edf 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Call graph for an HLO module.
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CALL_GRAPH_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CALL_GRAPH_H_
 
 #include <ostream>
 
@@ -272,4 +272,4 @@ class CallGraph {
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CALL_GRAPH_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CALL_GRAPH_H_
diff --git a/tensorflow/compiler/xla/service/call_inliner.h b/tensorflow/compiler/xla/service/call_inliner.h
index a8345a394d..c5cd88b9ea 100644
--- a/tensorflow/compiler/xla/service/call_inliner.h
+++ b/tensorflow/compiler/xla/service/call_inliner.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE__CALL_INLINER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE__CALL_INLINER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CALL_INLINER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CALL_INLINER_H_
 
 #include <deque>
 
@@ -35,11 +35,11 @@ class CallInliner : public HloPassInterface {
   static StatusOr<InlinedInstructionMap> Inline(HloInstruction* call);
 
   ~CallInliner() override = default;
-  tensorflow::StringPiece name() const override { return "CallInliner"; }
+  absl::string_view name() const override { return "CallInliner"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 };
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE__CALL_INLINER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CALL_INLINER_H_
diff --git a/tensorflow/compiler/xla/service/call_inliner_test.cc b/tensorflow/compiler/xla/service/call_inliner_test.cc
index ff968bca29..5d85a3f173 100644
--- a/tensorflow/compiler/xla/service/call_inliner_test.cc
+++ b/tensorflow/compiler/xla/service/call_inliner_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
@@ -32,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace op = xla::testing::opcode_matchers;
 
diff --git a/tensorflow/compiler/xla/service/channel_tracker.cc b/tensorflow/compiler/xla/service/channel_tracker.cc
index 13008efed1..601a3e9a01 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.cc
+++ b/tensorflow/compiler/xla/service/channel_tracker.cc
@@ -15,14 +15,14 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/channel_tracker.h"
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 7426672a7a..3079695e96 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/host_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -76,9 +76,9 @@ CompileOnlyService::CompileAheadOfTime(
     if (!directory_path.empty()) {
       HloSnapshot hlo_snapshot;
       *hlo_snapshot.mutable_hlo()->mutable_hlo_module() = instance.computation;
-      string filename = tensorflow::strings::StrCat(
-          "computation_", instance.computation.id(), "__",
-          instance.computation.entry_computation_name());
+      string filename =
+          absl::StrCat("computation_", instance.computation.id(), "__",
+                       instance.computation.entry_computation_name());
       const string& per_host_path = tensorflow::io::JoinPath(
           directory_path, tensorflow::port::Hostname());
 
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index cb61f3da39..af8f7f1027 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 
@@ -52,9 +52,8 @@ string ComputationLayout::ToString() const {
   for (auto& param_layout : parameter_layouts_) {
     params.push_back(param_layout.ToString());
   }
-  return tensorflow::strings::StrCat("(",
-                                     tensorflow::str_util::Join(params, ", "),
-                                     ") => ", result_layout_.ToString());
+  return absl::StrCat("(", absl::StrJoin(params, ", "), ") => ",
+                      result_layout_.ToString());
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index 187ce568cb..61b1dba6c9 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -19,8 +19,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -29,12 +30,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
-using tensorflow::strings::StrAppend;
-using tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 namespace xla {
 
@@ -60,8 +60,8 @@ DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) {
         "computation_count=%d",
         proto.replica_count(), proto.computation_count());
   }
-  auto assignment = MakeUnique<DeviceAssignment>(proto.replica_count(),
-                                                 proto.computation_count());
+  auto assignment = absl::make_unique<DeviceAssignment>(
+      proto.replica_count(), proto.computation_count());
   for (int computation = 0; computation < proto.computation_count();
        ++computation) {
     const auto& computation_device = proto.computation_devices(computation);
@@ -156,7 +156,7 @@ ComputationPlacer::GetPlatformComputationPlacers() {
 }  // namespace xla
 
 static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
-  return xla::MakeUnique<xla::ComputationPlacer>();
+  return absl::make_unique<xla::ComputationPlacer>();
 }
 
 static bool InitModule() {
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc
index b7be3ba605..4ea3a13f28 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -28,8 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.h b/tensorflow/compiler/xla/service/conditional_simplifier.h
index 063261e26d..3de50cbd7f 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier.h
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_SIMPLIFIER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CONDITIONAL_SIMPLIFIER_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace xla {
 
@@ -27,9 +27,7 @@ namespace xla {
 // with their true or false computation as appropriate.
 class ConditionalSimplifier : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override {
-    return "simplify-conditional";
-  }
+  absl::string_view name() const override { return "simplify-conditional"; }
   StatusOr<bool> Run(HloModule* module) override;
 };
 
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
index c43a31b167..6c477da038 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
@@ -39,6 +39,10 @@ namespace op = xla::testing::opcode_matchers;
 
 class ConditionalSimplifierTest : public HloVerifiedTestBase {
  public:
+  ConditionalSimplifierTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+
   // Makes a computation that contains a conditional with constant predicate.
   HloComputation* MakeConditional(HloModule* module);
 };
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
new file mode 100644
index 0000000000..9c81a86bbb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
@@ -0,0 +1,249 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
+
+#include <memory>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace {
+
+// ConvolutionVisitor traverses the HLO computation and rewrites Convolution
+// operations with feature_group_count > 1 into convolutions with
+// feature_group_count = 1.
+class ConvolutionVisitor : public DfsHloVisitorWithDefault {
+ public:
+  // Default visitor action is to do nothing and return OK.
+  Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
+    return Status::OK();
+  }
+
+  Status HandleConvolution(HloInstruction* convolution) override;
+
+  // Runs the visitor on a computation.
+  static bool Run(HloComputation* computation);
+
+  // Returns whether any convolution ops were rewritten.
+  const bool changed() const { return changed_; }
+
+  ~ConvolutionVisitor() override = default;
+
+ private:
+  explicit ConvolutionVisitor(HloComputation* computation)
+      : computation_(computation) {}
+
+  // Current HloComputation instance the ConvolutionVisitor is traversing.
+  HloComputation* computation_;
+
+  // Whether rewrite has occurred.
+  bool changed_ = false;
+};
+
+bool ConvolutionVisitor::Run(HloComputation* computation) {
+  ConvolutionVisitor visitor(computation);
+  TF_CHECK_OK(computation->Accept(&visitor));
+  return visitor.changed_;
+}
+
+Shape ExpandedFilterShape(const Shape& shape, int64 group_count,
+                          int64 input_feature_dim) {
+  int64 num_dims = shape.dimensions_size();
+  CHECK_GE(num_dims, 2);
+  Shape expanded_shape = shape;
+  expanded_shape.set_dimensions(
+      input_feature_dim, shape.dimensions(input_feature_dim) * group_count);
+  return expanded_shape;
+}
+
+// Returns a vector with 'group_count' many groups, where the i-th group
+// consists of 'group_size' times the value i.
+std::vector<int32> GetMaskIds(int64 group_size, int64 group_count) {
+  std::vector<int32> values;
+  for (int i = 0; i < group_count; ++i) {
+    for (int j = 0; j < group_size; ++j) {
+      values.push_back(i);
+    }
+  }
+  return values;
+}
+
+// Create a mask for grouped convolution that will make a normal convolution
+// produce the same results as a grouped convolution. For a [2, 1, 6]
+// filter this returns a [2, 3, 6] mask
+//   1 1 0 0 0 0
+//   0 0 1 1 0 0
+//   0 0 0 0 1 1
+//
+//   1 1 0 0 0 0
+//   0 0 1 1 0 0
+//   0 0 0 0 1 1
+//
+// The first step is to create a rank 1 constant:
+//   0 1 2
+//
+// This is broadcasted to
+//   0 0 0 0 0 0
+//   1 1 1 1 1 1
+//   2 2 2 2 2 2
+//
+//   0 0 0 0 0 0
+//   1 1 1 1 1 1
+//   2 2 2 2 2 2
+//
+// Then we create another rank 1 constant
+//   0 0 1 1 2 2
+//
+// This is broadcasted to
+//   0 0 1 1 2 2
+//   0 0 1 1 2 2
+//   0 0 1 1 2 2
+//
+//   0 0 1 1 2 2
+//   0 0 1 1 2 2
+//   0 0 1 1 2 2
+//
+// Finally we use the Eq op of these two broadcasted constants and get the
+// desired mask.
+HloInstruction* GetExpandedFilterMask(
+    const Shape& filter_shape, int64 input_feature_dim,
+    int64 output_feature_dim, int64 group_count,
+    const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+        add_instruction) {
+  Shape expanded_filter_shape =
+      ExpandedFilterShape(filter_shape, group_count, input_feature_dim);
+  Shape mask_shape = ShapeUtil::MakeShape(
+      S32, AsInt64Slice(expanded_filter_shape.dimensions()));
+  int64 output_feature = filter_shape.dimensions(output_feature_dim);
+  int64 group_size = filter_shape.dimensions(input_feature_dim);
+
+  // Create a 'input_feature' sized linspace and 'output_feature' sized linspace
+  // that will be broadcasted into perpendicular dimensions and compared.
+  const std::vector<int32> input_feature_filter_mask =
+      GetMaskIds(group_size, group_count);
+  const std::vector<int32> output_feature_filter_mask =
+      GetMaskIds(output_feature / group_count, group_count);
+
+  auto mask1 = add_instruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR1<int32>(input_feature_filter_mask)));
+  auto broadcasted_mask1 = add_instruction(
+      HloInstruction::CreateBroadcast(mask_shape, mask1, {input_feature_dim}));
+  auto mask2 = add_instruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR1<int32>(output_feature_filter_mask)));
+  auto broadcasted_mask2 = add_instruction(
+      HloInstruction::CreateBroadcast(mask_shape, mask2, {output_feature_dim}));
+
+  // Compare the broadcasted output feature linspace to the input feature
+  // linspace to create a diagonal predicate.
+  Shape predicate_shape = ShapeUtil::MakeShape(
+      PRED, AsInt64Slice(expanded_filter_shape.dimensions()));
+  return add_instruction(HloInstruction::CreateBinary(
+      predicate_shape, HloOpcode::kEq, broadcasted_mask1, broadcasted_mask2));
+}
+
+Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
+  int64 group_count = convolution->feature_group_count();
+  if (group_count == 1) {
+    return Status::OK();
+  }
+  auto filter = convolution->mutable_operand(1);
+  changed_ = true;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    return computation_->AddInstruction(std::move(inst));
+  };
+
+  auto dim_numbers = convolution->convolution_dimension_numbers();
+  int64 input_feature_dim = dim_numbers.kernel_input_feature_dimension();
+  int64 group_size = filter->shape().dimensions(input_feature_dim);
+  int64 output_feature_dim = dim_numbers.kernel_output_feature_dimension();
+  auto expanded_filter_shape =
+      ExpandedFilterShape(filter->shape(), group_count, input_feature_dim);
+  HloInstruction* filter_mask = GetExpandedFilterMask(
+      filter->shape(), input_feature_dim, output_feature_dim, group_count, add);
+  HloInstruction* expanded_filter;
+  // We want to repeat 'filter' in the 'input_feature_dim' dimension
+  // 'group_count' times.
+  if (group_size == 1) {
+    Shape reshaped_filter_shape =
+        ShapeUtil::DeleteDimension(input_feature_dim, filter->shape());
+    auto reshaped_filter =
+        add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
+    std::vector<int64> broadcast_dims;
+    for (int64 i = 0; i < filter->shape().dimensions_size(); ++i) {
+      if (i == input_feature_dim) {
+        continue;
+      }
+      broadcast_dims.push_back(i);
+    }
+    expanded_filter = add(HloInstruction::CreateBroadcast(
+        expanded_filter_shape, reshaped_filter, broadcast_dims));
+  } else {
+    // We could possibly also use reshape, broadcast, reshape instead of concat
+    // here, but it would require more complex code, and for depthwise
+    // convolution we would never end up in this branch.
+    std::vector<HloInstruction*> concat_operands(group_count, filter);
+    expanded_filter = add(HloInstruction::CreateConcatenate(
+        expanded_filter_shape, concat_operands, input_feature_dim));
+  }
+  auto zero = add(HloInstruction::CreateConstant(absl::make_unique<Literal>(
+      LiteralUtil::Zero(expanded_filter_shape.element_type()))));
+  auto zero_filter =
+      add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {}));
+  auto new_filter = add(
+      HloInstruction::CreateTernary(expanded_filter_shape, HloOpcode::kSelect,
+                                    filter_mask, expanded_filter, zero_filter));
+  auto new_convolution = HloInstruction::CreateConvolve(
+      convolution->shape(), convolution->mutable_operand(0), new_filter,
+      convolution->window(), dim_numbers, /*feature_group_count=*/1);
+  new_convolution->set_precision_config(convolution->precision_config());
+  TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+      convolution, std::move(new_convolution)));
+  return Status::OK();
+}
+
+}  // namespace
+
+StatusOr<bool> ConvolutionFeatureGroupConverter::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "ConvolutionFeatureGroupConverter::Run(), before:\n" +
+                        module->ToString());
+  bool changed = false;
+  for (auto* comp : module->MakeNonfusionComputations()) {
+    if (ConvolutionVisitor::Run(comp)) {
+      changed = true;
+    }
+  }
+  XLA_VLOG_LINES(2, "ConvolutionFeatureGroupConverter::Run(), after:\n" +
+                        module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
new file mode 100644
index 0000000000..498894737f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace xla {
+
+// A pass which rewrites convolutions with feature_group_count > 1 into
+// convolutions with feature_group_count = 1.
+class ConvolutionFeatureGroupConverter : public HloPassInterface {
+ public:
+  ConvolutionFeatureGroupConverter() {}
+
+  absl::string_view name() const override {
+    return "convolution-feature-group-converter";
+  }
+
+  // Run convolution rewriting on the given computation. Returns whether the
+  // computation was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
new file mode 100644
index 0000000000..28373ebf63
--- /dev/null
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+namespace {
+
+using ConvolutionFeatureGroupConverterTest = HloTestBase;
+namespace op = testing::opcode_matchers;
+
+TEST_F(ConvolutionFeatureGroupConverterTest,
+       ConvertFeatureGroupCountEqualToInputFeatureDim) {
+  string hlo_string = R"(HloModule Convolve1D1Window_0_module
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,2], filter: f32[1,1,2]) -> f32[1,2,2] {
+  %input = f32[1,2,2]{2,1,0} parameter(0)
+  %copy = f32[1,2,2]{2,0,1} copy(f32[1,2,2]{2,1,0} %input)
+  %filter = f32[1,1,2]{2,1,0} parameter(1)
+  ROOT %convolution = f32[1,2,2]{2,0,1} convolution(f32[1,2,2]{2,0,1} %copy, f32[1,1,2]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=2
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  auto computation = module->entry_computation();
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  ConvolutionFeatureGroupConverter converter;
+  ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  // Make sure the convolution is converted to one with feature_group_count = 1.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->feature_group_count(), 1);
+  // Verify that the filter operand has been replaced.
+  EXPECT_THAT(root->operand(1),
+              op::Select(op::Eq(op::Broadcast(op::Constant()),
+                                op::Broadcast(op::Constant())),
+                         op::Broadcast(op::Reshape(op::Parameter())),
+                         op::Broadcast(op::Constant())));
+}
+
+TEST_F(ConvolutionFeatureGroupConverterTest,
+       ConvertFeatureGroupCountDivisorOfInputFeatureDim) {
+  string hlo_string = R"(HloModule Convolve1D1Window_0_module
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2,2] {
+  %input = f32[1,2,4]{2,1,0} parameter(0)
+  %copy = f32[1,2,4]{2,0,1} copy(f32[1,2,4]{2,1,0} %input)
+  %filter = f32[1,2,2]{2,1,0} parameter(1)
+  ROOT %convolution = f32[1,2,2]{2,0,1} convolution(f32[1,2,4]{2,0,1} %copy, f32[1,2,2]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=2
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  auto computation = module->entry_computation();
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  ConvolutionFeatureGroupConverter converter;
+  ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  // Make sure the convolution is converted to one with feature_group_count = 1.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->feature_group_count(), 1);
+  // Verify that the filter operand has been replaced.
+  EXPECT_THAT(root->operand(1),
+              op::Select(op::Eq(op::Broadcast(op::Constant()),
+                                op::Broadcast(op::Constant())),
+                         // We expect to see Concatenate here instead of
+                         // Broadcast, because feature_group_count < input
+                         // feature dimension.
+                         op::Concatenate(op::Parameter(), op::Parameter()),
+                         op::Broadcast(op::Constant())));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 3e39c1bab1..1b7a7b36ea 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
@@ -31,18 +33,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
-
-using ::tensorflow::str_util::Join;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
-
 namespace {
 
+using absl::StrAppend;
+
 bool IsEntryParameterValue(const HloValue& value) {
   const HloComputation* computation = value.defining_instruction()->parent();
   return value.defining_instruction()->opcode() == HloOpcode::kParameter &&
@@ -381,7 +378,7 @@ class CopyRemover {
   }
 
   string ToString() const {
-    string out = StrCat("CopyRemover, module ", module_->name(), "\n");
+    string out = absl::StrCat("CopyRemover, module ", module_->name(), "\n");
     StrAppend(&out, "  Buffer values, in dependency order:\n");
     for (const HloBuffer& buffer : alias_analysis_.buffers()) {
       StrAppend(&out, "    HloBuffer ", buffer.id(), ":\n");
@@ -863,16 +860,16 @@ class CopyRemover {
       for (const ValueNode* p = head; p != nullptr; p = Next(*p)) {
         values.push_back(p->value);
       }
-      return StrCat("{",
-                    Join(values, ", ",
-                         [](string* s, const HloValue* value) {
-                           StrAppend(s, value->ToShortString());
-                         }),
-                    "}");
+      return absl::StrCat("{",
+                          absl::StrJoin(values, ", ",
+                                        [](string* s, const HloValue* value) {
+                                          StrAppend(s, value->ToShortString());
+                                        }),
+                          "}");
     }
 
     string ToString() const {
-      string out = StrCat("BufferValueTracker:\n");
+      string out = absl::StrCat("BufferValueTracker:\n");
       StrAppend(&out, "  Def-use chains in each buffer:\n");
       for (const ValueNode* head : value_lists_) {
         StrAppend(&out, "    Buffer defined by ", head->value->ToShortString(),
@@ -880,10 +877,10 @@ class CopyRemover {
         const ValueNode* p = head;
         do {
           StrAppend(&out, "      ", p->value->ToShortString(), ", uses: ",
-                    Join(p->uses, "; ",
-                         [](string* s, const HloUse* use) {
-                           StrAppend(s, use->ToString());
-                         }),
+                    absl::StrJoin(p->uses, "; ",
+                                  [](string* s, const HloUse* use) {
+                                    StrAppend(s, use->ToString());
+                                  }),
                     "\n");
 
           p = p->next;
@@ -960,16 +957,11 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) {
   return Status::OK();
 }
 
-// Add copies to address special constraints on the roots of computations not
-// related to live range interference:
-//
-//    (1) Entry computation root must be unambiguous and distinct.
-//
-//    (2) Any computation called by a kCall instruction must have an
-//        unambiguous root.
-//
-//    (3) Constants and parameters cannot be live out of the entry computation
-//
+Status CopyInsertion::AddSpecialCaseCopies(HloModule* module) {
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  return AddSpecialCaseCopies(*call_graph, module);
+}
+
 Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph,
                                            HloModule* module) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
@@ -1065,15 +1057,6 @@ Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph,
     for (HloInstruction* user : users) {
       TF_RETURN_IF_ERROR(instruction->ReplaceUseWith(user, deep_copy));
     }
-    // Special case copies are not eligible for later copy elision passes.
-    indices_to_copy.ForEachElement([&](const ShapeIndex& index, bool has_copy) {
-      if (has_copy) {
-        HloInstruction* copy = *copies_added.mutable_element(index);
-        if (copy != nullptr) {
-          copy->SetCopyElisionAllowed(false);
-        }
-      }
-    });
     if (instruction == instruction->parent()->root_instruction()) {
       instruction->parent()->set_root_instruction(deep_copy);
     }
@@ -1081,10 +1064,10 @@ Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph,
   return Status::OK();
 }
 
-Status CopyInsertion::VerifyNoLiveRangeInterference(HloModule* module) {
+Status CopyInsertion::VerifyNoLiveRangeInterference(const HloOrdering& ordering,
+                                                    HloModule* module) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
                       HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
-  DependencyHloOrdering ordering(module);
   TF_RET_CHECK(!alias_analysis->HasLiveRangeInterference(ordering));
   return Status::OK();
 }
@@ -1101,8 +1084,7 @@ Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering,
   std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
   for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kCopy &&
-          instruction->CopyElisionAllowed()) {
+      if (instruction->opcode() == HloOpcode::kCopy) {
         TF_RETURN_IF_ERROR(copy_remover.TryElideCopy(instruction).status());
       }
     }
@@ -1168,10 +1150,10 @@ StatusOr<bool> CopyInsertion::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
   TF_RETURN_IF_ERROR(dce.Run(module).status());
 
-  TF_DCHECK_OK(VerifyNoLiveRangeInterference(module));
+  DependencyHloOrdering dep_ordering(module);
+  TF_DCHECK_OK(VerifyNoLiveRangeInterference(dep_ordering, module));
 
-  DependencyHloOrdering ordering(module);
-  TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(ordering, module));
+  TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(dep_ordering, module));
 
   TF_RETURN_IF_ERROR(AddSpecialCaseCopies(*call_graph, module));
 
@@ -1179,7 +1161,8 @@ StatusOr<bool> CopyInsertion::Run(HloModule* module) {
 
   TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
   TF_RETURN_IF_ERROR(dce.Run(module).status());
-  TF_DCHECK_OK(VerifyNoLiveRangeInterference(module));
+  TF_DCHECK_OK(
+      VerifyNoLiveRangeInterference(DependencyHloOrdering(module), module));
 
   MaybeDumpModule("after copy insertion", *module);
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index 5ba64b78a3..d308f6bc84 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -45,7 +45,7 @@ namespace xla {
 //       InstructionAliasSet::IsDistinct return true.
 class CopyInsertion : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "copy-insertion"; }
+  absl::string_view name() const override { return "copy-insertion"; }
 
   // fusion_can_share_buffer: backend specific function that decides whether a
   // fusion can share buffer with its operand.
@@ -77,15 +77,29 @@ class CopyInsertion : public HloPassInterface {
   Status RemoveUnnecessaryCopies(const HloOrdering& ordering,
                                  HloModule* module);
 
- private:
-  // Verifies that no HLO values have interfering live ranged assuming the
-  // ordering used by copy insertion.
-  Status VerifyNoLiveRangeInterference(HloModule* module);
+  // Add copies to address special constraints on the roots of computations not
+  // related to live range interference:
+  //
+  //    (1) Entry computation root must be unambiguous and distinct.
+  //
+  //    (2) Any computation called by a kCall instruction must have an
+  //        unambiguous root.
+  //
+  //    (3) Constants and parameters cannot be live out of the entry computation
+  //
+  Status AddSpecialCaseCopies(HloModule* module);
 
-  Status AddCopiesToResolveInterference(HloModule* module);
+  // Verifies that no HLO values have interfering live ranges using the given
+  // ordering.
+  Status VerifyNoLiveRangeInterference(const HloOrdering& ordering,
+                                       HloModule* module);
 
+ private:
+  // Override which requires the caller to pass in a call graph.
   Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module);
 
+  Status AddCopiesToResolveInterference(HloModule* module);
+
   // Backend specific function that decides whether a fusion can share buffer
   // with its operand.
   HloDataflowAnalysis::FusionCanShareBufferFunction fusion_can_share_buffer_;
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 3efe3e2f93..e01fecffd0 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -20,7 +20,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
+    "mkl_deps",
 )
 
 # Filegroup used to collect source files for dependency checking.
@@ -50,6 +50,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/cpu:cpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
     alwayslink = True,  # Contains per-platform transfer manager registration
 )
@@ -85,7 +86,11 @@ cc_library(
         ":ir_emitter",
         ":parallel_task_assignment",
         ":simple_orc_jit",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        ":target_machine_features",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
+        "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -100,6 +105,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
+        "//tensorflow/compiler/xla/service:convolution_feature_group_converter",
         "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
@@ -176,6 +182,7 @@ cc_library(
         ":runtime_single_threaded_conv2d",
         ":runtime_single_threaded_fft",
         ":runtime_single_threaded_matmul",
+        "@com_google_absl//absl/memory",
         "@llvm//:execution_engine",
         "@llvm//:core",
         "@llvm//:mc",  # fixdeps: keep
@@ -227,6 +234,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/strings",
         "@llvm//:orc_jit",
     ],
 )
@@ -274,6 +282,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:code_gen",
         "@llvm//:core",
         "@llvm//:support",
@@ -318,6 +327,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/cpu:cpu_runtime",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
 )
@@ -360,6 +370,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
 )
@@ -416,6 +427,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
         "@llvm//:analysis",
         "@llvm//:core",
         "@llvm//:ipo",
@@ -497,10 +509,7 @@ cc_library(
         "//tensorflow/core:framework_lite",
         "//tensorflow/core/kernels:eigen_helpers",
         "//third_party/eigen3",
-    ] + if_mkl([
-        "@mkl_dnn",
-        "//third_party/mkl:intel_binary_blob",
-    ]),
+    ] + mkl_deps(),
 )
 
 cc_library(
@@ -554,10 +563,7 @@ cc_library(
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
         "//third_party/eigen3",
-    ] + if_mkl([
-        "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ]),
+    ] + mkl_deps(),
 )
 
 cc_library(
@@ -638,6 +644,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -652,6 +659,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -814,6 +822,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_cost_analysis",
         "//tensorflow/compiler/xla/service:hlo_pass",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -850,6 +860,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -897,6 +908,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
         "@llvm//:core",
         "@llvm//:support",
     ],
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 128eea4828..73b03440cb 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -35,7 +36,6 @@ limitations under the License.
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
@@ -205,7 +205,7 @@ void CompilerFunctor::AddTargetInfoPasses(
     llvm::legacy::PassManagerBase* passes) const {
   llvm::Triple target_triple(target_machine_->getTargetTriple());
   auto target_library_info_impl =
-      MakeUnique<llvm::TargetLibraryInfoImpl>(target_triple);
+      absl::make_unique<llvm::TargetLibraryInfoImpl>(target_triple);
   target_library_info_impl->addVectorizableFunctions(
       VectorFunctionsForTargetLibraryInfoImpl());
   passes->add(
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
index 0985b9297f..098ce17a56 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
@@ -132,6 +132,7 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
       HloInstruction* new_conv = module->entry_computation()->AddInstruction(
           HloInstruction::CreateConvolve(new_conv_shape, new_input, new_kernel,
                                          hlo->window(), new_dnums));
+      new_conv->set_precision_config(hlo->precision_config());
 
       // Reshape the output back to the shape of the original convolution.
       TF_RETURN_IF_ERROR(module->entry_computation()->ReplaceWithNewInstruction(
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h
index e6fd1499ed..59437e88af 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h
@@ -38,7 +38,7 @@ class ConvCanonicalization : public HloPassInterface {
       : target_machine_features_(*target_machine_features) {}
 
   ~ConvCanonicalization() override {}
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "convolution-canonicalization";
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 62272c29c0..279aa42fe2 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -26,6 +26,8 @@ limitations under the License.
 
 // IWYU pragma: no_include "llvm/Config/Disassemblers.def.inc"
 // IWYU pragma: no_include "llvm/Config/Targets.def.inc"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/Function.h"
@@ -42,7 +44,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/batch_dot_simplification.h"
 #include "tensorflow/compiler/xla/service/batchnorm_expander.h"
@@ -50,6 +51,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
+#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
 #include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
 #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h"
@@ -88,6 +90,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/scatter_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -99,8 +102,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace cpu {
@@ -233,15 +234,15 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
   std::unordered_map<const HloInstruction*, int64>* hlo_to_profile_idx_;
   const std::unordered_map<const HloInstruction*, int64>& assigned_indices_;
 };
-}  // namespace
 
-Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
-                                 llvm::TargetMachine* target_machine) {
-  LLVMTargetMachineFeatures target_machine_features(target_machine);
+}  // namespace
 
-  // Optimization pipeline.
-  HloPassPipeline pipeline("CPU");
-  pipeline.AddInvariantChecker<HloVerifier>();
+Status CpuCompiler::RunHloPassesThroughLayoutAssn(
+    HloModule* module, bool /*is_aot_compile*/,
+    LLVMTargetMachineFeatures* target_machine_features) {
+  HloPassPipeline pipeline("HLO passes through layout assignment");
+  pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
+                                            /*allow_mixed_precision=*/false);
   pipeline.AddPass<CpuHloSupportChecker>();
 
   ReducePrecisionInsertion::AddPasses(
@@ -257,11 +258,13 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
   pipeline.AddPass<CallInliner>();
   pipeline.AddPass<BatchDotSimplification>();
   pipeline.AddPass<DotDecomposer>();
-  pipeline.AddPass<ConvCanonicalization>(&target_machine_features);
+  pipeline.AddPass<ConvolutionFeatureGroupConverter>();
+  pipeline.AddPass<ConvCanonicalization>(target_machine_features);
   {
     auto& pass =
         pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
-    pass.AddInvariantChecker<HloVerifier>();
+    pass.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
+                                          /*allow_mixed_precision=*/false);
 
     pass.AddPass<BatchNormExpander>(
         /*rewrite_training_op=*/true,
@@ -275,7 +278,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
 
     // BatchNormExpander can create zero-sized ops, so zero-sized HLO
     // elimination has to come after that pass.
-    pipeline.AddPass<ZeroSizedHloElimination>();
+    pass.AddPass<ZeroSizedHloElimination>();
 
     pass.AddPass<WhileLoopInvariantCodeMotion>();
     pass.AddPass<TupleSimplifier>();
@@ -288,10 +291,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
   }
   pipeline.AddPass<IndexedArrayAnalysisPrinterPass>();
   pipeline.AddPass<TransposeFolding>(
-      [&target_machine_features](
-          const HloInstruction& dot,
+      [&](const HloInstruction& dot,
           const TransposeFolding::OperandIndices& candidate_operands) {
-        return PotentiallyImplementedAsEigenDot(dot, target_machine_features)
+        return PotentiallyImplementedAsEigenDot(dot, *target_machine_features)
                    ? candidate_operands
                    : TransposeFolding::OperandIndices{};
       },
@@ -299,17 +301,35 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
   pipeline.AddPass<CpuInstructionFusion>();
 
+  pipeline.AddPass<ScatterExpander>();
+
   ReducePrecisionInsertion::AddPasses(
       &pipeline, module->config().debug_options(),
       ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
 
   pipeline.AddPass<CpuLayoutAssignment>(
-      module->mutable_entry_computation_layout(), &target_machine_features);
+      module->mutable_entry_computation_layout(), target_machine_features);
+  return pipeline.Run(module).status();
+}
+
+Status CpuCompiler::RunHloPassesAfterLayoutAssn(
+    HloModule* module, bool is_aot_compile,
+    LLVMTargetMachineFeatures* target_machine_features) {
+  HloPassPipeline pipeline("HLO passes after layout assignment");
+  // After layout assignment, use a layout-sensitive verifier.
+  auto& after_layout_assn =
+      pipeline.AddPass<HloPassPipeline>("after layout assignment");
+  after_layout_assn.AddInvariantChecker<HloVerifier>(
+      /*layout_sensitive=*/true,
+      /*allow_mixed_precision=*/false);
+
   // The LayoutAssignment pass may leave behind kCopy instructions which are
   // duplicate or NOPs, so remove them with algebraic simplification and CSE.
   {
     auto& pass = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
-        "after layout assignement");
+        "simplification after layout assignement");
+    pass.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
+                                          /*allow_mixed_precision=*/false);
     pass.AddPass<HloPassFix<AlgebraicSimplifier>>(
         /*is_layout_sensitive=*/true,
         [](const Shape&, const Shape&) { return true; },
@@ -317,7 +337,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
     pass.AddPass<HloDCE>();
     pass.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
   }
+
   pipeline.AddPass<HloElementTypeConverter>(BF16, F32);
+
   // Outline ops in the entry computation into calls to subcomputations.
   const int max_parallelism =
       module->config().intra_op_parallelism_threads() > 0
@@ -330,14 +352,14 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
     // binary size (and most AOT applications are single-threaded).
     // TODO(b/29630486) Support multi-threaded AOT.
     pipeline.AddPass<ParallelTaskAssigner>(
-        max_parallelism, ShapeSizeBytesFunction(), &target_machine_features);
+        max_parallelism, ShapeSizeBytesFunction(), target_machine_features);
   }
-  // Copy insertion should be performed immediately before IR emission to avoid
-  // inserting unnecessary copies (later pass adds an instruction which
-  // materializes the value) or missing a necessary copy (later pass removes an
-  // instruction which materializes a value). DCE must be run immediately before
-  // (and sometime after) copy insertion, to avoid dead code from interfering
-  // with the rewrites.
+  // Copy insertion should be performed immediately before IR emission to
+  // avoid inserting unnecessary copies (later pass adds an instruction which
+  // materializes the value) or missing a necessary copy (later pass removes
+  // an instruction which materializes a value). DCE must be run immediately
+  // before (and sometime after) copy insertion, to avoid dead code from
+  // interfering with the rewrites.
   pipeline.AddPass<HloDCE>();
   pipeline.AddPass<FlattenCallGraph>();
   pipeline.AddPass<CpuCopyInsertion>();
@@ -345,6 +367,15 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
   return pipeline.Run(module).status();
 }
 
+Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
+                                 llvm::TargetMachine* target_machine) {
+  LLVMTargetMachineFeatures target_machine_features(target_machine);
+  TF_RETURN_IF_ERROR(RunHloPassesThroughLayoutAssn(module, is_aot_compile,
+                                                   &target_machine_features));
+  return RunHloPassesAfterLayoutAssn(module, is_aot_compile,
+                                     &target_machine_features);
+}
+
 namespace {
 
 // Align buffers to 16-byte boundaries.
@@ -448,7 +479,7 @@ Status CreateHloProfilingArtifacts(
         computation_to_profile_idx,
     std::unique_ptr<HloProfileIndexMap>* hlo_profile_index_map,
     std::unique_ptr<HloProfilePrinterData>* hlo_profile_printer_data) {
-  *hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(module);
+  *hlo_profile_index_map = absl::make_unique<HloProfileIndexMap>(module);
   const HloComputation& entry_computation = *module.entry_computation();
 
   TF_ASSIGN_OR_RETURN(
@@ -515,11 +546,11 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
       &pre_optimization_ir_hook, &post_optimization_ir_hook));
 
   // Compile must be thread-safe so create a new LLVM context for the module.
-  auto llvm_context = xla::MakeUnique<llvm::LLVMContext>();
+  auto llvm_context = absl::make_unique<llvm::LLVMContext>();
   auto llvm_module =
-      xla::MakeUnique<llvm::Module>("__compute_module", *llvm_context);
+      absl::make_unique<llvm::Module>("__compute_module", *llvm_context);
 
-  auto jit = xla::MakeUnique<SimpleOrcJIT>(
+  auto jit = absl::make_unique<SimpleOrcJIT>(
       CompilerTargetOptions(module->config()),
       CodeGenOptLevel(module->config()),
       options::OptimizeForSizeRequested(module->config()),
@@ -561,12 +592,12 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // temporary buffers are required to run the computation.
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<BufferAssignment> assignment,
-      BufferAssigner::Run(
-          module.get(),
-          xla::MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
-          BufferSizeBytesFunction(), memory_alignment,
-          /*allow_input_output_aliasing=*/false,
-          /*allocate_buffers_for_constants=*/true));
+      BufferAssigner::Run(module.get(),
+                          absl::make_unique<SequentialHloOrdering>(
+                              module.get(), module_sequence),
+                          BufferSizeBytesFunction(), memory_alignment,
+                          /*allow_input_output_aliasing=*/false,
+                          /*allocate_buffers_for_constants=*/true));
   // BufferAssignment::ToString() includes a header, so no need for us to
   // print one ourselves.
   XLA_VLOG_LINES(2, assignment->ToString());
@@ -711,7 +742,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
   llvm::StringRef cpu_name = llvm_ir::AsStringRef(options.cpu_name());
   llvm::StringRef features = llvm_ir::AsStringRef(options.features());
   llvm::CodeGenOpt::Level opt_level = CodeGenOptLevel(modules[0]->config());
-  std::unique_ptr<llvm::TargetMachine> target_machine = WrapUnique(
+  std::unique_ptr<llvm::TargetMachine> target_machine = absl::WrapUnique(
       target->createTargetMachine(triple.getTriple(), cpu_name, features,
                                   CompilerTargetOptions(modules[0]->config()),
                                   reloc_model, llvm::None, opt_level));
@@ -752,7 +783,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
         std::unique_ptr<BufferAssignment> assignment,
         BufferAssigner::Run(
             module,
-            xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
+            absl::make_unique<SequentialHloOrdering>(module, module_sequence),
             BufferSizeBytesFunction(), memory_alignment,
             /*allow_input_output_aliasing=*/false,
             /*allocate_buffers_for_constants=*/true));
@@ -846,7 +877,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                         assignment->GetUniqueTopLevelOutputSlice());
 
-    results.emplace_back(MakeUnique<CpuAotCompilationResult>(
+    results.emplace_back(absl::make_unique<CpuAotCompilationResult>(
         std::move(object_file_data), std::move(buffer_infos),
         result_slice.index(), std::move(hlo_profile_printer_data)));
   }
@@ -869,7 +900,7 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
 static bool InitModule() {
   xla::Compiler::RegisterCompilerFactory(
       stream_executor::host::kHostPlatformId,
-      []() { return xla::MakeUnique<xla::cpu::CpuCompiler>(); });
+      []() { return absl::make_unique<xla::cpu::CpuCompiler>(); });
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 04e1c48872..47b5edabff 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/tf2xla/cpu_function_runtime.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_compiler.h"
@@ -157,6 +158,16 @@ class CpuCompiler : public LLVMCompiler {
   Status RunHloPasses(HloModule* module, bool is_aot_compile,
                       llvm::TargetMachine* target_machine);
 
+  // Runs HLO passes up to and including layout assignment.
+  Status RunHloPassesThroughLayoutAssn(
+      HloModule* module, bool /*is_aot_compile*/,
+      LLVMTargetMachineFeatures* target_machine_features);
+
+  // Runs HLO passes after layout assignment.
+  Status RunHloPassesAfterLayoutAssn(
+      HloModule* module, bool is_aot_compile,
+      LLVMTargetMachineFeatures* target_machine_features);
+
   TF_DISALLOW_COPY_AND_ASSIGN(CpuCompiler);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h
index 3313d1e6eb..d49f7d7cc2 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -32,11 +32,11 @@ namespace xla {
 // (module-scoped).
 class CpuCopyInsertion : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "copy-insertion"; }
+  absl::string_view name() const override { return "copy-insertion"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 };
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_COPY_INSERTION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index c376864c3e..fbcbbbd200 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
@@ -35,8 +37,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -177,12 +177,12 @@ Status CpuExecutable::ExecuteComputeFunction(
         buffer_pointers.size(), profile_counters_size);
     VLOG(3) << tensorflow::strings::Printf("    result = %p", result_buffer);
     auto ptr_printer = [](string* out, const void* p) {
-      tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
+      absl::StrAppend(out, tensorflow::strings::Printf("%p", p));
     };
     VLOG(3) << "    params = nullptr";
     VLOG(3) << tensorflow::strings::Printf(
         "    temps = [%s]",
-        tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
+        absl::StrJoin(buffer_pointers, ", ", ptr_printer).c_str());
     VLOG(3) << tensorflow::strings::Printf("    profile_counters = %p",
                                            profile_counters);
   }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h
index 2924b63659..6af724b2a5 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h
@@ -28,9 +28,7 @@ class CpuHloSupportChecker : public HloPassInterface {
   CpuHloSupportChecker() = default;
   ~CpuHloSupportChecker() override = default;
 
-  tensorflow::StringPiece name() const override {
-    return "cpu_hlo_support_checker";
-  }
+  absl::string_view name() const override { return "cpu_hlo_support_checker"; }
 
   // Note: always returns false (no instructions are ever modified by this
   // pass).
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 991b14f17d..c3e03056f0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <set>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
@@ -697,8 +698,9 @@ void CreateComputationForDotAddOutputFusionTest(const string& test_name,
       HloInstruction::CreateBinary(dot_shape, HloOpcode::kAdd, dot, addend));
 
   if (add_extra_use_for_dot) {
+    auto* token = builder.AddInstruction(HloInstruction::CreateToken());
     builder.AddInstruction(
-        HloInstruction::CreateOutfeed(dot_shape, dot, "no_config"));
+        HloInstruction::CreateOutfeed(dot_shape, dot, token, "no_config"));
   }
 
   module->AddEntryComputation(builder.Build());
@@ -772,8 +774,8 @@ class GatherLoopFusionTest
 
 TEST_P(GatherLoopFusionTest, GatherLoopFusion) {
   const GatherLoopFusionTestSpec& spec = GetParam();
-  string hlo_string = tensorflow::strings::StrCat(
-      "HloModule ", spec.test_name, "\n\n", spec.hlo_computation_text);
+  string hlo_string = absl::StrCat("HloModule ", spec.test_name, "\n\n",
+                                   spec.hlo_computation_text);
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseHloString(hlo_string));
 
@@ -791,11 +793,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[3,2] broadcast(one), dimensions={}
   ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted)
@@ -807,11 +809,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,3,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,3,2] broadcast(one), dimensions={}
   ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted)
@@ -823,11 +825,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=2,
-      window_bounds={1, 1}
+      slice_sizes={1, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -839,11 +841,11 @@ ENTRY main {
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -855,11 +857,11 @@ ENTRY main {
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -871,11 +873,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   gather = s32[1,1] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={0,1},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   one = s32[] constant(1)
   one_broadcasted = s32[1,1] broadcast(one), dimensions={}
   ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted)
@@ -887,11 +889,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,1,1] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,1,1] broadcast(one), dimensions={}
   ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index aa872d5ec9..bfecbd6e01 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -34,8 +34,8 @@ namespace cpu {
 // instruction stream.
 
 namespace {
-using ::tensorflow::gtl::nullopt;
-using ::tensorflow::gtl::optional;
+using absl::nullopt;
+using absl::optional;
 
 using ShouldMakeOperandColMajorCache =
     tensorflow::gtl::FlatMap<const HloInstruction*, bool>;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index 3ed7876715..b8ace57026 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -15,8 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
 
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace {
 
@@ -45,17 +46,16 @@ bool VectorizedReduceDisabled(const HloModuleConfig& config) {
   return extra_options_map.count(kXlaOptimizeForSizeCpuOption) > 0;
 }
 
-tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
-    const HloModuleConfig& config) {
+absl::optional<int64> LlvmIrGemvTilingFactor(const HloModuleConfig& config) {
   const auto& extra_options_map =
       config.debug_options().xla_backend_extra_options();
   auto it = extra_options_map.find(kLlvmIrDotTilingFactor);
   int64 tiling_factor;
   if (it != extra_options_map.end() &&
-      tensorflow::strings::safe_strto64(it->second, &tiling_factor)) {
+      absl::SimpleAtoi(it->second, &tiling_factor)) {
     return tiling_factor;
   }
-  return tensorflow::gtl::nullopt;
+  return absl::nullopt;
 }
 
 bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
@@ -64,38 +64,37 @@ bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
   return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0;
 }
 
-static tensorflow::StringPiece RemoveSuffix(tensorflow::StringPiece str,
-                                            tensorflow::StringPiece suffix) {
+static absl::string_view RemoveSuffix(absl::string_view str,
+                                      absl::string_view suffix) {
   CHECK_GE(str.size(), suffix.size());
   CHECK_EQ(str.substr(str.size() - suffix.size()), suffix);
   return str.substr(0, str.size() - suffix.size());
 }
 
-tensorflow::gtl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
+absl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
     const HloModuleConfig& config) {
   const auto& extra_options_map =
       config.debug_options().xla_backend_extra_options();
   auto it = extra_options_map.find(kLlvmIrGemmTileSize);
   if (it == extra_options_map.end()) {
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
 
-  std::vector<string> tile_components =
-      tensorflow::str_util::Split(it->second, ':');
+  std::vector<string> tile_components = absl::StrSplit(it->second, ':');
   CHECK_EQ(tile_components.size(), 3);
 
   int64 tile_size_m;
   int64 tile_size_k;
   int64 tile_size_n_in_vector_width;
 
-  CHECK(tensorflow::strings::safe_strto64(tile_components[0], &tile_size_m));
-  CHECK(tensorflow::strings::safe_strto64(tile_components[1], &tile_size_k));
+  CHECK(absl::SimpleAtoi(tile_components[0], &tile_size_m));
+  CHECK(absl::SimpleAtoi(tile_components[1], &tile_size_k));
 
-  tensorflow::StringPiece tile_size_n_in_vector_width_str =
+  absl::string_view tile_size_n_in_vector_width_str =
       RemoveSuffix(tile_components[2], "*vectwidth");
 
-  CHECK(tensorflow::strings::safe_strto64(tile_size_n_in_vector_width_str,
-                                          &tile_size_n_in_vector_width));
+  CHECK(absl::SimpleAtoi(tile_size_n_in_vector_width_str,
+                         &tile_size_n_in_vector_width));
 
   return std::tuple<int64, int64, int64>(tile_size_m, tile_size_k,
                                          tile_size_n_in_vector_width);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 429b9e16cb..47c7eb13b6 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -27,9 +27,8 @@ namespace options {
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
 bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
-tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
-    const HloModuleConfig& config);
-tensorflow::gtl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
+absl::optional<int64> LlvmIrGemvTilingFactor(const HloModuleConfig& config);
+absl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
     const HloModuleConfig& config);
 
 }  // namespace options
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index 2ac950e6d9..bc4cfc0999 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #include <string>
 #include <tuple>
 
+#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
@@ -46,7 +46,7 @@ std::unique_ptr<Array2D<float>> MaybeTransposeArray2D(const Array2D<T>& array,
   if (transpose) {
     std::swap(output_width, output_height);
   }
-  auto output = MakeUnique<Array2D<float>>(output_height, output_width);
+  auto output = absl::make_unique<Array2D<float>>(output_height, output_width);
   for (int y = 0; y < array.height(); y++) {
     for (int x = 0; x < array.width(); x++) {
       if (transpose) {
@@ -93,7 +93,7 @@ std::unique_ptr<Array2D<float>> EigenMatrixMultiply(const Array2D<float>& a,
 
   // Since we're going to transpose c before returning it. Swap the order of the
   // dimension sizes to ensure the returned array is properly dimensioned.
-  auto c_transpose = MakeUnique<Array2D<float>>(n, m);
+  auto c_transpose = absl::make_unique<Array2D<float>>(n, m);
   if (single_threaded) {
     __xla_cpu_runtime_EigenSingleThreadedMatMulF32(
         nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(),
@@ -204,7 +204,7 @@ std::unique_ptr<Array2D<float>> MKLMatrixMultiply(const Array2D<float>& a,
 
   // Since we're going to transpose c before returning it, swap the order of the
   // dimension sizes to ensure the returned array is properly dimensioned.
-  auto c_transpose = MakeUnique<Array2D<float>>(n, m);
+  auto c_transpose = absl::make_unique<Array2D<float>>(n, m);
   if (single_threaded) {
     __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
         nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(),
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index 59bc7e0e16..b07cd675ff 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
@@ -256,7 +257,7 @@ StatusOr<Shape> CpuTransferManager::TransferBuffersFromOutfeedInternal(
     VLOG(2)
         << "Enqueueing outfeed buffer (for the device to populate) of length "
         << size_32 << "B";
-    buffers.emplace_back(MakeUnique<CpuOutfeedBuffer>(b.first, size_32));
+    buffers.emplace_back(absl::make_unique<CpuOutfeedBuffer>(b.first, size_32));
   }
 
   std::vector<cpu::runtime::XfeedBuffer*> buffer_pointers;
@@ -283,7 +284,7 @@ StatusOr<Shape> CpuTransferManager::TransferBuffersFromOutfeedInternal(
 }  // namespace xla
 
 static std::unique_ptr<xla::TransferManager> CreateCpuTransferManager() {
-  return xla::MakeUnique<xla::CpuTransferManager>();
+  return absl::make_unique<xla::CpuTransferManager>();
 }
 
 static bool InitModule() {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
index 80ef953d53..7b938e9fd7 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TRANSFER_MANAGER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TRANSFER_MANAGER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_TRANSFER_MANAGER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_TRANSFER_MANAGER_H_
 
 #include <vector>
 
@@ -76,4 +76,4 @@ class CpuTransferManager : public GenericTransferManager {
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TRANSFER_MANAGER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_TRANSFER_MANAGER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index f2ac742b6e..4af16f4fa0 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -146,9 +147,9 @@ class GemvConfig {
   bool has_addend() const { return has_addend_; }
 
   string GetCacheKey() const {
-    return tensorflow::strings::StrCat(
-        name_, "_", PrimitiveType_Name(scalar_type()), "_", tile_rows(), "_",
-        tile_cols(), "_", m(), "_", k(), has_addend() ? "_with_addend" : "");
+    return absl::StrCat(name_, "_", PrimitiveType_Name(scalar_type()), "_",
+                        tile_rows(), "_", tile_cols(), "_", m(), "_", k(),
+                        has_addend() ? "_with_addend" : "");
   }
 
  protected:
@@ -621,19 +622,19 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
 }
 
 // This class implements a tiled matrix multiplication algorithm, intended for
-// use as the innermost GEBP loop in a GEMM kernel (GEBP is described in "Goto,
-// Kazushige, and Robert Van De Geijn. "High-performance implementation of the
-// level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1 (2008):
-// 4).
+// multiplying small matrices that don't need cache tiling.
+//
+// In the future this can be used as the innermost GEBP loop in a GEMM kernel as
+// described in "Goto, Kazushige, and Robert A. Geijn. "Anatomy of
+// high-performance matrix multiplication." ACM Transactions on Mathematical
+// Software (TOMS) 34.3 (2008): 12.".
 //
 // This only supports canonical dot operations (i.e. where the lhs contraction
 // dimension is 1 and the rhs contraction dimension is 0) over row major
 // matrices.
-class MatrixMatrixBlockPanelEmitter {
+class TiledSmallGemmEmitter {
  public:
-  // Describe the dimensions of the GEBP kernel.  These will usually not be the
-  // dimensions of the GEMM itself, the GEMM will usually be broken up into GEBP
-  // kernels with smaller dimensions.
+  // Describe the dimensions of the kernel.
   class Dimensions {
    public:
     explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {}
@@ -642,9 +643,7 @@ class MatrixMatrixBlockPanelEmitter {
     int64 k() const { return k_; }
     int64 n() const { return n_; }
 
-    string ToString() const {
-      return tensorflow::strings::StrCat(m(), "x", k(), "x", n());
-    }
+    string ToString() const { return absl::StrCat(m(), "x", k(), "x", n()); }
 
    private:
     const int64 m_;
@@ -652,9 +651,9 @@ class MatrixMatrixBlockPanelEmitter {
     const int64 n_;
   };
 
-  // Represents the configuration of the GEBP emitter.  The LLVM IR emitted by
-  // the emitter, modulo the LLVM values holding the input and output buffers,
-  // must be a function of the instance of `Config` passed to it.
+  // Represents the configuration of the emitter.  The LLVM IR emitted by the
+  // emitter, modulo the LLVM values holding the input and output buffers, must
+  // be a function of the instance of `Config` passed to it.
   //
   // `dims` holds the matrix multiplication dimensions.
   //
@@ -687,10 +686,10 @@ class MatrixMatrixBlockPanelEmitter {
           tile_size_k_(tile_size_k) {}
 
     string GetCacheKey() const {
-      return tensorflow::strings::StrCat(
-          "gebp_", PrimitiveType_Name(scalar_type()), "_", dims().ToString(),
-          "_", max_vectorization_width(), "_", min_vectorization_width(), "_",
-          tile_size_m(), "_", tile_size_k());
+      return absl::StrCat("gemm_", PrimitiveType_Name(scalar_type()), "_",
+                          dims().ToString(), "_", max_vectorization_width(),
+                          "_", min_vectorization_width(), "_", tile_size_m(),
+                          "_", tile_size_k());
     }
 
     PrimitiveType scalar_type() const { return scalar_type_; }
@@ -712,11 +711,11 @@ class MatrixMatrixBlockPanelEmitter {
     int64 tile_size_k_;
   };
 
-  // Creates an instance of MatrixMatrixBlockPanelEmitter that matrix-multiplies
+  // Creates an instance of TiledSmallGemmEmitter that matrix-multiplies
   // `lhs` with `rhs` and stores the result in `result`.
-  explicit MatrixMatrixBlockPanelEmitter(Config config, llvm::Value* lhs,
-                                         llvm::Value* rhs, llvm::Value* result,
-                                         llvm::IRBuilder<>* b)
+  explicit TiledSmallGemmEmitter(Config config, llvm::Value* lhs,
+                                 llvm::Value* rhs, llvm::Value* result,
+                                 llvm::IRBuilder<>* b)
       : lhs_(lhs),
         rhs_(rhs),
         result_(result),
@@ -780,9 +779,9 @@ class MatrixMatrixBlockPanelEmitter {
   KernelSupportLibrary ksl_;
 };
 
-void MatrixMatrixBlockPanelEmitter::Emit() { HandleResiduesOnN(); }
+void TiledSmallGemmEmitter::Emit() { HandleResiduesOnN(); }
 
-void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
+void TiledSmallGemmEmitter::HandleResiduesOnN() {
   // We can only iterate the `n` dimension for an extent that is divisible by
   // the vectorization width.  So we emit an outer loop that first processes the
   // largest extent in `n` that is divisible by max_vectorization_width, then
@@ -799,7 +798,7 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
     int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
     if (n_start != n_end) {
       VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
-                               "gebp");
+                               "gemm");
       HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
       n_start = n_end;
     }
@@ -813,7 +812,7 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
   }
 
   if (n_start != dims().n()) {
-    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gebp");
+    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm");
     ksl_.ForReturnVoid("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
       llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
       HandleResiduesOnK(&vsl, n_i, n_i_next);
@@ -821,9 +820,9 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
   }
 }
 
-void MatrixMatrixBlockPanelEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
-                                                      llvm::Value* n_start,
-                                                      llvm::Value* n_end) {
+void TiledSmallGemmEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
+                                              llvm::Value* n_start,
+                                              llvm::Value* n_end) {
   int64 k_start = 0;
   int64 k_end = dims().k() - (dims().k() % tile_size_k());
   if (k_end != k_start) {
@@ -838,7 +837,7 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
   }
 }
 
-void MatrixMatrixBlockPanelEmitter::HandleResiduesOnM(
+void TiledSmallGemmEmitter::HandleResiduesOnM(
     VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
     llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
   const int64 m_end = dims().m() - dims().m() % tile_size_m();
@@ -921,7 +920,7 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnM(
 //   +-------------------+-------------------+-------------------+---------
 //   | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 |  ...
 //   +-------------------+-------------------+-------------------+---------
-void MatrixMatrixBlockPanelEmitter::EmitTiledGemm(
+void TiledSmallGemmEmitter::EmitTiledGemm(
     VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
     llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
     int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
@@ -1001,12 +1000,22 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
   return dot_emitter.Emit();
 }
 
-bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
+bool DotOpEmitter::EmitSmallGemmIfProfitable(
     const DotOpEmitter::MatMultDims& mat_mult_dims) {
-  if (!EnableExperimentalLlvmIrGemm() || ShouldUseMultiThreadedEigen()) {
+  if (ShouldUseMultiThreadedEigen()) {
     return false;
   }
 
+  if (!EnableExperimentalLlvmIrGemm()) {
+    // TODO(sanjoy):  We should make these numbers micro-arch specific.
+    bool small_gemm = mat_mult_dims.k <= 128 &&
+                      ((mat_mult_dims.m <= 32 && mat_mult_dims.n <= 128) ||
+                       (mat_mult_dims.m <= 128 && mat_mult_dims.n <= 32));
+    if (!small_gemm) {
+      return false;
+    }
+  }
+
   if (mat_mult_dims.lhs_non_canonical || mat_mult_dims.rhs_non_canonical) {
     return false;
   }
@@ -1054,15 +1063,15 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
   std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) =
       GetGemmTileSize();
 
-  MatrixMatrixBlockPanelEmitter::Config config(
+  TiledSmallGemmEmitter::Config config(
       /*scalar_type=*/primitive_type,
-      MatrixMatrixBlockPanelEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
+      TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
       /*max_vectorization_width=*/max_target_vector_width,
       /*max_vector_count=*/tile_size_n_in_vector_width,
       /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
       /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
 
-  VLOG(2) << "Emitting GEBP kernel in LLVM IR with config "
+  VLOG(2) << "Emitting GEMM kernel in LLVM IR with config "
           << config.GetCacheKey();
 
   const bool enable_fast_math =
@@ -1075,10 +1084,10 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
       /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), lhs,
       rhs, target,
       [this, config](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* target) {
-        MatrixMatrixBlockPanelEmitter gebp_emitter(config, /*lhs=*/lhs,
-                                                   /*rhs=*/rhs,
-                                                   /*result=*/target, b_);
-        gebp_emitter.Emit();
+        TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs,
+                                                 /*rhs=*/rhs,
+                                                 /*result=*/target, b_);
+        small_gemm_emitter.Emit();
       });
 
   return true;
@@ -1136,7 +1145,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   }
 
   if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) {
-    return EmitExperimentalGebpDotIfEnabled(mat_mult_dims);
+    return EmitSmallGemmIfProfitable(mat_mult_dims);
   }
 
   int64 tiling_factor = GetGemvTilingFactor();
@@ -1610,7 +1619,7 @@ bool PotentiallyImplementedAsEigenDot(
 
 // For vector-matrix dot products, it is always profitable to make the Rhs
 // column major.
-tensorflow::gtl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
+absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
     const HloInstruction& hlo) {
   if (hlo.opcode() == HloOpcode::kDot && hlo.shape().dimensions_size() == 2 &&
       hlo.shape().dimensions(0) == 1) {
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index 590032fbe9..4c2041b556 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_H_
 
+#include "absl/strings/string_view.h"
 #include "llvm/IR/IRBuilder.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -38,7 +38,7 @@ bool PotentiallyImplementedAsEigenDot(
 // Returns the index for an operand to `hlo` that should ideally be column
 // major.  Returns nullopt if there is no such operand or if `hlo` is not a dot
 // or a fusion containing a dot.
-tensorflow::gtl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
+absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
     const HloInstruction& hlo);
 
 // Returns true to indicate that we can generate a tiled LLVM IR implementation
@@ -121,7 +121,7 @@ class DotOpEmitter {
   // of rank 2 as well).
   MatMultDims GetMatMultDims() const;
 
-  bool EmitExperimentalGebpDotIfEnabled(const MatMultDims& mat_mult_dims);
+  bool EmitSmallGemmIfProfitable(const MatMultDims& mat_mult_dims);
 
   // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
   // registers.
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 6f433b4f30..417a1dba1f 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "absl/strings/str_cat.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/BasicBlock.h"
@@ -67,7 +68,6 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
@@ -502,7 +502,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
 llvm::Value* IrEmitter::EmitElementalMap(
     const HloMapInstruction& map_instr,
     tensorflow::gtl::ArraySlice<llvm::Value*> elemental_operands,
-    tensorflow::StringPiece name) {
+    absl::string_view name) {
   return EmitThreadLocalCall(*map_instr.to_apply(), elemental_operands, name);
 }
 
@@ -846,7 +846,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
         loops
             .AddLoop(
                 0, rhs->shape().dimensions(dnums.kernel_spatial_dimensions(i)),
-                tensorflow::strings::StrCat("k", i))
+                absl::StrCat("k", i))
             ->GetIndVarValue();
   }
   llvm::Value* input_feature =
@@ -2118,7 +2118,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
 
 Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
   gtl::ArraySlice<HloInstruction*> operands(custom_call->operands());
-  tensorflow::StringPiece custom_call_target(custom_call->custom_call_target());
+  absl::string_view custom_call_target(custom_call->custom_call_target());
   llvm::Type* i8_ptr_type = b_.getInt8PtrTy();
   llvm::AllocaInst* operands_alloca =
       llvm_ir::EmitAllocaAtFunctionEntryWithCount(
@@ -2687,9 +2687,8 @@ llvm::Value* IrEmitter::EmitThreadLocalTempBufferPointer(
     auto buf_it = thread_local_buffers_.find(key);
     if (buf_it == thread_local_buffers_.end()) {
       llvm::Value* buffer = llvm_ir::EmitAllocaAtFunctionEntry(
-          IrShapeType(shape),
-          tensorflow::strings::StrCat("thread_local", slice.ToString()), &b_,
-          MinimumAlignmentForShape(target_shape));
+          IrShapeType(shape), absl::StrCat("thread_local", slice.ToString()),
+          &b_, MinimumAlignmentForShape(target_shape));
       auto it_inserted_pair = thread_local_buffers_.insert({key, buffer});
       CHECK(it_inserted_pair.second);
       buf_it = it_inserted_pair.first;
@@ -2753,7 +2752,7 @@ Status IrEmitter::EmitTargetElementLoop(
 }
 
 Status IrEmitter::EmitTargetElementLoop(
-    HloInstruction* target_op, tensorflow::StringPiece desc,
+    HloInstruction* target_op, absl::string_view desc,
     const llvm_ir::ElementGenerator& element_generator) {
   VLOG(2) << "EmitTargetElementLoop: " << target_op->ToString();
 
@@ -2848,7 +2847,7 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
 llvm::Value* IrEmitter::EmitThreadLocalCall(
     const HloComputation& callee,
     tensorflow::gtl::ArraySlice<llvm::Value*> parameters,
-    tensorflow::StringPiece name) {
+    absl::string_view name) {
   const Shape& return_shape = callee.root_instruction()->shape();
 
   // Lifting this restriction to allow "small" arrays should be easy.  Allowing
@@ -2869,7 +2868,7 @@ llvm::Value* IrEmitter::EmitThreadLocalCall(
 
   llvm::Value* return_value_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
       llvm_ir::PrimitiveTypeToIrType(return_type, module_),
-      tensorflow::strings::StrCat(name, "_retval_addr"), &b_,
+      absl::StrCat(name, "_retval_addr"), &b_,
       MinimumAlignmentForPrimitiveType(return_type));
 
   b_.CreateCall(
@@ -2886,7 +2885,7 @@ llvm::Value* IrEmitter::EmitThreadLocalCall(
 }
 
 void IrEmitter::EmitGlobalCall(const HloComputation& callee,
-                               tensorflow::StringPiece name) {
+                               absl::string_view name) {
   b_.CreateCall(FindOrDie(emitted_functions_, &callee),
                 GetArrayFunctionCallArguments(
                     /*parameter_addresses=*/{}, &b_, name,
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index c9a1dab62d..99c080b3db 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -44,7 +45,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/macros.h"
@@ -107,7 +107,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   llvm::Value* EmitElementalMap(
       const HloMapInstruction& map_instr,
       tensorflow::gtl::ArraySlice<llvm::Value*> elemental_operands,
-      tensorflow::StringPiece name);
+      absl::string_view name);
 
  protected:
   //
@@ -239,7 +239,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // function that a map operation applies.
   StatusOr<llvm::Function*> EmitFunction(
       HloComputation* function,  // The function to emit.
-      tensorflow::StringPiece
+      absl::string_view
           function_name_suffix);  // Used for LLVM IR register names.
 
   // Emits a call to a thread local function (e.g. to the computation nested
@@ -251,14 +251,13 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   llvm::Value* EmitThreadLocalCall(
       const HloComputation& callee,
       tensorflow::gtl::ArraySlice<llvm::Value*> parameters,
-      tensorflow::StringPiece name);
+      absl::string_view name);
 
   // Emits a call to a "global" function (e.g. to the computation nested within
   // a kWhile or a kCall).  Buffer assignment unabiguously assignes buffers to
   // the parameters and return values for these computations so there is no need
   // to explicitly pass parameters or return results.
-  void EmitGlobalCall(const HloComputation& callee,
-                      tensorflow::StringPiece name);
+  void EmitGlobalCall(const HloComputation& callee, absl::string_view name);
 
   // Returns the buffer to which a global call to `callee` would have written
   // its result.
@@ -285,7 +284,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       HloInstruction* target_op,
       const llvm_ir::ElementGenerator& element_generator);
   Status EmitTargetElementLoop(
-      HloInstruction* target_op, tensorflow::StringPiece desc,
+      HloInstruction* target_op, absl::string_view desc,
       const llvm_ir::ElementGenerator& element_generator);
 
   // Emits a memcpy from the source instruction's result value to the
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
index 2db4d000f5..784045313d 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/ir_function.h"
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
@@ -189,7 +190,7 @@ void IrFunction::Initialize(const string& function_name,
 llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
   CHECK_GT(num_dynamic_loop_bounds_, 0);
   CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
-  string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
+  string name = absl::StrCat("dynamic_loop_bound_", offset);
   return b_->CreateLoad(b_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
                                       b_->getInt64(offset), AsStringRef(name)));
 }
@@ -200,7 +201,7 @@ llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
 // address buffer).
 std::vector<llvm::Value*> GetArrayFunctionCallArguments(
     tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::IRBuilder<>* b, tensorflow::StringPiece name,
+    llvm::IRBuilder<>* b, absl::string_view name,
     llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
     llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg) {
   llvm::Value* parameter_addresses_buffer;
@@ -211,13 +212,13 @@ std::vector<llvm::Value*> GetArrayFunctionCallArguments(
   } else {
     parameter_addresses_buffer = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
         b->getInt8PtrTy(), b->getInt32(parameter_addresses.size()),
-        tensorflow::strings::StrCat(name, "_parameter_addresses"), b);
+        absl::StrCat(name, "_parameter_addresses"), b);
 
     for (size_t i = 0; i < parameter_addresses.size(); ++i) {
       llvm::Value* parameter_as_i8ptr =
           b->CreateBitCast(parameter_addresses[i], b->getInt8PtrTy(),
-                           AsStringRef(tensorflow::strings::StrCat(
-                               name, "_parameter_", i, "_address_as_i8ptr")));
+                           AsStringRef(absl::StrCat(name, "_parameter_", i,
+                                                    "_address_as_i8ptr")));
       llvm::Value* slot_in_param_addresses =
           b->CreateInBoundsGEP(parameter_addresses_buffer, {b->getInt64(i)});
       b->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
@@ -320,8 +321,7 @@ Status EmitCallToParallelForkJoin(
       /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
       /*Initializer=*/partitions_array,
       /*Name=*/
-      AsStringRef(
-          tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
+      AsStringRef(absl::StrCat(name, "_parallel_dimension_partitions")));
 
   // Add argument specifying parallel dimension partitions.
   fork_join_arguments.push_back(
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
index a41cbb64cd..ee7595f6e9 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -116,7 +116,7 @@ class IrFunction {
 // Returns an array of compute function call argument ir values.
 std::vector<llvm::Value*> GetArrayFunctionCallArguments(
     tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::IRBuilder<>* b, tensorflow::StringPiece name,
+    llvm::IRBuilder<>* b, absl::string_view name,
     llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
     llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg);
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index 8560e4296a..aedb069dce 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -30,8 +30,8 @@ ParallelLoopEmitter::ParallelLoopEmitter(
       dynamic_loop_bounds_(dynamic_loop_bounds) {}
 
 std::vector<llvm_ir::IrArray::Index>
-ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
-    tensorflow::StringPiece loop_name, llvm::Type* index_type) {
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name,
+                                                   llvm::Type* index_type) {
   CHECK_NE(index_type, nullptr);
 
   CHECK(!ShapeUtil::IsTuple(shape_));
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
index 076c683ca5..a604e1db22 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -61,7 +61,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ~ParallelLoopEmitter() override = default;
 
   std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
-      tensorflow::StringPiece loop_name, llvm::Type* index_type) override;
+      absl::string_view loop_name, llvm::Type* index_type) override;
 
  private:
   const DynamicLoopBounds* dynamic_loop_bounds_;
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index 4fa5984b04..b4c0c09ec0 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
@@ -109,7 +111,7 @@ ParallelTaskAssignment::ParallelTaskAssignment(
     : target_machine_features_(*target_machine_features) {
   VLOG(1) << "ParallelTaskAssignment max_parallelism: " << max_parallelism;
   // Run cost analysis on 'module'.
-  auto cost_analysis = MakeUnique<HloCostAnalysis>(shape_size);
+  auto cost_analysis = absl::make_unique<HloCostAnalysis>(shape_size);
   HloComputation* computation = module->entry_computation();
   Status status = computation->root_instruction()->Accept(cost_analysis.get());
   if (status.ok()) {
@@ -216,8 +218,7 @@ bool ParallelTaskAssigner::AssignParallelTasksHelper(
 
     // Outline 'instruction' in 'computation' for parallel task assignment.
     auto* call = module->OutlineExpressionFromComputation(
-        {instruction},
-        tensorflow::strings::StrCat("parallel_", instruction->name()),
+        {instruction}, absl::StrCat("parallel_", instruction->name()),
         computation);
 
     // Set assigned dimension partitioning to 'instruction'.
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
index 8becc8fa23..a99cd99c14 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
@@ -73,7 +73,7 @@ class ParallelTaskAssigner : public HloPassInterface {
         target_machine_features_(*target_machine_features) {}
   ~ParallelTaskAssigner() override {}
 
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "cpu-parallel-task-assigner";
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
index 36c9f74385..a84ee78b19 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace {
@@ -36,7 +35,9 @@ class ParallelTaskAssignmentTest : public HloVerifiedTestBase {
   cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features_;
 
   ParallelTaskAssignmentTest()
-      : target_machine_features_([](int64 shape_size) {
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false),
+        target_machine_features_([](int64 shape_size) {
           return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
         }) {}
 
@@ -110,9 +111,10 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) {
   const string hlo_string = R"(
     HloModule TestTaskParallel_infeed_outfeed
     ENTRY InfeedOutfeed {
-      infeed0 = (u32[12345678,2]{1,0}, token[]) infeed()
+      token = token[] after-all()
+      infeed0 = (u32[12345678,2]{1,0}, token[]) infeed(token)
       infeed0.data = u32[12345678,2]{1,0} get-tuple-element((u32[12345678,2]{1,0}, token[]) infeed0), index=0
-      ROOT outfeed0 = token[] outfeed(infeed0.data)
+      ROOT outfeed0 = token[] outfeed(infeed0.data, token)
     }
   )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index be772cfb7e..bf98064647 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -20,13 +20,13 @@ limitations under the License.
 #include <list>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Host.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h"
@@ -170,15 +170,14 @@ namespace {
 bool RegisterKnownJITSymbols() {
   CustomCallTargetRegistry* registry = CustomCallTargetRegistry::Global();
 
-#define REGISTER_CPU_RUNTIME_SYMBOL(base_name)                                \
-  do {                                                                        \
-    auto* function_address =                                                  \
-        reinterpret_cast<void*>(__xla_cpu_runtime_##base_name);               \
-    registry->Register(xla::cpu::runtime::k##base_name##SymbolName,           \
-                       function_address);                                     \
-    CHECK_EQ(                                                                 \
-        tensorflow::StringPiece(xla::cpu::runtime::k##base_name##SymbolName), \
-        "__xla_cpu_runtime_" #base_name);                                     \
+#define REGISTER_CPU_RUNTIME_SYMBOL(base_name)                               \
+  do {                                                                       \
+    auto* function_address =                                                 \
+        reinterpret_cast<void*>(__xla_cpu_runtime_##base_name);              \
+    registry->Register(xla::cpu::runtime::k##base_name##SymbolName,          \
+                       function_address);                                    \
+    CHECK_EQ(absl::string_view(xla::cpu::runtime::k##base_name##SymbolName), \
+             "__xla_cpu_runtime_" #base_name);                               \
   } while (false)
 
   REGISTER_CPU_RUNTIME_SYMBOL(AcquireInfeedBufferForDequeue);
diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD
index 181cec3cdd..2384166fd2 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD
@@ -51,6 +51,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -94,6 +95,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:filecheck",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
         "@llvm//:core",
     ],
 )
@@ -108,6 +110,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -121,6 +124,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
index 6fcce42eaa..fcd87b36b3 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #include <cctype>
 #include <string>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
index d98856fdbf..b68ac67574 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
index 973aac8766..9457e57d7b 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <cctype>
 #include <string>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -32,9 +32,9 @@ const char* const kTriple_android_arm = "armv7-none-android";
 
 struct IntrinsicTestSpec {
   HloOpcode opcode;
-  tensorflow::StringPiece triple;
-  tensorflow::StringPiece features;
-  tensorflow::StringPiece check_lines;
+  absl::string_view triple;
+  absl::string_view features;
+  absl::string_view check_lines;
 };
 
 // Tests that unary functions get lowered using intrinsic calls.
@@ -65,9 +65,8 @@ class CpuUnaryIntrinsicTest
       features = "";
     }
 
-    return tensorflow::strings::StrCat(opcode.c_str(), "_On_", triple.c_str(),
-                                       features.empty() ? "" : "_With",
-                                       features.c_str());
+    return absl::StrCat(opcode.c_str(), "_On_", triple.c_str(),
+                        features.empty() ? "" : "_With", features.c_str());
   }
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index 90b99c828e..3b87683fff 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -38,7 +38,8 @@ while_body {
 
 while_cond {
   arg_cond = f32[2,3,2] parameter(0)
-  infeed = (pred[], token[]) infeed()
+  token = token[] after-all()
+  infeed = (pred[], token[]) infeed(token)
   ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0
 }
 
@@ -50,8 +51,9 @@ ENTRY main {
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
   const_b = f32[2,3,2] while(f32[2,3,2] const_a), condition=while_cond, body=while_body
 
-  out0 = token[] outfeed(f32[2,3,2] const_a)
-  ROOT out1 = token[] outfeed(f32[2,3,2] const_b)
+  token = token[] after-all()
+  out0 = token[] outfeed(f32[2,3,2] const_a, token[] token)
+  ROOT out1 = token[] outfeed(f32[2,3,2] const_b, token[] token)
 }
 )";
 
@@ -85,7 +87,8 @@ while_body {
 
 while_cond {
   arg_cond = (f32[2,1]{1,0}, f32[1]{0}) parameter(0)
-  infeed = (pred[], token[]) infeed()
+  token = token[] after-all()
+  infeed = (pred[], token[]) infeed(token)
   ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0
 }
 
@@ -94,8 +97,9 @@ ENTRY main {
   const_a = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} ))
   const_b = (f32[2,1]{1,0}, f32[1]{0}) while((f32[2,1]{1,0}, f32[1]{0}) const_a), condition=while_cond, body=while_body
 
-  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a)
-  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b)
+  token = token[] after-all()
+  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a, token[] token)
+  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b, token[] token)
 }
 )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index 01daed4bcd..bb105194f1 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -62,7 +62,8 @@ TEST_F(CpuNoAliasTest, Concat) {
 
   // Now that we have an HLO module, build an llvm_ir::AliasAnalysis for it.
   auto status_or_buffer_assn = BufferAssigner::Run(
-      hlo_module.get(), MakeUnique<DependencyHloOrdering>(hlo_module.get()),
+      hlo_module.get(),
+      absl::make_unique<DependencyHloOrdering>(hlo_module.get()),
       backend().compiler()->BufferSizeBytesFunction(),
       [](LogicalBuffer::Color) { return /*alignment=*/1; });
   ASSERT_EQ(status_or_buffer_assn.status(), Status::OK());
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
index dac416e1c7..780c07f819 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
@@ -32,7 +32,8 @@ ENTRY main {
     {{{1, 2}, {1001, 1002}, {2001, 2002}},
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
 
-  outfeed = token[] outfeed(f32[2,3,2] const_a)
+  token = token[] after-all()
+  outfeed = token[] outfeed(f32[2,3,2] const_a, token)
   ROOT root = () tuple()
 }
 )";
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index 3274be8d9d..962ea69c09 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
 
+#include "absl/algorithm/container.h"
 #include "llvm/Support/raw_ostream.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
@@ -422,8 +423,8 @@ TileVariable::TileVariable(VectorSupportLibrary* vector_support,
 
 std::vector<llvm::Value*> TileVariable::Get() const {
   std::vector<llvm::Value*> result;
-  c_transform(storage_, std::back_inserter(result),
-              [&](VectorVariable vect_var) { return vect_var.Get(); });
+  absl::c_transform(storage_, std::back_inserter(result),
+                    [&](VectorVariable vect_var) { return vect_var.Get(); });
   return result;
 }
 
diff --git a/tensorflow/compiler/xla/service/defuser.h b/tensorflow/compiler/xla/service/defuser.h
index 56b28fd22d..c326beb899 100644
--- a/tensorflow/compiler/xla/service/defuser.h
+++ b/tensorflow/compiler/xla/service/defuser.h
@@ -29,7 +29,7 @@ class Defuser : public HloPassInterface {
  public:
   Defuser() {}
   ~Defuser() override {}
-  tensorflow::StringPiece name() const override { return "defuser"; }
+  absl::string_view name() const override { return "defuser"; }
 
   // Run defusion on the given module. Returns whether the module was
   // changed.
diff --git a/tensorflow/compiler/xla/service/defuser_test.cc b/tensorflow/compiler/xla/service/defuser_test.cc
index e727ba49cb..37d1895d41 100644
--- a/tensorflow/compiler/xla/service/defuser_test.cc
+++ b/tensorflow/compiler/xla/service/defuser_test.cc
@@ -26,6 +26,11 @@ namespace xla {
 namespace {
 
 class DefuserTest : public HloVerifiedTestBase {
+ public:
+  DefuserTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+
  protected:
   // Returns the number of fusion instructions in the module.
   int FusionCount() {
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
index d938f3a2c4..ba2a674d9a 100644
--- a/tensorflow/compiler/xla/service/despecializer.cc
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -21,8 +21,31 @@ limitations under the License.
 
 namespace xla {
 
+namespace {
+
+// Pass which strips control dependencies from all instructions in the module.
+class ControlDepRemover : public HloPassInterface {
+ public:
+  ControlDepRemover() = default;
+  absl::string_view name() const override { return "control-dep-remover"; }
+
+  StatusOr<bool> Run(HloModule* module) override {
+    bool changed = false;
+    for (HloComputation* computation : module->computations()) {
+      for (HloInstruction* instruction : computation->instructions()) {
+        changed = changed || !instruction->control_predecessors().empty();
+        TF_RETURN_IF_ERROR(instruction->DropAllControlDeps());
+      }
+    }
+    return changed;
+  }
+};
+
+}  // namespace
+
 Despecializer::Despecializer() : pipeline_("despecializer") {
   // TODO(b/70588125): Also deal with window reversal in a fast way.
+  pipeline_.AddPass<ControlDepRemover>();
   pipeline_.AddPass<Defuser>();
   pipeline_.AddPass<ImplicitBroadcastRemover>();
   pipeline_.AddPass<BFloat16MixedPrecisionRemoval>();
diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h
index cc1695b7f8..7be70add2f 100644
--- a/tensorflow/compiler/xla/service/despecializer.h
+++ b/tensorflow/compiler/xla/service/despecializer.h
@@ -33,7 +33,7 @@ namespace xla {
 class Despecializer : public HloPassInterface {
  public:
   Despecializer();
-  tensorflow::StringPiece name() const override { return "despecializer"; }
+  absl::string_view name() const override { return "despecializer"; }
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 86d57581f8..275e6cc61d 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -19,13 +19,13 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/macros.h"
@@ -208,7 +208,6 @@ class DfsHloVisitorBase {
 
   virtual Status HandleInfeed(HloInstructionPtr hlo) = 0;
   virtual Status HandleOutfeed(HloInstructionPtr hlo) = 0;
-  virtual Status HandleHostCompute(HloInstructionPtr hlo) = 0;
   virtual Status HandleRng(HloInstructionPtr hlo) = 0;
   virtual Status HandleReverse(HloInstructionPtr hlo) = 0;
   virtual Status HandleSort(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 617a5a2eb4..6ec4893f7a 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -16,13 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -106,9 +106,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleOutfeed(HloInstructionPtr outfeed) override {
     return DefaultAction(outfeed);
   }
-  Status HandleHostCompute(HloInstructionPtr host_compute) override {
-    return DefaultAction(host_compute);
-  }
   Status HandleReverse(HloInstructionPtr reverse) override {
     return DefaultAction(reverse);
   }
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
index 12faed6967..09cb10d6ee 100644
--- a/tensorflow/compiler/xla/service/dot_decomposer.cc
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -136,6 +136,7 @@ Status DecomposeBatchDot(HloInstruction* dot) {
     dot_dnums.add_rhs_contracting_dimensions(0);
     auto dot_r2 = computation->AddInstruction(HloInstruction::CreateDot(
         dot_shape_r2, lhs_slice_r2, rhs_slice_r2, dot_dnums));
+    dot_r2->set_precision_config(dot->precision_config());
 
     // Reshape Dot to R3 so we can concat along batch dimension.
     auto dot_r3 = computation->AddInstruction(
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.h b/tensorflow/compiler/xla/service/dot_decomposer.h
index 1959b687f1..fc38e31700 100644
--- a/tensorflow/compiler/xla/service/dot_decomposer.h
+++ b/tensorflow/compiler/xla/service/dot_decomposer.h
@@ -29,7 +29,7 @@ class DotDecomposer : public HloPassInterface {
   DotDecomposer(bool decompose_batch_dot = true)
       : decompose_batch_dot_(decompose_batch_dot) {}
   ~DotDecomposer() = default;
-  tensorflow::StringPiece name() const override { return "dot_decomposer"; }
+  absl::string_view name() const override { return "dot_decomposer"; }
 
   // Run DotDecomposer pass on computations in 'module'.
   // Returns whether the 'module' was changed.
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 2e9d6be2de..26af67cc1c 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include <vector>
 
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
@@ -38,17 +40,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
+using absl::StrCat;
 using llvm_ir::AsStringRef;
 using llvm_ir::IrArray;
 using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
-using tensorflow::strings::StrCat;
 
 namespace {
 
@@ -292,10 +293,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
       if (is_signed) {
         auto type =
             llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
-        auto zero = llvm::ConstantInt::get(type, 0);
-        auto cmp = b_->CreateICmpSGE(operand_value, zero);
-        return b_->CreateSelect(cmp, operand_value,
-                                b_->CreateNeg(operand_value));
+        auto cmp = b_->CreateICmpSGE(operand_value, GetZero(type));
+        return Select(cmp, operand_value, b_->CreateNeg(operand_value));
       } else {
         return operand_value;
       }
@@ -307,19 +306,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
                                           {operand_value->getType()}, b_);
     }
     case HloOpcode::kSign: {
-      bool is_signed =
-          primitive_util::IsSignedIntegralType(op->shape().element_type());
+      CHECK(primitive_util::IsSignedIntegralType(op->shape().element_type()))
+          << op->shape().element_type();
       auto type =
           llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
-      auto zero = llvm::ConstantInt::get(type, 0);
-      auto cmp = b_->CreateICmpEQ(operand_value, zero);
-      if (is_signed) {
-        auto ashr =
-            b_->CreateAShr(operand_value, type->getIntegerBitWidth() - 1);
-        return b_->CreateSelect(cmp, zero, b_->CreateOr(ashr, 1));
-      } else {
-        return b_->CreateSelect(cmp, zero, llvm::ConstantInt::get(type, 1));
-      }
+      auto cmp = b_->CreateICmpEQ(operand_value, GetZero(type));
+      auto ashr = b_->CreateAShr(operand_value, type->getIntegerBitWidth() - 1);
+      return Select(cmp, GetZero(type), b_->CreateOr(ashr, 1));
     }
     case HloOpcode::kNegate:
       return b_->CreateNeg(operand_value);
@@ -455,9 +448,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto oeq = b_->CreateFCmpOEQ(operand_value, zero);
       auto olt = b_->CreateFCmpOLT(operand_value, zero);
-      return b_->CreateSelect(
-          oeq, zero,
-          b_->CreateSelect(olt, llvm::ConstantFP::get(type, -1.0),
+      return Select(oeq, zero,
+                    Select(olt, llvm::ConstantFP::get(type, -1.0),
                            llvm::ConstantFP::get(type, 1.0)));
     }
     case HloOpcode::kIsFinite: {
@@ -675,7 +667,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto type = cplx_abs->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto oeq = b_->CreateFCmpOEQ(cplx_abs, zero);
-      return b_->CreateSelect(
+      return Select(
           oeq, EmitComposeComplex(op, zero, zero),
           EmitComposeComplex(
               op, b_->CreateFDiv(EmitExtractReal(operand_value), cplx_abs),
@@ -807,7 +799,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
       auto oeq = b_->CreateFCmpOEQ(rhs_sum_sq, zero);
       auto real_inf_or_nan = b_->CreateFDiv(EmitExtractReal(lhs_value), zero);
       auto imag_inf_or_nan = b_->CreateFDiv(EmitExtractImag(lhs_value), zero);
-      return b_->CreateSelect(
+      return Select(
           oeq, EmitComposeComplex(op, real_inf_or_nan, imag_inf_or_nan),
           EmitComposeComplex(
               op,
@@ -1005,7 +997,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type,
       llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_);
   auto x_is_small = b_->CreateFCmpOLT(
       abs_x, llvm::ConstantFP::get(type, kAntilogarithmIsSmallThreshold));
-  return b_->CreateSelect(x_is_small, for_small_x, for_large_x);
+  return Select(x_is_small, for_small_x, for_large_x);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitSin(PrimitiveType prim_type,
@@ -1046,7 +1038,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type,
       llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_);
   auto x_is_small = b_->CreateFCmpOLT(
       abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold));
-  return b_->CreateSelect(x_is_small, for_small_x, for_large_x);
+  return Select(x_is_small, for_small_x, for_large_x);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
@@ -1099,6 +1091,95 @@ static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* b,
   return b->CreateSelect(shift_amt_in_range, shift_result, saturated_value);
 }
 
+llvm::Value* ElementalIrEmitter::GetOne(llvm::Type* type) const {
+  return llvm::ConstantInt::get(llvm::cast<llvm::IntegerType>(type), 1);
+}
+
+llvm::Value* ElementalIrEmitter::GetZero(llvm::Type* type) const {
+  return llvm::ConstantInt::get(llvm::cast<llvm::IntegerType>(type), 0);
+}
+
+llvm::Value* ElementalIrEmitter::GetIntSMin(llvm::Type* type) const {
+  auto* integer_type = llvm::cast<llvm::IntegerType>(type);
+  return llvm::ConstantInt::get(integer_type, llvm::APInt::getSignedMinValue(
+                                                  integer_type->getBitWidth()));
+}
+
+llvm::Value* ElementalIrEmitter::GetMinusOne(llvm::Type* type) const {
+  auto* integer_type = llvm::cast<llvm::IntegerType>(type);
+  return llvm::ConstantInt::get(
+      integer_type, llvm::APInt::getAllOnesValue(integer_type->getBitWidth()));
+}
+
+llvm::Value* ElementalIrEmitter::IsZero(llvm::Value* v) const {
+  return b_->CreateICmpEQ(v, llvm::ConstantInt::get(v->getType(), 0));
+}
+
+llvm::Value* ElementalIrEmitter::IsIntMinDivisionOverflow(
+    llvm::Value* lhs, llvm::Value* rhs) const {
+  return b_->CreateAnd(b_->CreateICmpEQ(lhs, GetIntSMin(lhs->getType())),
+                       b_->CreateICmpEQ(rhs, GetMinusOne(rhs->getType())));
+}
+
+llvm::Value* ElementalIrEmitter::Select(llvm::Value* cond, llvm::Value* if_true,
+                                        llvm::Value* if_false) const {
+  return b_->CreateSelect(cond, if_true, if_false);
+}
+
+llvm::Value* ElementalIrEmitter::EmitIntegerDivide(llvm::Value* lhs,
+                                                   llvm::Value* rhs,
+                                                   bool is_signed) const {
+  // Integer division overflow behavior:
+  //
+  // X / 0 == -1
+  // INT_SMIN /s -1 = INT_SMIN
+
+  if (!is_signed) {
+    llvm::Value* udiv_is_unsafe = IsZero(rhs);
+    llvm::Value* safe_rhs = Select(udiv_is_unsafe, GetOne(lhs->getType()), rhs);
+    llvm::Value* safe_div = b_->CreateUDiv(lhs, safe_rhs);
+    return Select(udiv_is_unsafe, GetMinusOne(lhs->getType()), safe_div);
+  }
+
+  llvm::Value* has_zero_divisor = IsZero(rhs);
+  llvm::Value* has_int_min_overflow = IsIntMinDivisionOverflow(lhs, rhs);
+  llvm::Value* sdiv_is_unsafe =
+      b_->CreateOr(has_int_min_overflow, has_zero_divisor);
+  llvm::Value* safe_rhs = Select(sdiv_is_unsafe, GetOne(lhs->getType()), rhs);
+  llvm::Value* safe_div = b_->CreateSDiv(lhs, safe_rhs);
+
+  return Select(
+      has_zero_divisor, GetMinusOne(lhs->getType()),
+      Select(has_int_min_overflow, GetIntSMin(lhs->getType()), safe_div));
+}
+
+llvm::Value* ElementalIrEmitter::EmitIntegerRemainder(llvm::Value* lhs,
+                                                      llvm::Value* rhs,
+                                                      bool is_signed) const {
+  // Integer remainder overflow behavior:
+  //
+  // X % 0 == X
+  // INT_SMIN %s -1 = 0
+
+  if (!is_signed) {
+    llvm::Value* urem_is_unsafe = IsZero(rhs);
+    llvm::Value* safe_rhs = Select(urem_is_unsafe, GetOne(lhs->getType()), rhs);
+    llvm::Value* safe_rem = b_->CreateURem(lhs, safe_rhs);
+    return Select(urem_is_unsafe, lhs, safe_rem);
+  }
+
+  llvm::Value* has_zero_divisor = IsZero(rhs);
+  llvm::Value* has_int_min_overflow = IsIntMinDivisionOverflow(lhs, rhs);
+  llvm::Value* srem_is_unsafe =
+      b_->CreateOr(has_int_min_overflow, has_zero_divisor);
+  llvm::Value* safe_rhs = Select(srem_is_unsafe, GetOne(lhs->getType()), rhs);
+  llvm::Value* safe_rem = b_->CreateSRem(lhs, safe_rhs);
+
+  return Select(
+      has_zero_divisor, lhs,
+      Select(has_int_min_overflow, GetZero(lhs->getType()), safe_rem));
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value,
     bool is_signed) const {
@@ -1111,11 +1192,9 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
     case HloOpcode::kMultiply:
       return b_->CreateMul(lhs_value, rhs_value);
     case HloOpcode::kDivide:
-      return is_signed ? b_->CreateSDiv(lhs_value, rhs_value)
-                       : b_->CreateUDiv(lhs_value, rhs_value);
+      return EmitIntegerDivide(lhs_value, rhs_value, is_signed);
     case HloOpcode::kRemainder:
-      return is_signed ? b_->CreateSRem(lhs_value, rhs_value)
-                       : b_->CreateURem(lhs_value, rhs_value);
+      return EmitIntegerRemainder(lhs_value, rhs_value, is_signed);
     case HloOpcode::kEq:
       return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_EQ, lhs_value,
                                      rhs_value, b_);
@@ -1175,19 +1254,19 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
 llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value,
                                                  llvm::Value* rhs_value,
                                                  bool is_signed) const {
-  return b_->CreateSelect(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE
-                                                   : llvm::ICmpInst::ICMP_UGE,
-                                         lhs_value, rhs_value),
-                          lhs_value, rhs_value);
+  return Select(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE
+                                         : llvm::ICmpInst::ICMP_UGE,
+                               lhs_value, rhs_value),
+                lhs_value, rhs_value);
 }
 
 llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value,
                                                  llvm::Value* rhs_value,
                                                  bool is_signed) const {
-  return b_->CreateSelect(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE
-                                                   : llvm::ICmpInst::ICMP_ULE,
-                                         lhs_value, rhs_value),
-                          lhs_value, rhs_value);
+  return Select(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE
+                                         : llvm::ICmpInst::ICMP_ULE,
+                               lhs_value, rhs_value),
+                lhs_value, rhs_value);
 }
 
 llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
@@ -1505,8 +1584,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalSelect(
   TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value,
                       operand_to_generator.at(hlo->operand(2))(
                           ElementwiseSourceIndex(index, *hlo, 2)));
-  return b_->CreateSelect(b_->CreateTrunc(pred_value, b_->getInt1Ty()),
-                          on_true_value, on_false_value);
+  return Select(b_->CreateTrunc(pred_value, b_->getInt1Ty()), on_true_value,
+                on_false_value);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalClamp(
@@ -1672,22 +1751,21 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
   std::vector<int64> operand_to_output_dim(operand_shape.dimensions_size(), -1);
   for (int64 i = 0, e = operand_shape.dimensions_size(), operand_index_dim = 0;
        i < e; i++) {
-    if (c_binary_search(dim_numbers.elided_window_dims(), i)) {
+    if (absl::c_binary_search(dim_numbers.collapsed_slice_dims(), i)) {
       operand_index.push_back(index.GetConstantWithIndexType(0));
     } else {
-      int64 output_window_dim =
-          dim_numbers.output_window_dims(operand_index_dim++);
+      int64 output_window_dim = dim_numbers.offset_dims(operand_index_dim++);
       operand_to_output_dim[i] = output_window_dim;
       operand_index.push_back(index[output_window_dim]);
     }
   }
 
-  // This is the index of the index vector in the gather_indices tensor.
+  // This is the index of the index vector in the start_indices tensor.
   IrArray::Index gather_index_index(index_type);
   {
     std::vector<llvm::Value*> gather_index_index_components;
     for (int64 i = 0, e = output_shape.dimensions_size(); i < e; i++) {
-      if (!c_binary_search(dim_numbers.output_window_dims(), i)) {
+      if (!absl::c_binary_search(dim_numbers.offset_dims(), i)) {
         gather_index_index.push_back(index[i]);
       }
     }
@@ -1700,7 +1778,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
   auto add_to_operand_index = [&](llvm::Value* index_component, int64 dim) {
     llvm::Value* gather_dim_component_extended =
         b_->CreateSExtOrTrunc(index_component, index_type);
-    int64 operand_dim = dim_numbers.gather_dims_to_operand_dims(dim);
+    int64 operand_dim = dim_numbers.start_index_map(dim);
     int64 output_dim = operand_to_output_dim[operand_dim];
     // If 'output_dim' is -1, it means 'operand_dim' is an elided window dim.
     // This means we set the iteration index to 0, so for the purpose of the
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index 1598a4dd85..c037b98929 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -65,6 +65,21 @@ class ElementalIrEmitter {
   virtual StatusOr<llvm::Value*> EmitComplexUnaryOp(
       const HloInstruction* op, llvm::Value* operand_value) const;
 
+  llvm::Value* IsZero(llvm::Value* v) const;
+  llvm::Value* IsIntMinDivisionOverflow(llvm::Value* lhs,
+                                        llvm::Value* rhs) const;
+  llvm::Value* GetZero(llvm::Type* type) const;
+  llvm::Value* GetOne(llvm::Type* type) const;
+  llvm::Value* GetIntSMin(llvm::Type* type) const;
+  llvm::Value* GetMinusOne(llvm::Type* type) const;
+  llvm::Value* Select(llvm::Value* cond, llvm::Value* if_true,
+                      llvm::Value* if_false) const;
+
+  llvm::Value* EmitIntegerDivide(llvm::Value* lhs, llvm::Value* rhs,
+                                 bool is_signed) const;
+  llvm::Value* EmitIntegerRemainder(llvm::Value* lhs, llvm::Value* rhs,
+                                    bool is_signed) const;
+
   virtual StatusOr<llvm::Value*> EmitIntegerBinaryOp(const HloInstruction* op,
                                                      llvm::Value* lhs_value,
                                                      llvm::Value* rhs_value,
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
index addb016b04..5ab0756219 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using tensorflow::gtl::nullopt;
+using absl::nullopt;
 
 class ElementalIrEmitterExecutionTest : public HloTestBase {
  protected:
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index fd75847d0c..1c9f396b68 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/executable.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/status.h"
@@ -76,8 +77,8 @@ StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
   std::unique_ptr<HloExecutionProfile> profile_ptr =
       module_config().debug_options().xla_hlo_profile() &&
               hlo_profiling_enabled()
-          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer_data(),
-                                            &hlo_profile_index_map())
+          ? absl::make_unique<HloExecutionProfile>(&hlo_profile_printer_data(),
+                                                   &hlo_profile_index_map())
           : nullptr;
 
   StatusOr<ScopedShapedBuffer> return_value =
diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc
index 228c3fac95..70a78c8a2b 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.cc
+++ b/tensorflow/compiler/xla/service/execution_tracker.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <utility>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -53,8 +53,8 @@ ExecutionHandle ExecutionTracker::Register(Backend* backend,
   tensorflow::mutex_lock lock(execution_mutex_);
   int64 handle = next_handle_++;
   auto inserted = handle_to_execution_.emplace(
-      handle,
-      MakeUnique<AsyncExecution>(backend, std::move(streams), profile, result));
+      handle, absl::make_unique<AsyncExecution>(backend, std::move(streams),
+                                                profile, result));
   CHECK(inserted.second);
 
   ExecutionHandle execution_handle;
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.h b/tensorflow/compiler/xla/service/flatten_call_graph.h
index d3efab3614..3cccec9862 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.h
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.h
@@ -28,7 +28,7 @@ namespace xla {
 // points-to analysis (see b/36865746 for details).
 class FlattenCallGraph : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "flatten-call-graph"; }
+  absl::string_view name() const override { return "flatten-call-graph"; }
 
   // Duplicates computations called from multiple call- or while-nodes to
   // flatten the call graph.
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index e3a42d0d06..d889fd8e88 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gather_expander.h"
 #include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
@@ -27,85 +28,85 @@ namespace xla {
 using tensorflow::gtl::ArraySlice;
 
 static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
-    HloInstruction* gather_indices, int64 index_vector_dim) {
-  const Shape& gather_indices_shape = gather_indices->shape();
+    HloInstruction* start_indices, int64 index_vector_dim) {
+  const Shape& start_indices_shape = start_indices->shape();
 
-  if (gather_indices_shape.dimensions_size() == index_vector_dim) {
-    return gather_indices;
+  if (start_indices_shape.dimensions_size() == index_vector_dim) {
+    return start_indices;
   }
 
-  if (index_vector_dim == (gather_indices_shape.dimensions_size() - 1)) {
-    return gather_indices;
+  if (index_vector_dim == (start_indices_shape.dimensions_size() - 1)) {
+    return start_indices;
   }
 
   std::vector<int64> permutation;
-  permutation.reserve(gather_indices_shape.dimensions_size());
-  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+  permutation.reserve(start_indices_shape.dimensions_size());
+  for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) {
     if (i != index_vector_dim) {
       permutation.push_back(i);
     }
   }
   permutation.push_back(index_vector_dim);
-  return MakeTransposeHlo(gather_indices, permutation);
+  return MakeTransposeHlo(start_indices, permutation);
 }
 
-// Canonicalizes the gather_indices tensors so that we only have deal with some
+// Canonicalizes the start_indices tensors so that we only have deal with some
 // specific cases in the while loop that does the heavy lifting.
 //
 // See the "High Level Algorithm" section for a broader picture.
 static StatusOr<HloInstruction*> CanonicalizeGatherIndices(
-    HloInstruction* gather_indices, int64 index_vector_dim) {
+    HloInstruction* start_indices, int64 index_vector_dim) {
   // Transpose the non-index-vector dimensions to the front.
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * transposed_gather_indices,
-      TransposeIndexVectorDimToLast(gather_indices, index_vector_dim));
+      HloInstruction * transposed_start_indices,
+      TransposeIndexVectorDimToLast(start_indices, index_vector_dim));
   bool indices_are_scalar =
-      index_vector_dim == gather_indices->shape().dimensions_size();
+      index_vector_dim == start_indices->shape().dimensions_size();
 
-  // The number of dimensions in gather_indices that are index dimensions.
-  const int64 index_dims_in_gather_indices = indices_are_scalar ? 0 : 1;
+  // The number of dimensions in start_indices that are index dimensions.
+  const int64 index_dims_in_start_indices = indices_are_scalar ? 0 : 1;
 
-  // If there is only one index (i.e. gather_indices has rank 1 and this gather
+  // If there is only one index (i.e. start_indices has rank 1 and this gather
   // is really just a dynamic slice) add a leading degenerate dimension for
   // uniformity.  Otherwise create a "collapsed" leading dimension that subsumes
   // all of the non-index-vector dimensions.
-  const Shape& shape = transposed_gather_indices->shape();
-  if (shape.dimensions_size() == index_dims_in_gather_indices) {
-    return PrependDegenerateDims(transposed_gather_indices, 1);
+  const Shape& shape = transposed_start_indices->shape();
+  if (shape.dimensions_size() == index_dims_in_start_indices) {
+    return PrependDegenerateDims(transposed_start_indices, 1);
   } else {
-    // Collapse all but the dimensions (0 or 1) in gather_indices containing the
+    // Collapse all but the dimensions (0 or 1) in start_indices containing the
     // index vectors.
     return CollapseFirstNDims(
-        transposed_gather_indices,
-        shape.dimensions_size() - index_dims_in_gather_indices);
+        transposed_start_indices,
+        shape.dimensions_size() - index_dims_in_start_indices);
   }
 }
 
 // Expands out or contracts away the gather dimensions in the accumulator
 // produced by the while loop.
-static StatusOr<HloInstruction*> AdjustGatherDimsInAccumulator(
-    const Shape& gather_indices_shape, HloInstruction* accumulator,
+static StatusOr<HloInstruction*> AdjustBatchDimsInAccumulator(
+    const Shape& start_indices_shape, HloInstruction* accumulator,
     int64 index_vector_dim) {
-  std::vector<int64> output_gather_dim_bounds;
-  output_gather_dim_bounds.reserve(gather_indices_shape.dimensions_size());
-  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+  std::vector<int64> batch_dim_bounds;
+  batch_dim_bounds.reserve(start_indices_shape.dimensions_size());
+  for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) {
     if (i != index_vector_dim) {
-      output_gather_dim_bounds.push_back(gather_indices_shape.dimensions(i));
+      batch_dim_bounds.push_back(start_indices_shape.dimensions(i));
     }
   }
 
-  if (output_gather_dim_bounds.empty()) {
-    // If output_gather_dim_bounds is empty we must be lowering a (effectively)
+  if (batch_dim_bounds.empty()) {
+    // If batch_dim_bounds is empty we must be lowering a (effectively)
     // dynamic-slice.  In that case, there is a leading degenerate gather
     // dimension that we added to make this special case play well with the
     // general while loop which we need to remove now.
     return ElideDegenerateDims(accumulator, {0});
   }
 
-  return ExpandFirstDimIntoNDims(accumulator, output_gather_dim_bounds);
+  return ExpandFirstDimIntoNDims(accumulator, batch_dim_bounds);
 }
 
-// Expand an index vector from the gather_indices tensor into a vector that can
+// Expand an index vector from the start_indices tensor into a vector that can
 // be used to dynamic-slice out of the gather operand.
 static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
     HloInstruction* index_vector, const GatherDimensionNumbers& dim_numbers,
@@ -121,10 +122,8 @@ static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
   std::vector<HloInstruction*> expanded_index_components;
 
   for (int i = 0; i < operand_rank; i++) {
-    int64 index_vector_dim_index =
-        FindIndex(dim_numbers.gather_dims_to_operand_dims(), i);
-    if (index_vector_dim_index !=
-        dim_numbers.gather_dims_to_operand_dims_size()) {
+    int64 index_vector_dim_index = FindIndex(dim_numbers.start_index_map(), i);
+    if (index_vector_dim_index != dim_numbers.start_index_map_size()) {
       TF_ASSIGN_OR_RETURN(
           HloInstruction * component_to_concat,
           MakeSliceHlo(index_vector, /*start_indices=*/{index_vector_dim_index},
@@ -147,10 +146,10 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
   const GatherDimensionNumbers& dim_numbers = gather.gather_dimension_numbers();
   CHECK_EQ(incoming_loop_state.size(), 3);
   HloInstruction* const operand = incoming_loop_state[0];
-  HloInstruction* const gather_indices = incoming_loop_state[1];
+  HloInstruction* const start_indices = incoming_loop_state[1];
   HloInstruction* const output_accumulator = incoming_loop_state[2];
 
-  bool has_scalar_indices = gather_indices->shape().dimensions_size() == 1;
+  bool has_scalar_indices = start_indices->shape().dimensions_size() == 1;
   CHECK_EQ(has_scalar_indices,
            dim_numbers.index_vector_dim() ==
                gather.operand(1)->shape().dimensions_size());
@@ -163,24 +162,24 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
   HloInstruction* index_vector;
 
   if (has_scalar_indices) {
-    // In this case gather_indices has rank 1 and induction_var_as_vector (of
+    // In this case start_indices has rank 1 and induction_var_as_vector (of
     // shape {1}) is an index into this rank 1 tensor.
     TF_ASSIGN_OR_RETURN(
         index_vector,
-        MakeDynamicSliceHlo(gather_indices, induction_var_as_vector, {1}));
+        MakeDynamicSliceHlo(start_indices, induction_var_as_vector, {1}));
   } else {
-    // In this case gather_indices has rank 2 and induction_var_as_vector (of
+    // In this case start_indices has rank 2 and induction_var_as_vector (of
     // shape {1}) is an index into just the first dimension of this rank 2
     // tensor.
     TF_ASSIGN_OR_RETURN(
-        HloInstruction * index_into_gather_indices,
+        HloInstruction * index_into_start_indices,
         PadVectorWithZeros(induction_var_as_vector,
                            /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
 
-    int64 index_vector_size = gather_indices->shape().dimensions(1);
+    int64 index_vector_size = start_indices->shape().dimensions(1);
     TF_ASSIGN_OR_RETURN(
         HloInstruction * index_vector_2d,
-        MakeDynamicSliceHlo(gather_indices, index_into_gather_indices,
+        MakeDynamicSliceHlo(start_indices, index_into_start_indices,
                             {1, index_vector_size}));
 
     TF_ASSIGN_OR_RETURN(index_vector,
@@ -194,26 +193,26 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
 
   TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice,
                       MakeDynamicSliceHlo(operand, gathered_slice_start,
-                                          gather.gather_window_bounds()));
+                                          gather.gather_slice_sizes()));
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * gathered_slice_with_dims_elided,
+      HloInstruction* const gathered_slice_with_dims_collapsed,
       ElideDegenerateDims(gathered_slice,
-                          AsInt64Slice(dim_numbers.elided_window_dims())));
+                          AsInt64Slice(dim_numbers.collapsed_slice_dims())));
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * gathered_slice_for_update,
-      PrependDegenerateDims(gathered_slice_with_dims_elided, 1));
+      HloInstruction* const gathered_slice_for_update,
+      PrependDegenerateDims(gathered_slice_with_dims_collapsed, 1));
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * index_vector_into_accumulator,
+      HloInstruction* const index_vector_into_accumulator,
       PadVectorWithZeros(
           induction_var_as_vector, /*zeros_to_prepend=*/0,
           /*zeros_to_append=*/
-          gathered_slice_with_dims_elided->shape().dimensions_size()));
+          gathered_slice_with_dims_collapsed->shape().dimensions_size()));
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * updated_accumulator,
+      HloInstruction* const updated_accumulator,
       MakeDynamicUpdateSliceHlo(output_accumulator, gathered_slice_for_update,
                                 index_vector_into_accumulator));
 
@@ -221,19 +220,19 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
   // WhileUtil::MakeCountedLoop functions takes care of the induction variable
   // and the while loop exit condition.
   return StatusOr<std::vector<HloInstruction*>>{
-      {operand, gather_indices, updated_accumulator}};
+      {operand, start_indices, updated_accumulator}};
 }
 
 static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
     HloComputation* computation, PrimitiveType element_type,
-    ArraySlice<int64> window_bounds, int64 gather_loop_trip_count,
+    ArraySlice<int64> slice_sizes, int64 gather_loop_trip_count,
     const GatherDimensionNumbers& dim_numbers) {
   std::vector<int64> accumulator_state_shape_dims;
-  accumulator_state_shape_dims.reserve(1 + window_bounds.size());
+  accumulator_state_shape_dims.reserve(1 + slice_sizes.size());
   accumulator_state_shape_dims.push_back(gather_loop_trip_count);
-  for (int64 i = 0; i < window_bounds.size(); i++) {
-    if (!c_binary_search(dim_numbers.elided_window_dims(), i)) {
-      accumulator_state_shape_dims.push_back(window_bounds[i]);
+  for (int64 i = 0; i < slice_sizes.size(); i++) {
+    if (!absl::c_binary_search(dim_numbers.collapsed_slice_dims(), i)) {
+      accumulator_state_shape_dims.push_back(slice_sizes[i]);
     }
   }
   return BroadcastZeros(computation, element_type,
@@ -241,23 +240,23 @@ static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
 }
 
 // `accumulator` is almost the tensor the gather operation would have produced,
-// except that it has the dimensions in the wrong order -- the gather dimensions
-// are the major dimensions and the window dimensions are the minor dimensions.
+// except that it has the dimensions in the wrong order -- the batch dimensions
+// are the major dimensions and the offset dimensions are the minor dimensions.
 // Fix this up with a transpose.
-static StatusOr<HloInstruction*> PermuteGatherAndWindowDims(
-    HloInstruction* accumulator, ArraySlice<int64> output_window_dims,
+static StatusOr<HloInstruction*> PermuteBatchAndOffsetDims(
+    HloInstruction* accumulator, ArraySlice<int64> offset_dims,
     int64 output_rank) {
   std::vector<int64> permutation;
   permutation.reserve(output_rank);
 
-  int64 gather_idx_counter = 0;
-  int64 window_idx_counter = output_rank - output_window_dims.size();
+  int64 batch_idx_counter = 0;
+  int64 offset_idx_counter = output_rank - offset_dims.size();
   for (int64 i = 0; i < output_rank; i++) {
-    bool is_window_dim = c_binary_search(output_window_dims, i);
-    if (is_window_dim) {
-      permutation.push_back(window_idx_counter++);
+    bool is_offset_dim = absl::c_binary_search(offset_dims, i);
+    if (is_offset_dim) {
+      permutation.push_back(offset_idx_counter++);
     } else {
-      permutation.push_back(gather_idx_counter++);
+      permutation.push_back(batch_idx_counter++);
     }
   }
 
@@ -268,11 +267,11 @@ static StatusOr<HloInstruction*> PermuteGatherAndWindowDims(
 //
 // We follow the following steps in sequence:
 //
-//  1. We canonicalize the gather_indices tensor such that it has rank
+//  1. We canonicalize the start_indices tensor such that it has rank
 //     2 (i.e. is a matrix) where each row is an index vector into the
 //     operand.
 //  2. We iterate over the set of indices in the canonicalized
-//     gather_indices tensor using a while loop, accumulating slices
+//     start_indices tensor using a while loop, accumulating slices
 //     of the operand tensor into an accumulator using
 //     DynamicUpdateSlice.
 //  3. The accumulator result from the while loop from (2) is then
@@ -287,11 +286,11 @@ static StatusOr<HloInstruction*> PermuteGatherAndWindowDims(
 //     operand = s32[3,3] parameter(0)
 //     indices = s32[2,2] parameter(1)
 //     ROOT gather = s32[2,3,2] gather(operand, indices),
-//         output_window_dims={1},
-//         elided_window_dims={1},
-//         gather_dims_to_operand_dims={1},
+//         offset_dims={1},
+//         collapsed_slice_dims={1},
+//         start_index_map={1},
 //         index_vector_dim=2,
-//         window_bounds={3, 1}
+//         slice_sizes={3, 1}
 //   }
 //
 // We'd first reshape indices to s32[4,1], where each row is an index
@@ -305,8 +304,8 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
 
   HloComputation* computation = gather_instr->parent();
   HloInstruction* operand = gather_instr->mutable_operand(0);
-  HloInstruction* gather_indices = gather_instr->mutable_operand(1);
-  const Shape& gather_indices_shape = gather_indices->shape();
+  HloInstruction* start_indices = gather_instr->mutable_operand(1);
+  const Shape& start_indices_shape = start_indices->shape();
   const Shape& output_shape = gather_instr->shape();
   int64 output_rank = output_shape.dimensions_size();
 
@@ -314,9 +313,9 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
       gather_instr->gather_dimension_numbers();
 
   int64 gather_loop_trip_count = 1;
-  for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) {
+  for (int64 i = 0, e = start_indices_shape.dimensions_size(); i < e; i++) {
     if (i != dim_numbers.index_vector_dim()) {
-      gather_loop_trip_count *= gather_indices_shape.dimensions(i);
+      gather_loop_trip_count *= start_indices_shape.dimensions(i);
     }
   }
 
@@ -327,24 +326,24 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
         gather_instr->ToString().c_str());
   }
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * canonical_gather_indices,
-                      CanonicalizeGatherIndices(
-                          gather_indices, dim_numbers.index_vector_dim()));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * canonical_start_indices,
+      CanonicalizeGatherIndices(start_indices, dim_numbers.index_vector_dim()));
 
   CHECK_EQ(gather_loop_trip_count,
-           canonical_gather_indices->shape().dimensions(0));
+           canonical_start_indices->shape().dimensions(0));
 
   TF_ASSIGN_OR_RETURN(
       HloInstruction * accumulator_init,
       CreateGatherLoopAccumulatorInitValue(
           computation, output_shape.element_type(),
-          gather_instr->gather_window_bounds(), gather_loop_trip_count,
+          gather_instr->gather_slice_sizes(), gather_loop_trip_count,
           gather_instr->gather_dimension_numbers()));
 
   StatusOr<std::vector<HloInstruction*>> gather_loop_result_or_error =
       WhileUtil::MakeCountedLoop(
           computation, gather_loop_trip_count,
-          {operand, canonical_gather_indices, accumulator_init},
+          {operand, canonical_start_indices, accumulator_init},
           [&](HloInstruction* indvar,
               const std::vector<HloInstruction*>& loop_state) {
             return GatherLoopBody(*gather_instr, indvar, loop_state);
@@ -356,13 +355,13 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
   HloInstruction* accumulator_result = gather_loop_result.back();
 
   TF_ASSIGN_OR_RETURN(
-      HloInstruction * accumulator_with_output_gather_dims_decanonicalized,
-      AdjustGatherDimsInAccumulator(gather_indices->shape(), accumulator_result,
-                                    dim_numbers.index_vector_dim()));
+      HloInstruction* const accumulator_with_batch_dims_decanonicalized,
+      AdjustBatchDimsInAccumulator(start_indices->shape(), accumulator_result,
+                                   dim_numbers.index_vector_dim()));
 
-  return PermuteGatherAndWindowDims(
-      accumulator_with_output_gather_dims_decanonicalized,
-      AsInt64Slice(dim_numbers.output_window_dims()), output_rank);
+  return PermuteBatchAndOffsetDims(accumulator_with_batch_dims_decanonicalized,
+                                   AsInt64Slice(dim_numbers.offset_dims()),
+                                   output_rank);
 }
 
 StatusOr<bool> GatherExpander::Run(HloModule* module) {
@@ -375,8 +374,8 @@ StatusOr<bool> GatherExpander::Run(HloModule* module) {
 
   std::vector<HloInstruction*> gather_instrs;
   for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    c_copy_if(computation->instructions(), std::back_inserter(gather_instrs),
-              is_nontrivial_gather);
+    absl::c_copy_if(computation->instructions(),
+                    std::back_inserter(gather_instrs), is_nontrivial_gather);
   }
 
   for (HloInstruction* inst : gather_instrs) {
diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h
index c1fc8574da..7bd9ea5984 100644
--- a/tensorflow/compiler/xla/service/gather_expander.h
+++ b/tensorflow/compiler/xla/service/gather_expander.h
@@ -25,7 +25,7 @@ namespace xla {
 // nevertheless have a minimum level of support.
 class GatherExpander : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "gather_expander"; }
+  absl::string_view name() const override { return "gather_expander"; }
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
index 020ffcd106..141dd4d6f1 100644
--- a/tensorflow/compiler/xla/service/gather_expander_test.cc
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -28,11 +28,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2147483647,5] parameter(1)
   ROOT gather = s32[2147483647,3,5] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -55,11 +55,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index a3f6e8d989..e53f525517 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -1,6 +1,7 @@
 # Description:
 #   GPU-specific components in XLA service implementation.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 
 licenses(["notice"])  # Apache 2.0
@@ -55,6 +56,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -90,6 +92,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_reachability",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -106,6 +109,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -125,6 +129,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
 )
@@ -179,6 +184,11 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@llvm//:core",
         "@llvm//:support",
     ],
@@ -223,6 +233,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
         "//tensorflow/compiler/xla/service/llvm_ir:math_ops",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
         "@llvm//:support",
     ],
@@ -242,6 +253,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -256,6 +268,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -336,6 +349,9 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
         "//tensorflow/stream_executor",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -361,15 +377,19 @@ cc_library(
     hdrs = ["cudnn_convolution_algorithm_picker.h"],
     deps = [
         ":backend_configs",
+        ":buffer_comparator",
         ":cudnn_convolution_runner",
         ":gpu_executable",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -387,6 +407,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -463,6 +484,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:multi_output_fusion",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -480,6 +502,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -510,6 +533,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_cost_analysis",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -541,6 +566,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_creation_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:shape_inference",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -597,6 +623,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/gpu:infeed_manager",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
         "@llvm//:core",
     ],
     alwayslink = True,  # Contains per-platform transfer manager registration
@@ -636,6 +663,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
+        "//tensorflow/compiler/xla/service:convolution_feature_group_converter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -652,6 +680,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
@@ -665,6 +694,9 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@llvm//:core",
     ],
     alwayslink = True,  # Contains compiler registration
@@ -697,8 +729,8 @@ cc_library(
         ":xfeed_queue",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -713,6 +745,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -751,6 +784,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # build_cleaner: keep
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -762,12 +796,12 @@ cc_library(
         ":stream_assignment",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:buffer_value",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_ordering",
         "//tensorflow/compiler/xla/service:hlo_reachability",
         "//tensorflow/compiler/xla/service:hlo_scheduling",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -784,6 +818,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -852,3 +887,34 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
+
+cc_library(
+    name = "buffer_comparator",
+    srcs = ["buffer_comparator.cc"],
+    hdrs = ["buffer_comparator.h"],
+    deps = [
+        ":gpu_executable",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+xla_test(
+    name = "buffer_comparator_test",
+    srcs = ["buffer_comparator_test.cc"],
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    deps = [
+        ":buffer_comparator",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/service:backend",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 537295292b..e208ad61e3 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -40,7 +40,7 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
     const BufferAssignment* buffer_assignment, int device_ordinal,
     DeviceMemoryAllocator* memory_allocator) {
   const int64 num_buffers = buffer_assignment->Allocations().size();
-  auto buffer_allocations = WrapUnique(new BufferAllocations(
+  auto buffer_allocations = absl::WrapUnique(new BufferAllocations(
       num_buffers, device_ordinal, memory_allocator, buffer_assignment));
 
   for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc
new file mode 100644
index 0000000000..f22c2a8add
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc
@@ -0,0 +1,204 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
+
+#include <cmath>
+#include "absl/strings/str_replace.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace xla {
+namespace gpu {
+
+static constexpr float kTolerance = 0.1f;
+
+static string GetCompHloText(size_t num_elements) {
+  // Implements the textual format of the comparison routine, as it's more
+  // readable.
+  static constexpr char kF16CompHloText[] = R"(
+HloModule CompareF16
+
+MaxF32 {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %max = f32[] maximum(%lhs, %rhs)
+}
+
+Canonicalize (aparam: f16[SIZE]) -> f32[SIZE] {
+  %min_constant = f32[] constant(-65505)
+  %max_constant = f32[] constant(65505)
+  %large_constant = f32[] constant(1048576)
+  %min_values = f32[SIZE] broadcast(%min_constant), dimensions={}
+  %max_values = f32[SIZE] broadcast(%max_constant), dimensions={}
+  %large_values = f32[SIZE] broadcast(%large_constant), dimensions={}
+
+  %a = f16[SIZE] parameter(0)
+  %converted = f32[SIZE] convert(%a)
+  %clamped = f32[SIZE] clamp(%min_values, %converted, %max_values)
+
+  // Since the clamp() above already took care of infs, only NaNs will cause
+  // is-finite() to return false.
+  %is_finite = pred[SIZE] is-finite(%clamped)
+  ROOT %result = f32[SIZE] select(%is_finite, %clamped, %large_values)
+}
+
+ENTRY MaxDifference {
+  %one_constant = f32[] constant(1.0)
+  %zero_constant = f32[] constant(0.0)
+
+  %ones = f32[SIZE] broadcast(%one_constant), dimensions={}
+
+  %lhs = f16[SIZE] parameter(0)
+  %rhs = f16[SIZE] parameter(1)
+  %lhs_canonical = f32[SIZE] call(%lhs), to_apply=Canonicalize
+  %rhs_canonical = f32[SIZE] call(%rhs), to_apply=Canonicalize
+  %sub = f32[SIZE] subtract(%lhs_canonical, %rhs_canonical)
+  %sub_abs = f32[SIZE] abs(%sub)
+  %lhs_abs = f32[SIZE] abs(%lhs_canonical)
+  %rhs_abs = f32[SIZE] abs(%rhs_canonical)
+  %max = f32[SIZE] maximum(%lhs_abs, %rhs_abs)
+  %denominator = f32[SIZE] add(%max, %ones)
+  %error = f32[SIZE] divide(%sub_abs, %denominator)
+  ROOT %max_diff = f32[] reduce(%error, %zero_constant), dimensions={0}, to_apply=MaxF32
+})";
+  return absl::StrReplaceAll(kF16CompHloText,
+                             {{"SIZE", absl::StrCat(num_elements)}});
+}
+
+StatusOr<F16BufferComparator> F16BufferComparator::Create(
+    se::DeviceMemory<Eigen::half> ref_buffer, Compiler* compiler,
+    DeviceMemoryAllocator* allocator, se::Stream* stream) {
+  auto stream_exec = stream->parent();
+  int64 num_elements = ref_buffer.ElementCount();
+
+  // One may consider using hlo_runner to do all the compilation and execution.
+  // However, as of the time hlo_runner doesn't support injection for Compiler*,
+  // Stream*, or even the allocator. We may revisit this in the future if it
+  // proves to be a maintenance burden.
+  TF_ASSIGN_OR_RETURN(
+      auto exec, ([&]() -> StatusOr<std::unique_ptr<Executable>> {
+        HloModuleConfig config;
+        DebugOptions debug_options;
+        debug_options.set_xla_backend_optimization_level(2);
+        config.set_debug_options(debug_options);
+        TF_ASSIGN_OR_RETURN(
+            auto module, ParseHloString(GetCompHloText(num_elements), config));
+        TF_ASSIGN_OR_RETURN(
+            module,
+            compiler->RunHloPasses(std::move(module), stream_exec, nullptr));
+        return compiler->RunBackend(std::move(module), stream_exec, nullptr);
+      }()));
+
+  TF_ASSIGN_OR_RETURN(
+      auto shaped_buffer, ([&]() -> StatusOr<ScopedShapedBuffer> {
+        auto device_ordinal = stream_exec->device_ordinal();
+        TF_ASSIGN_OR_RETURN(
+            auto owning_buffer,
+            allocator->Allocate(device_ordinal, ref_buffer.size()));
+        se::DeviceMemory<Eigen::half> buffer(
+            owning_buffer.AsDeviceMemoryBase());
+        stream->ThenMemcpy(&buffer, ref_buffer, ref_buffer.size());
+        Shape shape = ShapeUtil::MakeShape(xla::F16, {num_elements});
+        ScopedShapedBuffer ret(shape, shape, allocator, device_ordinal);
+        ret.set_buffer(std::move(owning_buffer), {});
+        return std::move(ret);
+      }()));
+
+  return F16BufferComparator(stream, allocator, std::move(exec),
+                             std::move(shaped_buffer));
+}
+
+StatusOr<bool> F16BufferComparator::CompareEqualImpl(
+    se::DeviceMemory<Eigen::half> test_buffer) {
+  if (ref_buffer_.root_buffer().size() != test_buffer.size()) {
+    return InternalError("Mismatched buffer size: %lld vs %lld",
+                         ref_buffer_.root_buffer().size(), test_buffer.size());
+  }
+
+  int64 num_elements = test_buffer.ElementCount();
+
+  TF_ASSIGN_OR_RETURN(
+      auto result_buffer, ([&]() -> StatusOr<ScopedShapedBuffer> {
+        auto stream_exec = stream_->parent();
+        Shape shape = ShapeUtil::MakeShape(xla::F16, {num_elements});
+        auto device_ordinal = stream_exec->device_ordinal();
+        ShapedBuffer shaped_test_buffer(shape, shape, stream_exec->platform(),
+                                        device_ordinal);
+        shaped_test_buffer.set_buffer(test_buffer, {});
+        ExecutableRunOptions run_options;
+        run_options.set_device_ordinal(stream_exec->device_ordinal());
+        run_options.set_stream(stream_);
+        run_options.set_allocator(allocator_);
+        ServiceExecutableRunOptions service_run_options(run_options);
+        return exec_->ExecuteOnStream(
+            &service_run_options, {&ref_buffer_, &shaped_test_buffer}, nullptr);
+      }()));
+
+  float result;
+  CHECK(result_buffer.root_buffer().size() == sizeof(result));
+  stream_->ThenMemcpy(&result, result_buffer.root_buffer(), sizeof(result));
+  TF_RETURN_IF_ERROR(stream_->BlockHostUntilDone());
+  return result < kTolerance;
+}
+
+StatusOr<bool> F16BufferComparator::CompareEqual(
+    se::DeviceMemory<Eigen::half> test_buffer) {
+  TF_ASSIGN_OR_RETURN(auto result, CompareEqualImpl(test_buffer));
+  if (result) {
+    return true;
+  }
+  // Host side code that does the same thing, but report some of the
+  // differences as well.
+  int64 n = test_buffer.ElementCount();
+  std::vector<half> host_ref_buffer(n), host_test_buffer(n);
+  stream_->ThenMemcpy(host_ref_buffer.data(), ref_buffer_.root_buffer(),
+                      ref_buffer_.root_buffer().size());
+  stream_->ThenMemcpy(host_test_buffer.data(), test_buffer, test_buffer.size());
+  TF_RETURN_IF_ERROR(stream_->BlockHostUntilDone());
+
+  const auto canonicalize = [](float a) -> float {
+    constexpr float kBigNumer = 1048576.;
+    constexpr float kMaxFp16Value = 65504.;
+    if (std::isnan(a)) {
+      return kBigNumer;
+    }
+    if (std::isinf(a)) {
+      if (a < 0) {
+        return -(kMaxFp16Value + 1);
+      }
+      return kMaxFp16Value + 1;
+    }
+    return a;
+  };
+  int differences_seen = 0;
+  for (int64 i = 0; i < n && differences_seen < 10; i++) {
+    float original_ref = static_cast<float>(host_ref_buffer[i]);
+    float original_test = static_cast<float>(host_test_buffer[i]);
+    float ref = canonicalize(original_ref);
+    float test = canonicalize(original_test);
+    if (!(std::abs(ref - test) / (std::max(std::abs(ref), std::abs(test)) + 1) <
+          kTolerance)) {
+      differences_seen++;
+      LOG(ERROR) << "Difference at " << i << ": " << original_ref << " vs "
+                 << original_test;
+    }
+  }
+
+  return false;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.h b/tensorflow/compiler/xla/service/gpu/buffer_comparator.h
new file mode 100644
index 0000000000..bf2ba78cea
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.h
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_
+
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// A fp16 comparator that internally keeps a reference buffer, and compares it
+// against other test buffers.
+class F16BufferComparator {
+ public:
+  F16BufferComparator(const F16BufferComparator&) = delete;
+  F16BufferComparator(F16BufferComparator&&) = default;
+
+  // Creates a new comparator. It internally allocates a buffer initialized by
+  // ref_buffer.
+  static StatusOr<F16BufferComparator> Create(
+      se::DeviceMemory<Eigen::half> ref_buffer, Compiler* compiler,
+      DeviceMemoryAllocator* allocator, se::Stream* stream);
+
+  // Returns true if the internally allocated buffer "compares equal" to
+  // test_buffer. The definition of "equal" is:
+  // * All NaNs equal.
+  // * All infs are treated as 65505 or -65505, so that this checker is tolerant
+  //   to fp16 overflows.
+  // * With NaNs and infs taken care of, a and b compare equal iff:
+  //     abs(a - b) / (max(abs(a), abs(b)) + 1) < tolerance
+  //
+  // See the implementation for the tolerance value.
+  StatusOr<bool> CompareEqual(se::DeviceMemory<Eigen::half> test_buffer);
+
+ private:
+  F16BufferComparator(se::Stream* stream, DeviceMemoryAllocator* allocator,
+                      std::unique_ptr<Executable> exec,
+                      ScopedShapedBuffer ref_buffer)
+      : stream_(stream),
+        allocator_(allocator),
+        exec_(std::move(exec)),
+        ref_buffer_(std::move(ref_buffer)) {}
+
+  StatusOr<bool> CompareEqualImpl(se::DeviceMemory<Eigen::half> test_buffer);
+
+  se::Stream* stream_;
+  DeviceMemoryAllocator* allocator_;
+  std::unique_ptr<Executable> exec_;
+  ScopedShapedBuffer ref_buffer_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_BUFFER_COMPARATOR_H_
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc
new file mode 100644
index 0000000000..33761d1bd8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator_test.cc
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
+
+#include <limits>
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class BufferComparatorTest : public testing::Test {
+ protected:
+  BufferComparatorTest()
+      : backend_(Backend::CreateDefaultBackend().ConsumeValueOrDie()),
+        stream_exec_(backend_->default_stream_executor()),
+        allocator_(stream_exec_->platform(), {stream_exec_}),
+        compiler_(Compiler::GetForPlatform(stream_exec_->platform())
+                      .ConsumeValueOrDie()) {}
+
+  // Take floats only for convenience. Still uses half internally.
+  bool CompareEqualFloatBuffers(const std::vector<float>& lhs_float,
+                                const std::vector<float>& rhs_float) {
+    std::vector<half> lhs(lhs_float.begin(), lhs_float.end());
+    std::vector<half> rhs(rhs_float.begin(), rhs_float.end());
+    se::Stream stream(stream_exec_);
+    stream.Init();
+
+    auto owning_lhs_buffer =
+        allocator_
+            .Allocate(stream_exec_->device_ordinal(), lhs.size() * sizeof(half))
+            .ConsumeValueOrDie();
+
+    auto owning_rhs_buffer =
+        allocator_
+            .Allocate(stream_exec_->device_ordinal(), rhs.size() * sizeof(half))
+            .ConsumeValueOrDie();
+
+    auto lhs_buffer =
+        se::DeviceMemory<Eigen::half>(owning_lhs_buffer.AsDeviceMemoryBase());
+    auto rhs_buffer =
+        se::DeviceMemory<Eigen::half>(owning_rhs_buffer.AsDeviceMemoryBase());
+
+    stream.ThenMemcpy(&lhs_buffer, lhs.data(), lhs_buffer.size());
+    stream.ThenMemcpy(&rhs_buffer, rhs.data(), rhs_buffer.size());
+
+    TF_CHECK_OK(stream.BlockHostUntilDone());
+
+    return F16BufferComparator::Create(lhs_buffer, compiler_, &allocator_,
+                                       &stream)
+        .ConsumeValueOrDie()
+        .CompareEqual(rhs_buffer)
+        .ConsumeValueOrDie();
+  }
+
+  std::unique_ptr<Backend> backend_;
+  se::StreamExecutor* stream_exec_;
+  StreamExecutorMemoryAllocator allocator_;
+  Compiler* compiler_;
+};
+
+TEST_F(BufferComparatorTest, TestNaNs) {
+  EXPECT_TRUE(CompareEqualFloatBuffers({std::nanf("")}, {std::nanf("")}));
+  // NaN values with different bit patterns should compare equal.
+  EXPECT_TRUE(CompareEqualFloatBuffers({std::nanf("")}, {std::nanf("1234")}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({std::nanf("")}, {1.}));
+}
+
+TEST_F(BufferComparatorTest, TestInfs) {
+  const auto inf = std::numeric_limits<float>::infinity();
+  EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {std::nanf("")}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({inf}, {inf}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({inf}, {65504}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({-inf}, {-65504}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {-65504}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {65504}));
+
+  EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {20}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({inf}, {-20}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {20}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({-inf}, {-20}));
+}
+
+TEST_F(BufferComparatorTest, TestNumbers) {
+  EXPECT_TRUE(CompareEqualFloatBuffers({20}, {20.1}));
+  EXPECT_FALSE(CompareEqualFloatBuffers({0}, {1}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({0.9}, {1}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({9}, {10}));
+  EXPECT_TRUE(CompareEqualFloatBuffers({10}, {9}));
+}
+
+TEST_F(BufferComparatorTest, TestMultiple) {
+  EXPECT_TRUE(CompareEqualFloatBuffers({20, 30, 40, 50, 60},
+                                       {20.1, 30.1, 40.1, 50.1, 60.1}));
+  std::vector<float> lhs(200);
+  std::vector<float> rhs(200);
+  for (int i = 0; i < 200; i++) {
+    EXPECT_TRUE(CompareEqualFloatBuffers(lhs, rhs))
+        << "should be the same at index " << i;
+    lhs[i] = 3;
+    rhs[i] = 5;
+    EXPECT_FALSE(CompareEqualFloatBuffers(lhs, rhs))
+        << "should be the different at index " << i;
+    lhs[i] = 0;
+    rhs[i] = 0;
+  }
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
index 5780e0af40..8b0426aa27 100644
--- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 7833a4077e..854a2f50b2 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index d76ca6698d..f7952787c1 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_THUNK_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_THUNK_H_
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h
index e09cde9abf..6e2e330edd 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h
@@ -54,9 +54,7 @@ namespace gpu {
 // BatchNormRewriter.
 class CudnnBatchNormRewriter : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override {
-    return "cudnn_batchnorm_rewriter";
-  }
+  absl::string_view name() const override { return "cudnn_batchnorm_rewriter"; }
   StatusOr<bool> Run(HloModule* module) override;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
index 7b172812c3..18a76e8c26 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 7348307ec8..3d421ebb69 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -14,24 +14,24 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
+#include "absl/strings/str_cat.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
+using absl::optional;
 using se::DeviceMemoryBase;
 using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
-using tensorflow::gtl::nullopt;
-using tensorflow::gtl::optional;
 
 class ScratchAllocator : public se::ScratchAllocator {
  public:
@@ -128,14 +128,14 @@ std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
 
 string AlgorithmToString(const AlgorithmDesc& algo) {
   if (algo.tensor_ops_enabled()) {
-    return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
+    return absl::StrCat(algo.algo_id(), "+TC");
   }
-  return tensorflow::strings::StrCat(algo.algo_id());
+  return absl::StrCat(algo.algo_id());
 }
 
 string NumBytesToString(int64 bytes) {
-  return tensorflow::strings::StrCat(
-      tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)");
+  return absl::StrCat(tensorflow::strings::HumanReadableNumBytes(bytes), " (",
+                      bytes, "B)");
 }
 
 // Acquires a process-global lock on the device pointed to by the given
@@ -173,11 +173,17 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
 // cache misses and doing extra work.  Overall, caching doesn't seem worth the
 // trouble, but we may want to revisit this if we ever find a model where
 // caching would speed up compilation a lot.
-optional<std::tuple<int64, bool, int64>>
+StatusOr<std::tuple<int64, bool, int64>>
 CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
     const Shape& output_shape, const Window& window,
     const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) {
+  CHECK_EQ(input_shape.element_type(), filter_shape.element_type());
+  CHECK_EQ(input_shape.element_type(), output_shape.element_type());
+  // TODO(timshen): for now only check fp16. It can be expanded to other types,
+  // with some work on the HLO routines.
+  const bool cross_check_enabled = input_shape.element_type() == xla::F16;
+
   // Don't run this function concurrently on the same GPU.
   //
   // This is a bit of a hack and doesn't protect us against arbitrary concurrent
@@ -206,51 +212,75 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
   // Allocate space for the input, filter, and output of the convolution.  We
   // use a ScratchAllocator for this instead of calling allocator_ directly so
   // that our allocations don't leak.
-  //
-  // We don't put any data in these buffers, because (in theory, anyway) the
-  // speed of a conv isn't affected by the data being convolved.
   ScratchAllocator input_output_allocator(device_ordinal, allocator);
-  StatusOr<DeviceMemoryBase> maybe_input_buf =
-      input_output_allocator.AllocateBytes(&stream,
-                                           ShapeUtil::ByteSizeOf(input_shape));
-  StatusOr<DeviceMemoryBase> maybe_filter_buf =
-      input_output_allocator.AllocateBytes(&stream,
-                                           ShapeUtil::ByteSizeOf(filter_shape));
-  StatusOr<DeviceMemoryBase> maybe_output_buf =
-      input_output_allocator.AllocateBytes(&stream,
-                                           ShapeUtil::ByteSizeOf(output_shape));
-  if (!maybe_input_buf.ok() || !maybe_filter_buf.ok() ||
-      !maybe_output_buf.ok()) {
-    LOG(WARNING)
-        << "Couldn't allocate space for input/filter/output of convolution "
-        << instr->ToString() << ".  Falling back to default algorithm.";
-    return nullopt;
-  }
-
-  DeviceMemoryBase input_buf = maybe_input_buf.ValueOrDie();
-  DeviceMemoryBase filter_buf = maybe_filter_buf.ValueOrDie();
-  DeviceMemoryBase output_buf = maybe_output_buf.ValueOrDie();
-
-  // Although we don't have evidence this matters, zero out the buffers before
-  // autotuning.  It's conceivable that using uninitialized memory as the inputs
-  // might affect performance if e.g. the inputs contain denormals, and this is
-  // easy enough.
-  if (!stream.ThenMemZero(&input_buf, input_buf.size())
-           .ThenMemZero(&filter_buf, filter_buf.size())
-           .ThenMemZero(&output_buf, output_buf.size())
-           .BlockHostUntilDone()
-           .ok()) {
-    LOG(WARNING)
-        << "Couldn't zero out input/filter/output buffer for convolution "
-        << instr->ToString() << ".  Falling back to default algorithm.";
-    return nullopt;
+  TF_ASSIGN_OR_RETURN(DeviceMemoryBase input_buf,
+                      input_output_allocator.AllocateBytes(
+                          &stream, ShapeUtil::ByteSizeOf(input_shape)));
+  TF_ASSIGN_OR_RETURN(DeviceMemoryBase filter_buf,
+                      input_output_allocator.AllocateBytes(
+                          &stream, ShapeUtil::ByteSizeOf(filter_shape)));
+  TF_ASSIGN_OR_RETURN(DeviceMemoryBase output_buf,
+                      input_output_allocator.AllocateBytes(
+                          &stream, ShapeUtil::ByteSizeOf(output_shape)));
+
+  if (cross_check_enabled) {
+    // Broadcast a constant to the buffer, instead of zeroing the buffer. A
+    // non-zero constant is useful for the cross checking, because zero-inputs
+    // may not always reveal the bugs.
+    const auto initialize_f16 = [&stream](DeviceMemoryBase buffer) {
+      CHECK_EQ(0, (uintptr_t)buffer.opaque() % 4);
+      size_t left_over_bytes = buffer.size() % 4;
+      CHECK_EQ(0, left_over_bytes % 2);
+
+      constexpr float kBroadcastedConstant = 0.1f;
+      Eigen::half halfs[2] = {Eigen::half(kBroadcastedConstant),
+                              Eigen::half(kBroadcastedConstant)};
+      uint32 bits;
+      static_assert(sizeof(bits) == sizeof(halfs), "");
+      memcpy(&bits, halfs, sizeof(bits));
+
+      size_t aligned_size = buffer.size() / 4 * 4;
+      stream.ThenMemset32(&buffer, bits, aligned_size);
+
+      DeviceMemoryBase left_over(
+          static_cast<char*>(buffer.opaque()) + aligned_size, left_over_bytes);
+      stream.ThenMemcpy(&left_over, halfs, left_over_bytes);
+    };
+    initialize_f16(input_buf);
+    initialize_f16(filter_buf);
+    initialize_f16(output_buf);
+  } else {
+    // Although we don't have evidence this matters, zero out the buffers before
+    // autotuning.  It's conceivable that using uninitialized memory as the
+    // inputs might affect performance if e.g. the inputs contain denormals, and
+    // this is easy enough.
+    stream.ThenMemZero(&input_buf, input_buf.size())
+        .ThenMemZero(&filter_buf, filter_buf.size())
+        .ThenMemZero(&output_buf, output_buf.size());
   }
+  TF_RETURN_IF_ERROR(stream.BlockHostUntilDone());
+
+  DeviceMemoryBase* result_buf = [&] {
+    switch (kind) {
+      case CudnnConvKind::kBackwardFilter:
+        return &filter_buf;
+      case CudnnConvKind::kBackwardInput:
+        return &input_buf;
+      case CudnnConvKind::kForward:
+        return &output_buf;
+    }
+  }();
 
   const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
       input_shape, output_shape, dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
   int64 best_result_bytes_used = 0;
 
+  optional<F16BufferComparator> comparator;
+  // Use the first algorithm that's supported as reference. There isn't a
+  // particular reason to use it, as any algorithm sufficies. It doesn't make
+  // this algorithm considered correct, though.
+  optional<AlgorithmDesc> first_algorithm;
   for (const AlgorithmDesc& alg :
        GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) {
     ScratchAllocator scratch_allocator(device_ordinal, allocator);
@@ -266,6 +296,42 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
             .ok();
 
     if (launch_ok && profile_result.is_valid()) {
+      const bool crash_on_checking_failure =
+          instr->GetModule()
+              ->config()
+              .debug_options()
+              .xla_gpu_crash_on_verification_failures();
+      if (comparator.has_value()) {
+        StatusOr<bool> result = comparator->CompareEqual(
+            se::DeviceMemory<Eigen::half>(*result_buf));
+        if (!result.ok()) {
+          LOG(ERROR) << "Unable to compare "
+                     << AlgorithmToString(*first_algorithm) << " against "
+                     << AlgorithmToString(alg) << " for " << instr->ToString()
+                     << ": " << result.status();
+          CHECK(!crash_on_checking_failure);
+        } else if (!result.ValueOrDie()) {
+          LOG(ERROR) << "Results mismatch between different convolution "
+                        "algorithms. This is likely a bug in convolution, or "
+                        "an excessive loss of precision in convolution. "
+                     << instr->ToString() << " for "
+                     << AlgorithmToString(*first_algorithm) << " vs "
+                     << AlgorithmToString(alg);
+          CHECK(!crash_on_checking_failure);
+        }
+      } else if (cross_check_enabled) {
+        auto comp = F16BufferComparator::Create(
+            se::DeviceMemory<Eigen::half>(*result_buf), compiler_, allocator,
+            &stream);
+        if (comp.ok()) {
+          comparator.emplace(comp.ConsumeValueOrDie());
+          first_algorithm.emplace(alg);
+        } else {
+          LOG(ERROR) << "Fail to initialize buffer comparator: "
+                     << comp.status() << ", instruction: " << instr->ToString();
+          CHECK(!crash_on_checking_failure);
+        }
+      }
       int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
       VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
               << " succeeded, taking " << profile_result.elapsed_time_in_ms()
@@ -292,9 +358,10 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
                            best_result_bytes_used);
   }
 
-  LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString()
-               << " failed.  Falling back to default algorithm.";
-  return nullopt;
+  return InternalError(
+      "All algorithms tried for convolution %s failed.  Falling back to "
+      "default algorithm.",
+      instr->ToString().c_str());
 }
 
 StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
@@ -305,12 +372,13 @@ StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
   const auto& lhs_shape = instr->operand(0)->shape();
   const auto& rhs_shape = instr->operand(1)->shape();
   const auto& conv_result_shape = instr->shape().tuple_shapes(0);
-  optional<std::tuple<int64, bool, int64>> alg_scratch_and_tc;
+  StatusOr<std::tuple<int64, bool, int64>> alg_scratch_and_tc;
   if (call_target == kCudnnConvForwardCallTarget) {
-    alg_scratch_and_tc = PickBestAlgorithm(
-        CudnnConvKind::kForward, /*input_shape=*/lhs_shape,
-        /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape,
-        instr->window(), instr->convolution_dimension_numbers(), instr);
+    alg_scratch_and_tc =
+        PickBestAlgorithm(CudnnConvKind::kForward, /*input_shape=*/lhs_shape,
+                          /*filter_shape=*/rhs_shape,
+                          /*output_shape=*/conv_result_shape, instr->window(),
+                          instr->convolution_dimension_numbers(), instr);
   } else if (call_target == kCudnnConvBackwardInputCallTarget) {
     alg_scratch_and_tc = PickBestAlgorithm(
         CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape,
@@ -326,7 +394,8 @@ StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
                << instr->ToString();
   }
 
-  if (!alg_scratch_and_tc.has_value()) {
+  if (!alg_scratch_and_tc.ok()) {
+    LOG(ERROR) << alg_scratch_and_tc.status();
     return false;
   }
 
@@ -334,7 +403,8 @@ StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
   bool tensor_ops_enabled;
   int64 scratch_bytes;
 
-  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) = *alg_scratch_and_tc;
+  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) =
+      alg_scratch_and_tc.ConsumeValueOrDie();
 
   VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
           << NumBytesToString(scratch_bytes)
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
index bc5d1ce94a..f76d273e8c 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -16,11 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
 
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
@@ -34,10 +35,11 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
   // memory while timing the various convolution algorithms.  If it's null,
   // we'll use the default allocator on the StreamExecutor.
   CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
-                                  DeviceMemoryAllocator* allocator)
-      : stream_exec_(stream_exec), allocator_(allocator) {}
+                                  DeviceMemoryAllocator* allocator,
+                                  Compiler* compiler)
+      : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {}
 
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "cudnn-convolution-algorithm-picker";
   }
 
@@ -46,13 +48,14 @@ class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
  private:
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  tensorflow::gtl::optional<std::tuple<int64, bool, int64>> PickBestAlgorithm(
+  StatusOr<std::tuple<int64, bool, int64>> PickBestAlgorithm(
       CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
       const Shape& output_shape, const Window& window,
       const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);
 
   se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
+  Compiler* compiler_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
index 0c0578d888..fbe7e98494 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
@@ -26,7 +26,7 @@ namespace gpu {
 // backwards-input convolutions into CustomCall HLOs that call into cuDNN.
 class CudnnConvolutionRewriter : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "cudnn-convolution-rewriter";
   }
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
index 0645fbb3ad..68086c86e9 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -56,7 +57,7 @@ class ScratchBufAllocator : public se::ScratchAllocator {
           "Can't allocate twice from a ScratchBufAllocator.");
     }
     if (byte_size > scratch_.size()) {
-      return se::port::InternalError(tensorflow::strings::StrCat(
+      return se::port::InternalError(absl::StrCat(
           "Can't allocate ", byte_size,
           " bytes from a ScratchBufAllocator of size ", scratch_.size()));
     }
@@ -96,15 +97,9 @@ Status RunCudnnConvolution(
   // tensorflow/python/ops/nn_ops.py).
   const int effective_num_dimensions = std::max(2, num_dimensions);
 
-  if (std::is_same<T, float>::value) {
-    CHECK_EQ(F32, output_shape.element_type())
-        << ShapeUtil::HumanString(output_shape);
-  } else if (std::is_same<T, Eigen::half>::value) {
-    CHECK_EQ(F16, output_shape.element_type())
-        << ShapeUtil::HumanString(output_shape);
-  } else {
-    LOG(FATAL) << ShapeUtil::HumanString(output_shape);
-  }
+  CHECK_EQ(primitive_util::NativeToPrimitiveType<T>(),
+           output_shape.element_type())
+      << ShapeUtil::HumanString(output_shape);
 
   CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size());
   CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size());
@@ -246,21 +241,31 @@ Status RunCudnnConvolution(
     se::dnn::AlgorithmConfig algorithm, se::Stream* stream,
     se::dnn::ProfileResult* profile_result) {
   PrimitiveType output_primitive_type = output_shape.element_type();
-  CHECK(output_primitive_type == F32 || output_primitive_type == F16)
-      << ShapeUtil::HumanString(output_shape);
-  if (output_primitive_type == F32) {
-    return RunCudnnConvolution(
-        kind, input_shape, filter_shape, output_shape,
-        se::DeviceMemory<float>(input_buf), se::DeviceMemory<float>(filter_buf),
-        se::DeviceMemory<float>(output_buf), scratch_allocator, window, dnums,
-        algorithm, stream, profile_result);
+  switch (output_primitive_type) {
+    case F16:
+      return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                                 se::DeviceMemory<Eigen::half>(input_buf),
+                                 se::DeviceMemory<Eigen::half>(filter_buf),
+                                 se::DeviceMemory<Eigen::half>(output_buf),
+                                 scratch_allocator, window, dnums, algorithm,
+                                 stream, profile_result);
+    case F32:
+      return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                                 se::DeviceMemory<float>(input_buf),
+                                 se::DeviceMemory<float>(filter_buf),
+                                 se::DeviceMemory<float>(output_buf),
+                                 scratch_allocator, window, dnums, algorithm,
+                                 stream, profile_result);
+    case F64:
+      return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                                 se::DeviceMemory<double>(input_buf),
+                                 se::DeviceMemory<double>(filter_buf),
+                                 se::DeviceMemory<double>(output_buf),
+                                 scratch_allocator, window, dnums, algorithm,
+                                 stream, profile_result);
+    default:
+      LOG(FATAL) << ShapeUtil::HumanString(output_shape);
   }
-  return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
-                             se::DeviceMemory<Eigen::half>(input_buf),
-                             se::DeviceMemory<Eigen::half>(filter_buf),
-                             se::DeviceMemory<Eigen::half>(output_buf),
-                             scratch_allocator, window, dnums, algorithm,
-                             stream, profile_result);
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 9b6de115ad..2460d951bd 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 // IWYU pragma: no_include "llvm/IR/Attributes.gen.inc"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Instructions.h"
@@ -43,16 +45,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace gpu {
 
+using absl::StrAppend;
 using llvm_ir::IrArray;
 using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
-using tensorflow::strings::StrAppend;
 
 namespace {
 // Returns whether operand is a floating-point literal with the given value.
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
index 0cdddf8bcf..def595d217 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
@@ -17,10 +17,10 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
index 8c53be5077..4adec7ee54 100644
--- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FFT_THUNK_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FFT_THUNK_H_
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
index 2fd2206324..88f0b4d71c 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/for_thunk.h"
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -28,7 +28,7 @@ ForThunk::ForThunk(const int64 loop_limit,
                    const HloInstruction* hlo)
     : Thunk(Kind::kWhile, hlo),
       loop_limit_(loop_limit),
-      body_thunk_sequence_(MakeUnique<SequentialThunk>(
+      body_thunk_sequence_(absl::make_unique<SequentialThunk>(
           // Pass nullptr as the HloInstruction* to the body_thunk_sequence_
           // constructor because this SequentialThunk is logically "part of"
           // this ForThunk, and shouldn't be profiled separately from it.
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 3cd30b754c..1bd88233e1 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -18,12 +18,13 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace gpu {
@@ -64,10 +65,11 @@ double CalculateBytesReadByFusionParameter(HloInstruction* param) {
   // Slice for a more accurate estimate of bytes read.
   double bytes = 0.0;
   for (auto& instruction : instructions) {
-    if (c_all_of(instruction->users(), [](const HloInstruction* instruction) {
-          return instruction->opcode() == HloOpcode::kSlice ||
-                 instruction->opcode() == HloOpcode::kDynamicSlice;
-        })) {
+    if (absl::c_all_of(
+            instruction->users(), [](const HloInstruction* instruction) {
+              return instruction->opcode() == HloOpcode::kSlice ||
+                     instruction->opcode() == HloOpcode::kDynamicSlice;
+            })) {
       // All users are slice: accumulate bytes of all user slice instructions.
       for (auto& user : instruction->users()) {
         bytes += ShapeUtil::ByteSizeOf(user->shape());
@@ -223,7 +225,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   // Skip 'fusion' instruction if we cannot merge into all of its users.
   // Merging into all users enables the removal of 'fusion' from the
   // computation.
-  if (!c_all_of(fusion->users(), [](const HloInstruction* user) {
+  if (!absl::c_all_of(fusion->users(), [](const HloInstruction* user) {
         return user->opcode() == HloOpcode::kFusion &&
                (user->fusion_kind() == HloInstruction::FusionKind::kLoop ||
                 user->fusion_kind() == HloInstruction::FusionKind::kInput);
@@ -241,11 +243,11 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   // If 'fusion' has just one user, then an earlier fusion pass chose not to
   // fuse this producer/comsumer pair (likely because of expensive instruction
   // re-use by the consumer), and so we honor that choice here as well.
-  if (c_any_of(fusion->fused_instructions(),
-               [](const HloInstruction* instruction) {
-                 return instruction->opcode() != HloOpcode::kParameter &&
-                        GpuInstructionFusion::IsExpensive(*instruction);
-               })) {
+  if (absl::c_any_of(fusion->fused_instructions(),
+                     [](const HloInstruction* instruction) {
+                       return instruction->opcode() != HloOpcode::kParameter &&
+                              GpuInstructionFusion::IsExpensive(*instruction);
+                     })) {
     VLOG(3) << "Not merging " << fusion->name()
             << ": Contains one or more expensive instructions.";
     ++num_fail_expensive_fused_instruction_;
@@ -287,11 +289,10 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
           << " flops_to_bytes_ratio: " << CalculateFlopsToBytesRatio(fusion)
           << " merged_to_current_bytes_ratio: " << merged_to_current_bytes_ratio
           << " into users { "
-          << tensorflow::str_util::Join(users, ", ",
-                                        [](string* out, HloInstruction* user) {
-                                          tensorflow::strings::StrAppend(
-                                              out, user->name());
-                                        })
+          << absl::StrJoin(users, ", ",
+                           [](string* out, HloInstruction* user) {
+                             absl::StrAppend(out, user->name());
+                           })
           << " }";
   // Remove 'fusion' instruction.
   CHECK_EQ(0, fusion->user_count());
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.h b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
index 4c523a66de..7e3f5775b8 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.h
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
@@ -34,7 +34,7 @@ namespace gpu {
 //
 class FusionMerger : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "fusion merger"; }
+  absl::string_view name() const override { return "fusion merger"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 74282c568c..2c02ec2584 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include <functional>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
index 0c6f9b511f..8ffae18fe8 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
@@ -27,7 +27,7 @@ namespace gpu {
 // inserting kCopy instructions.
 class GpuCopyInsertion : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "copy-insertion"; }
+  absl::string_view name() const override { return "copy-insertion"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 7060837904..88be63e267 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -112,7 +112,7 @@ Status GpuExecutable::ExecuteThunks(
     //
     // TODO(jlebar): Should we cache the results of HloInstruction::ToString(),
     // since we expect it to be an expensive call?
-    tensorflow::gtl::optional<ScopedAnnotation> op_annotation;
+    absl::optional<ScopedAnnotation> op_annotation;
     if (top_level_annotation.IsEnabled()) {
       op_annotation.emplace(
           thunk->hlo_instruction() != nullptr
@@ -144,7 +144,7 @@ Status GpuExecutable::ExecuteThunks(
     TF_RETURN_IF_ERROR(
         thunk->ExecuteOnStream(buffer_allocations, stream, &profiler));
     if (thunk_schedule_->Depended(thunk)) {
-      auto finish_event = MakeUnique<se::Event>(main_stream->parent());
+      auto finish_event = absl::make_unique<se::Event>(main_stream->parent());
       finish_event->Init();
       stream->ThenRecordEvent(finish_event.get());
       thunk_to_finish_event[thunk] = std::move(finish_event);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index c7ce6d0acb..627a05e240 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
@@ -32,10 +34,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h
index d63e213d2b..bbb3340760 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h
@@ -28,9 +28,7 @@ class GpuHloSupportChecker : public HloPassInterface {
   GpuHloSupportChecker() = default;
   ~GpuHloSupportChecker() override = default;
 
-  tensorflow::StringPiece name() const override {
-    return "gpu_hlo_support_checker";
-  }
+  absl::string_view name() const override { return "gpu_hlo_support_checker"; }
 
   // Note: always returns false (no instructions are ever modified by this
   // pass).
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 286547ebae..fbc8ddf599 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@@ -119,7 +120,7 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) {
 
   for (const Shape& input_shape : AllLayoutsOf(shape)) {
     for (const Shape& result_shape : AllLayoutsOf(shape)) {
-      SCOPED_TRACE(tensorflow::strings::StrCat(
+      SCOPED_TRACE(absl::StrCat(
           "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape),
           ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape)));
 
@@ -192,7 +193,7 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) {
   // Enumerate all combinations of shapes.
   for (const Shape& input_shape : AllLayoutsOf(shape)) {
     for (const Shape& result_shape : AllLayoutsOf(shape)) {
-      SCOPED_TRACE(tensorflow::strings::StrCat(
+      SCOPED_TRACE(absl::StrCat(
           "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape),
           ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape)));
 
@@ -265,7 +266,7 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) {
   for (const Shape& input_shape : AllLayoutsOf(shape)) {
     for (const Shape& result_shape : AllLayoutsOf(shape)) {
       for (int constrained_param_no : {0, 4}) {
-        SCOPED_TRACE(tensorflow::strings::StrCat(
+        SCOPED_TRACE(absl::StrCat(
             "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape),
             ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape)));
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index a2f53f8446..44303724bb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "llvm/IR/DataLayout.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -160,9 +161,10 @@ Status GpuTransferManager::TransferLiteralFromOutfeed(
         if (ShapeUtil::IsTuple(shape)) {
           return;
         }
-        *buffer = MakeUnique<gpu::OutfeedBuffer>(GetByteSizeRequirement(shape));
+        *buffer = absl::make_unique<gpu::OutfeedBuffer>(
+            GetByteSizeRequirement(shape));
         (*buffer)->set_destination(
-            MakeUnique<MutableBorrowingLiteral>(literal, index));
+            absl::make_unique<MutableBorrowingLiteral>(literal, index));
       });
 
   // Give the tree of buffers to the outfeed mananger. The device will fill it
@@ -179,7 +181,7 @@ Status GpuTransferManager::TransferLiteralFromOutfeed(
 }  // namespace xla
 
 static std::unique_ptr<xla::TransferManager> CreateNVPTXTransferManager() {
-  return xla::MakeUnique<xla::gpu::GpuTransferManager>(
+  return absl::make_unique<xla::gpu::GpuTransferManager>(
       /*id=*/stream_executor::cuda::kCudaPlatformId,
       /*pointer_size=*/llvm::DataLayout(xla::gpu::NVPTXCompiler::kDataLayout)
           .getPointerSize(0 /* default address space */));
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
index 7929042869..fa88816bc8 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_
 
 #include <vector>
 
@@ -61,4 +61,4 @@ class GpuTransferManager : public GenericTransferManager {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRANSFER_MANAGER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TRANSFER_MANAGER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
index 1722676930..b9c21e8edb 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -33,7 +34,7 @@ namespace gpu {
 namespace {
 void InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>>* timers,
                        se::Stream* stream) {
-  timers->push(MakeUnique<se::Timer>(stream->parent()));
+  timers->push(absl::make_unique<se::Timer>(stream->parent()));
   stream->InitTimer(timers->top().get()).ThenStartTimer(timers->top().get());
 }
 
@@ -115,7 +116,7 @@ HloExecutionProfiler::MakeScopedInstructionProfiler(
     CHECK(hlo_instructions_.insert(hlo_instruction).second)
         << hlo_instruction->name();
   }
-  return MakeUnique<ScopedInstructionProfiler>(this, hlo_instruction);
+  return absl::make_unique<ScopedInstructionProfiler>(this, hlo_instruction);
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc
index 19de37b0fb..76055ff009 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h"
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
@@ -59,8 +59,8 @@ GpuHloOrdering::GpuHloOrdering(
     : PredecessorHloOrdering(module) {
   // The entry computation has a total order when there's only one stream.
   if (stream_assignment.StreamCount() == 1) {
-    entry_sequence_ =
-        MakeUnique<std::vector<const HloInstruction*>>(thunk_launch_order);
+    entry_sequence_ = absl::make_unique<std::vector<const HloInstruction*>>(
+        thunk_launch_order);
   }
 
   // The ordering of instructions for the entry computation is determined by the
@@ -75,7 +75,7 @@ GpuHloOrdering::GpuHloOrdering(
   // same-stream predecessors of each instruction.
 
   // Compute the set of all instructions we will want to set reachability on.
-  auto predecessor_map = MakeUnique<HloReachabilityMap>(
+  auto predecessor_map = absl::make_unique<HloReachabilityMap>(
       module->entry_computation()->MakeInstructionPostOrder());
 
   // The most recently visited instruction per stream.
@@ -208,7 +208,7 @@ StatusOr<std::unique_ptr<HloSchedule>> HloSchedule::Build(
     BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_);
   }
 
-  schedule->hlo_ordering_ = MakeUnique<GpuHloOrdering>(
+  schedule->hlo_ordering_ = absl::make_unique<GpuHloOrdering>(
       &module, stream_assignment, schedule->thunk_launch_order_);
 
   return std::move(schedule);
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
index 45f0a1c645..d4a96cd5b3 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <unordered_set>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -47,7 +48,7 @@ class HloScheduleTest : public HloTestBase {
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
     config.set_debug_options(debug_options);
-    return MakeUnique<HloModule>("test_module", config);
+    return absl::make_unique<HloModule>("test_module", config);
   }
 
   HloVec RemoveHlo(const HloVec& input,
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 8c11cd0541..0e205b9c02 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
 
+#include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
@@ -24,16 +25,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace gpu {
 
-using tensorflow::strings::StrAppend;
-using tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 void HloToIrBindings::EmitBasePointersForHlos(
     tensorflow::gtl::ArraySlice<const HloInstruction*> io_hlos,
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
index c5f0cdf6cd..a4364b0deb 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 
 namespace xla {
 namespace gpu {
@@ -24,7 +24,7 @@ se::Stream* InfeedManager::GetStream(se::StreamExecutor* executor) {
   tensorflow::mutex_lock l(host_to_device_stream_mu_);
   if (host_to_device_executor_ == nullptr) {
     host_to_device_executor_ = executor;
-    host_to_device_stream_ = MakeUnique<se::Stream>(executor);
+    host_to_device_stream_ = absl::make_unique<se::Stream>(executor);
     host_to_device_stream_->Init();
   }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index c349063c71..f544bcc919 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -215,7 +215,7 @@ bool IsReductionToVector(const HloInstruction& reduce) {
 // This emits a device-side call to
 // "i32 vprintf(i8* fmt, arguments_type* arguments)" in the driver; see
 // http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
-llvm::Value* EmitPrintf(tensorflow::StringPiece fmt,
+llvm::Value* EmitPrintf(absl::string_view fmt,
                         tensorflow::gtl::ArraySlice<llvm::Value*> arguments,
                         llvm::IRBuilder<>* builder) {
   std::vector<llvm::Type*> argument_types;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index 5d23a3d018..a35e250101 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -126,7 +126,7 @@ bool ImplementedAsLibraryCall(const HloInstruction& hlo);
 bool IsReductionToVector(const HloInstruction& reduce);
 
 // Emits call to "vprintf" with given format and arguments.
-llvm::Value* EmitPrintf(tensorflow::StringPiece fmt,
+llvm::Value* EmitPrintf(absl::string_view fmt,
                         tensorflow::gtl::ArraySlice<llvm::Value*> arguments,
                         llvm::IRBuilder<>* builder);
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 6675dbd3f9..7111b53944 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "absl/algorithm/container.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
@@ -518,7 +519,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   // We don't have to iterate over the batch dimensions in both arrays, simplify
   // the loop nest of the rhs.
   for (int i = 0; i != dnums.lhs_batch_dimensions_size(); ++i) {
-    DCHECK(c_linear_search(dnums.lhs_batch_dimensions(), i));
+    DCHECK(absl::c_linear_search(dnums.lhs_batch_dimensions(), i));
     rhs_index[i] = lhs_index[i];
   }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 561c683879..76e069fc41 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
@@ -40,7 +41,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 1e81cbde35..84043689bd 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -21,6 +21,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
 
+#include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/types/optional.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
@@ -29,7 +34,6 @@ limitations under the License.
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
@@ -77,7 +81,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -85,13 +88,13 @@ namespace gpu {
 
 namespace {
 
+using absl::InlinedVector;
+using absl::nullopt;
+using absl::optional;
+using absl::StrCat;
 using llvm_ir::IrArray;
 using llvm_ir::IrName;
 using tensorflow::gtl::ArraySlice;
-using tensorflow::gtl::InlinedVector;
-using tensorflow::gtl::nullopt;
-using tensorflow::gtl::optional;
-using tensorflow::strings::StrCat;
 
 // If a dimensions is smaller than this, untiled transposition may be more
 // efficient.
@@ -314,13 +317,13 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
   };
 
   // Check the size of input tensors
-  if (!c_all_of(unnested_hlo->operands(), hlo_shape_in_range)) {
+  if (!absl::c_all_of(unnested_hlo->operands(), hlo_shape_in_range)) {
     return i64_ty;
   }
 
   // Check the size of the internal result tensors
   if (unnested_hlo->opcode() == HloOpcode::kFusion) {
-    if (!c_all_of(
+    if (!absl::c_all_of(
             unnested_hlo->fused_instructions_computation()->instructions(),
             hlo_shape_in_range)) {
       return i64_ty;
@@ -383,7 +386,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     int64 feature_index_value = feature_index->literal().Get<int64>({});
 
     thunk_sequence_->emplace_back(
-        MakeUnique<CudnnBatchNormForwardInferenceThunk>(
+        absl::make_unique<CudnnBatchNormForwardInferenceThunk>(
             /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
             /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
             /*offset=*/GetAllocationSlice(*custom_call->operand(2)),
@@ -413,7 +416,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     auto output_mean = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
     auto output_inv_stddev = assn.GetUniqueSlice(custom_call, {2}).ValueOrDie();
     thunk_sequence_->emplace_back(
-        MakeUnique<CudnnBatchNormForwardTrainingThunk>(
+        absl::make_unique<CudnnBatchNormForwardTrainingThunk>(
             /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
             /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
             /*offset=*/GetAllocationSlice(*custom_call->operand(2)),
@@ -443,19 +446,20 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     auto output_grad_scale = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
     auto output_grad_offset =
         assn.GetUniqueSlice(custom_call, {2}).ValueOrDie();
-    thunk_sequence_->emplace_back(MakeUnique<CudnnBatchNormBackwardThunk>(
-        /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
-        /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
-        /*mean=*/GetAllocationSlice(*custom_call->operand(2)),
-        /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)),
-        /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)),
-        /*epsilon=*/epsilon_value,
-        /*feature_index=*/feature_index_value,
-        /*output_grad_data=*/output_grad_data,
-        /*output_grad_scale=*/output_grad_scale,
-        /*output_grad_offset=*/output_grad_offset,
-        /*output_tuple=*/GetAllocationSlice(*custom_call),
-        /*hlo=*/custom_call));
+    thunk_sequence_->emplace_back(
+        absl::make_unique<CudnnBatchNormBackwardThunk>(
+            /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
+            /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
+            /*mean=*/GetAllocationSlice(*custom_call->operand(2)),
+            /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)),
+            /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)),
+            /*epsilon=*/epsilon_value,
+            /*feature_index=*/feature_index_value,
+            /*output_grad_data=*/output_grad_data,
+            /*output_grad_scale=*/output_grad_scale,
+            /*output_grad_offset=*/output_grad_offset,
+            /*output_tuple=*/GetAllocationSlice(*custom_call),
+            /*hlo=*/custom_call));
     return Status::OK();
   }
 
@@ -475,7 +479,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     const auto& target = custom_call->custom_call_target();
     std::unique_ptr<ConvolutionThunk> thunk;
     if (target == kCudnnConvForwardCallTarget) {
-      thunk = MakeUnique<ConvolutionThunk>(
+      thunk = absl::make_unique<ConvolutionThunk>(
           CudnnConvKind::kForward,
           /*input_buffer=*/lhs_slice,
           /*filter_buffer=*/rhs_slice,
@@ -489,7 +493,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
           backend_config.algorithm(), backend_config.tensor_ops_enabled(),
           custom_call);
     } else if (target == kCudnnConvBackwardInputCallTarget) {
-      thunk = MakeUnique<ConvolutionThunk>(
+      thunk = absl::make_unique<ConvolutionThunk>(
           CudnnConvKind::kBackwardInput,
           /*input_buffer=*/conv_result_slice,
           /*filter_buffer=*/rhs_slice,
@@ -503,7 +507,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
           backend_config.algorithm(), backend_config.tensor_ops_enabled(),
           custom_call);
     } else if (target == kCudnnConvBackwardFilterCallTarget) {
-      thunk = MakeUnique<ConvolutionThunk>(
+      thunk = absl::make_unique<ConvolutionThunk>(
           CudnnConvKind::kBackwardFilter,
           /*input_buffer=*/lhs_slice,
           /*filter_buffer=*/conv_result_slice,
@@ -576,7 +580,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
         thunks.push_back(
             BuildKernelThunk(fusion, /*implements_whole_instruction=*/false));
         thunk_sequence_->emplace_back(
-            MakeUnique<SequentialThunk>(std::move(thunks), fusion));
+            absl::make_unique<SequentialThunk>(std::move(thunks), fusion));
         std::vector<IrArray> parameter_arrays;
         for (HloInstruction* operand : fusion->operands()) {
           parameter_arrays.push_back(GetIrArray(*operand, *fusion));
@@ -798,8 +802,7 @@ Status IrEmitterUnnested::EmitReductionToScalar(
   // //     RoundUpToNextMultipleOf(Ceil(num_elems / kTileSize), warpSize),
   // //
   // // and threads_per_block is a multiple of warpSize.
-  // reduce_kernel<<<num_blocks, threads_per_block>>>();
-  //
+  // reduce_kernel  //
   auto loop_body_emitter = [=](const IrArray::Index& tile_index) -> Status {
     const int num_reduces = reducers.size();
     llvm::Type* element_ir_type =
@@ -1718,7 +1721,7 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
     thunks.push_back(
         BuildKernelThunk(reduce, /*implements_whole_instruction=*/false));
     thunk_sequence_->emplace_back(
-        MakeUnique<SequentialThunk>(std::move(thunks), reduce));
+        absl::make_unique<SequentialThunk>(std::move(thunks), reduce));
 
     return EmitReductionToVector(
         reduce, input->shape(), {[&](const IrArray::Index& index) {
@@ -1738,7 +1741,7 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
 
 Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
   bool all_tuple_elements_have_buffer =
-      c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) {
+      absl::c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) {
         return ir_emitter_context_->buffer_assignment()
             .GetUniqueTopLevelSlice(tuple_element)
             .ok();
@@ -1760,7 +1763,7 @@ Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
     for (const HloInstruction* tuple_element : tuple->operands()) {
       tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element));
     }
-    thunk_sequence_->emplace_back(MakeUnique<TupleThunk>(
+    thunk_sequence_->emplace_back(absl::make_unique<TupleThunk>(
         tuple_element_buffers, GetAllocationSlice(*tuple), tuple));
     return Status::OK();
   }
@@ -1792,8 +1795,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   thunks.push_back(std::move(initializer_thunk));
   thunks.push_back(BuildKernelThunk(select_and_scatter,
                                     /*implements_whole_instruction=*/false));
-  thunk_sequence_->emplace_back(
-      MakeUnique<SequentialThunk>(std::move(thunks), select_and_scatter));
+  thunk_sequence_->emplace_back(absl::make_unique<SequentialThunk>(
+      std::move(thunks), select_and_scatter));
 
   // TODO(b/31410564): Implement dilation rate for select-and-scatter.
   if (window_util::HasDilation(window)) {
@@ -2018,7 +2021,7 @@ Status IrEmitterUnnested::HandleRng(HloInstruction* rng) {
   thunks.push_back(std::move(rng_thunk));
   thunks.push_back(std::move(increment_seed_thunk));
   thunk_sequence_->emplace_back(
-      MakeUnique<SequentialThunk>(std::move(thunks), rng));
+      absl::make_unique<SequentialThunk>(std::move(thunks), rng));
 
   return Status::OK();
 }
@@ -2043,7 +2046,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   auto values_destination = GetAllocationSlice(*sort, values_shape_index);
 
   if (keys_destination != GetAllocationSlice(*keys)) {
-    thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
         /*source_address=*/GetAllocationSlice(*keys),
         /*destination_buffer=*/keys_destination,
         /*mem_size=*/ShapeUtil::ByteSizeOf(keys->shape()), nullptr));
@@ -2051,7 +2054,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   if (values != nullptr && values_destination != GetAllocationSlice(*values)) {
     // TODO(b/26783907): Figure out why we never seem to share buffers for
     // key/value sort.
-    thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
         /*source_address=*/GetAllocationSlice(*values),
         /*destination_buffer=*/values_destination,
         /*mem_size=*/ShapeUtil::ByteSizeOf(values->shape()), nullptr));
@@ -2095,15 +2098,15 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
 
       TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
           dimension_to_sort, GetIrArray(*sort, *sort, keys_shape_index),
-          values != nullptr ? tensorflow::gtl::make_optional<IrArray>(
+          values != nullptr ? absl::make_optional<IrArray>(
                                   GetIrArray(*sort, *sort, values_shape_index))
-                            : tensorflow::gtl::nullopt,
+                            : absl::nullopt,
           IrName(sort), xor_mask, &b_, &launch_dimensions));
     }
   }
 
   thunk_sequence_->emplace_back(
-      MakeUnique<SequentialThunk>(std::move(thunks), sort));
+      absl::make_unique<SequentialThunk>(std::move(thunks), sort));
   return Status::OK();
 }
 
@@ -2130,7 +2133,7 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) {
   if (crs->operand_count() == 1) {
     CHECK(ShapeUtil::IsArray(crs->operand(0)->shape()))
         << "Operands to cross-replica-sum must be arrays: " << crs->ToString();
-    thunk_sequence_->push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+    thunk_sequence_->push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
         /*source_address=*/GetAllocationSlice(*crs->operand(0)),
         /*destination_buffer=*/GetAllocationSlice(*crs),
         /*mem_size=*/ShapeUtil::ByteSizeOf(crs->shape()), crs));
@@ -2145,17 +2148,17 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) {
     tuple_element_buffers.push_back(ir_emitter_context_->buffer_assignment()
                                         .GetUniqueSlice(crs, {i})
                                         .ValueOrDie());
-    thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
         /*source_address=*/GetAllocationSlice(*crs->operand(i)),
         /*destination_buffer=*/tuple_element_buffers.back(),
         /*mem_size=*/ShapeUtil::ByteSizeOf(crs->operand(i)->shape()), nullptr));
   }
 
   // Output a tuple of the buffers above.
-  thunks.push_back(MakeUnique<TupleThunk>(tuple_element_buffers,
-                                          GetAllocationSlice(*crs), nullptr));
+  thunks.push_back(absl::make_unique<TupleThunk>(
+      tuple_element_buffers, GetAllocationSlice(*crs), nullptr));
   thunk_sequence_->push_back(
-      MakeUnique<SequentialThunk>(std::move(thunks), crs));
+      absl::make_unique<SequentialThunk>(std::move(thunks), crs));
   return Status::OK();
 }
 
@@ -2305,7 +2308,7 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
   for (const auto& kv : hlo_slices) {
     buffers_needed.insert(kv.second.first.allocation());
   }
-  tensorflow::gtl::optional<const BufferAllocation*> temp_buffer;
+  absl::optional<const BufferAllocation*> temp_buffer;
   for (const BufferAllocation& alloc : buffer_assn.Allocations()) {
     if (alloc.IsPreallocatedTempBuffer()) {
       if (!temp_buffer.has_value()) {
@@ -2322,10 +2325,10 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
   // We'll pass a pointer to each of the elements of `buffers` to our kernel, in
   // this order.
   std::vector<const BufferAllocation*> non_constant_buffers;
-  c_copy_if(buffers_needed, std::back_inserter(non_constant_buffers),
-            [](const BufferAllocation* allocation) {
-              return !allocation->is_constant();
-            });
+  absl::c_copy_if(buffers_needed, std::back_inserter(non_constant_buffers),
+                  [](const BufferAllocation* allocation) {
+                    return !allocation->is_constant();
+                  });
 
   std::sort(non_constant_buffers.begin(), non_constant_buffers.end(),
             [](const BufferAllocation* a, const BufferAllocation* b) {
@@ -2389,7 +2392,7 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
         llvm::ConstantPointerNull::get(b_.getInt8PtrTy()));
   }
 
-  return MakeUnique<KernelThunk>(
+  return absl::make_unique<KernelThunk>(
       non_constant_buffers, llvm_ir::AsString(kernel->getName()),
       implements_whole_instruction ? inst : nullptr, unroll_factor);
 }
@@ -2398,7 +2401,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
     const HloInstruction* inst) {
   const HloInstruction* operand = inst->operand(0);
   CHECK_EQ(HloOpcode::kConstant, operand->opcode());
-  return MakeUnique<HostToDeviceCopyThunk>(
+  return absl::make_unique<HostToDeviceCopyThunk>(
       /*source_address=*/operand->literal().untyped_data(),
       /*destination_buffer=*/GetAllocationSlice(*inst),
       /*mem_size=*/
@@ -2410,7 +2413,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildDeviceToDeviceCopyThunk(
     const HloInstruction* inst) {
   const HloInstruction* operand = inst->operand(0);
-  return MakeUnique<DeviceToDeviceCopyThunk>(
+  return absl::make_unique<DeviceToDeviceCopyThunk>(
       /*source_address=*/GetAllocationSlice(*operand),
       /*destination_buffer=*/GetAllocationSlice(*inst),
       /*mem_size=*/
@@ -2430,7 +2433,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildInfeedThunk(
                      .GetUniqueSlice(inst, index)
                      .ConsumeValueOrDie();
       });
-  return MakeUnique<InfeedThunk>(slices, inst);
+  return absl::make_unique<InfeedThunk>(slices, inst);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildOutfeedThunk(
@@ -2447,7 +2450,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildOutfeedThunk(
           *slice = status_or_slice.ConsumeValueOrDie();
         }
       });
-  return MakeUnique<OutfeedThunk>(std::move(slices), inst);
+  return absl::make_unique<OutfeedThunk>(std::move(slices), inst);
 }
 
 namespace {
@@ -2470,7 +2473,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
   if (inst->opcode() == HloOpcode::kDot) {
     const HloInstruction* lhs = inst->operand(0);
     const HloInstruction* rhs = inst->operand(1);
-    return MakeUnique<GemmThunk>(
+    return absl::make_unique<GemmThunk>(
         GetAllocationSlice(*lhs),   // The buffer assigned to LHS.
         GetAllocationSlice(*rhs),   // The buffer assigned to RHS.
         GetAllocationSlice(*inst),  // The output buffer.
@@ -2512,7 +2515,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
     const HloInstruction* rhs =
         inst->operand(rhs_parameter->parameter_number());
 
-    return MakeUnique<GemmThunk>(
+    return absl::make_unique<GemmThunk>(
         GetAllocationSlice(*lhs),   // The buffer assigned to LHS.
         GetAllocationSlice(*rhs),   // The buffer assigned to RHS.
         GetAllocationSlice(*inst),  // The output buffer.
@@ -2529,11 +2532,12 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildFftThunk(
     const HloInstruction* inst) {
   const HloInstruction* operand = inst->operand(0);
-  return MakeUnique<FftThunk>(inst->fft_type(), inst->fft_length(),
-                              /*input_buffer=*/GetAllocationSlice(*operand),
-                              /*output_buffer=*/GetAllocationSlice(*inst),
-                              /*input_shape=*/operand->shape(),
-                              /*output_shape=*/inst->shape(), inst);
+  return absl::make_unique<FftThunk>(
+      inst->fft_type(), inst->fft_length(),
+      /*input_buffer=*/GetAllocationSlice(*operand),
+      /*output_buffer=*/GetAllocationSlice(*inst),
+      /*input_shape=*/operand->shape(),
+      /*output_shape=*/inst->shape(), inst);
 }
 
 StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
@@ -2582,9 +2586,9 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
     // MemzeroThunk.
     ArraySlice<uint8> literal_bytes(
         reinterpret_cast<const uint8*>(literal.untyped_data()), num_bytes);
-    if (c_all_of(literal_bytes, [](uint8 byte) { return byte == 0; })) {
-      return {
-          MakeUnique<MemzeroThunk>(GetAllocationSlice(*hlo, index), nullptr)};
+    if (absl::c_all_of(literal_bytes, [](uint8 byte) { return byte == 0; })) {
+      return {absl::make_unique<MemzeroThunk>(GetAllocationSlice(*hlo, index),
+                                              nullptr)};
     }
 
     // If the literal is 8 or 16 bits wide, we can emit a 32-bit memset by
@@ -2601,7 +2605,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
         memcpy(&pattern16, literal_bytes.data(), sizeof(pattern16));
       }
       uint32 pattern32 = uint32{pattern16} | (uint32{pattern16} << 16);
-      return {MakeUnique<Memset32BitValueThunk>(
+      return {absl::make_unique<Memset32BitValueThunk>(
           pattern32, GetAllocationSlice(*hlo, index), nullptr)};
     }
 
@@ -2612,7 +2616,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
                literal_bytes.size() - 4) == 0) {
       uint32 word;
       memcpy(&word, literal_bytes.data(), sizeof(word));
-      return {MakeUnique<Memset32BitValueThunk>(
+      return {absl::make_unique<Memset32BitValueThunk>(
           word, GetAllocationSlice(*hlo, index), nullptr)};
     }
   }
@@ -2764,7 +2768,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildWhileThunk(
                                     ir_emitter_context_);
   TF_CHECK_OK(body->Accept(&ir_emitter_body));
 
-  return MakeUnique<WhileThunk>(
+  return absl::make_unique<WhileThunk>(
       GetAllocationSlice(*condition->root_instruction()),  // cond result
       ir_emitter_condition.ConsumeThunkSequence(),
       ir_emitter_body.ConsumeThunkSequence(), hlo);
@@ -2782,8 +2786,8 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildForThunk(
                                     ir_emitter_context_);
   TF_CHECK_OK(body->Accept(&ir_emitter_body));
 
-  return MakeUnique<ForThunk>(loop_limit,
-                              ir_emitter_body.ConsumeThunkSequence(), hlo);
+  return absl::make_unique<ForThunk>(
+      loop_limit, ir_emitter_body.ConsumeThunkSequence(), hlo);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
@@ -2803,7 +2807,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
                                      ir_emitter_context_);
   TF_CHECK_OK(false_computation->Accept(&ir_emitter_false));
 
-  return MakeUnique<ConditionalThunk>(
+  return absl::make_unique<ConditionalThunk>(
       GetAllocationSlice(*hlo->operand(0)),
       GetAllocationSlice(*hlo->operand(1)),
       GetAllocationSlice(*hlo->operand(2)),
@@ -3105,7 +3109,7 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
         CeilOfRatio<int64>(output_dims_in_tiles[i], kTileSize);
   }
   const int64 num_tiles =
-      c_accumulate(output_dims_in_tiles, 1, std::multiplies<int64>());
+      absl::c_accumulate(output_dims_in_tiles, 1, std::multiplies<int64>());
   LaunchDimensions launch_dimensions(num_tiles, kThreadsPerTile);
 
   llvm::Type* index_ty =
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index e76823ad10..d856299889 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -41,8 +41,8 @@ Status KernelThunk::Initialize(const GpuExecutable& executable,
   tensorflow::mutex_lock lock(mutex_);
   if (!loader_spec_) {
     loader_spec_.reset(new se::MultiKernelLoaderSpec(args_.size()));
-    tensorflow::StringPiece ptx = executable.ptx();
-    // Convert tensorflow::StringPiece to se::port::StringPiece because
+    absl::string_view ptx = executable.ptx();
+    // Convert absl::string_view to se::port::StringPiece because
     // StreamExecutor uses the latter.
     loader_spec_->AddCudaPtxInMemory(
         se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
@@ -95,7 +95,7 @@ Status KernelThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
   VLOG(3) << "Launching " << kernel->name();
   // Launch the kernel with potentially multiple blocks and threads.
   static constexpr int kKernelArgsLimit = 1024;
-  auto kernel_args = MakeUnique<se::KernelArgsArray<kKernelArgsLimit>>();
+  auto kernel_args = absl::make_unique<se::KernelArgsArray<kKernelArgsLimit>>();
   for (const BufferAllocation* arg : args_) {
     const auto& buf = buffer_allocations.GetDeviceAddress(arg->index());
     kernel_args->add_device_memory_argument(buf);
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
index eb93efc560..ccf082c4c6 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -34,6 +34,8 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "@llvm//:amdgpu_code_gen",
         "@llvm//:analysis",
         "@llvm//:bit_reader",
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc
index 12a8a59488..a3c74507dd 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
 
+#include "absl/strings/string_view.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
@@ -86,7 +86,7 @@ void IrDumpingPassManager::run(llvm::Module &module) {
       const llvm::PassInfo *PI =
           llvm::PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
       const string basename = ReplaceFilenameExtension(
-          tensorflow::io::Basename(input_filename_),
+          absl::string_view(tensorflow::io::Basename(input_filename_)),
           tensorflow::strings::Printf(
               "pass-%02d.before.%s.ll", i,
               (PI == nullptr ? "unknown" : PI->getPassArgument().data())));
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index ff4ae1f9ef..e18d7e764a 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -20,13 +20,15 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSet.h"
@@ -54,9 +56,7 @@ limitations under the License.
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Scalar.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -107,8 +107,7 @@ static string GetLibdeviceFilename(const string& libdevice_dir_path,
                  << ", " << compute_capability.second << ") ."
                  << "Defaulting to libdevice for compute_" << libdevice_version;
   }
-  return tensorflow::strings::StrCat("libdevice.compute_", libdevice_version,
-                                     ".10.bc");
+  return absl::StrCat("libdevice.compute_", libdevice_version, ".10.bc");
 }
 
 // Gets the GPU name as it's known to LLVM for a given compute capability.  If
@@ -138,15 +137,16 @@ static string GetSmName(std::pair<int, int> compute_capability) {
                  << "Defaulting to telling LLVM that we're compiling for sm_"
                  << sm_version;
   }
-  return tensorflow::strings::StrCat("sm_", sm_version);
+  return absl::StrCat("sm_", sm_version);
 }
 
 // Convenience function for producing a name of a temporary compilation product
 // from the input filename.
 string MakeNameForTempProduct(const std::string& input_filename,
-                              tensorflow::StringPiece extension) {
-  return ReplaceFilenameExtension(
-      tensorflow::io::Basename(llvm_ir::AsString(input_filename)), extension);
+                              absl::string_view extension) {
+  return ReplaceFilenameExtension(absl::string_view(tensorflow::io::Basename(
+                                      llvm_ir::AsString(input_filename))),
+                                  extension);
 }
 
 // Initializes LLVM passes. Uses the PassRegistry mechanism.
@@ -167,7 +167,7 @@ void InitializePasses(llvm::PassRegistry* pass_registry) {
 
 // Returns the TargetMachine, given a triple.
 std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
-    llvm::Triple triple, tensorflow::StringPiece cpu_name,
+    llvm::Triple triple, absl::string_view cpu_name,
     const HloModuleConfig& hlo_module_config) {
   std::string error;
   const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error);
@@ -205,7 +205,7 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
     default:
       codegen_opt_level = CodeGenOpt::None;
   }
-  return WrapUnique(target->createTargetMachine(
+  return absl::WrapUnique(target->createTargetMachine(
       triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options,
       Optional<Reloc::Model>(RelocModel), Optional<CodeModel::Model>(CMModel),
       codegen_opt_level));
@@ -243,9 +243,9 @@ void AddOptimizationPasses(unsigned opt_level, unsigned size_level,
 }
 
 // Emits the given module to a bit code file.
-void EmitBitcodeToFile(const Module& module, tensorflow::StringPiece filename) {
+void EmitBitcodeToFile(const Module& module, absl::string_view filename) {
   std::error_code error_code;
-  llvm::ToolOutputFile outfile(filename.ToString().c_str(), error_code,
+  llvm::ToolOutputFile outfile(string(filename).c_str(), error_code,
                                llvm::sys::fs::F_None);
   if (error_code) {
     LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
@@ -266,8 +266,9 @@ string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
     // get creative to add a suffix.
     string module_id(llvm_ir::AsString(module->getModuleIdentifier()));
     IrDumpingPassManager codegen_passes(
-        ReplaceFilenameExtension(tensorflow::io::Basename(module_id),
-                                 "-nvptx.dummy"),
+        ReplaceFilenameExtension(
+            absl::string_view(tensorflow::io::Basename(module_id)),
+            "-nvptx.dummy"),
         "", false);
     codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
         llvm::Triple(module->getTargetTriple())));
@@ -332,8 +333,8 @@ Status LinkLibdeviceIfNecessary(llvm::Module* module,
               return !GV.hasName() || (GVS.count(GV.getName()) == 0);
             });
           })) {
-    return tensorflow::errors::Internal(tensorflow::strings::StrCat(
-        "Error linking libdevice from ", libdevice_path));
+    return tensorflow::errors::Internal(
+        absl::StrCat("Error linking libdevice from ", libdevice_path));
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h
index 54e0e140de..9654175bfa 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h
@@ -20,11 +20,11 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/strings/string_view.h"
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace xla {
 namespace gpu {
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc
index 9ef9bc3a50..3b2c3591d9 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc
@@ -17,13 +17,13 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/SourceMgr.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace {
 
@@ -52,14 +52,13 @@ std::unique_ptr<llvm::Module> LoadIRModule(const string& filename,
   return module;
 }
 
-string ReplaceFilenameExtension(tensorflow::StringPiece filename,
-                                tensorflow::StringPiece new_extension) {
+string ReplaceFilenameExtension(absl::string_view filename,
+                                absl::string_view new_extension) {
   auto pos = filename.rfind('.');
-  tensorflow::StringPiece stem =
-      pos == tensorflow::StringPiece::npos
-          ? filename
-          : tensorflow::StringPiece(filename.data(), pos);
-  return tensorflow::strings::StrCat(stem, ".", new_extension);
+  absl::string_view stem = pos == absl::string_view::npos
+                               ? filename
+                               : absl::string_view(filename.data(), pos);
+  return absl::StrCat(stem, ".", new_extension);
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h
index a6daeca95a..60f4926849 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <memory>
 #include <string>
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace llvm {
 class LLVMContext;
@@ -41,8 +41,8 @@ std::unique_ptr<llvm::Module> LoadIRModule(const string& filename,
 //
 // For example:
 //   ReplaceFilenameExtension("/foo/baz.txt", "cc") --> "/foo/baz.cc"
-string ReplaceFilenameExtension(tensorflow::StringPiece filename,
-                                tensorflow::StringPiece new_extension);
+string ReplaceFilenameExtension(absl::string_view filename,
+                                absl::string_view new_extension);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index c62bae0628..9fb6f569ae 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@@ -48,7 +49,7 @@ bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
         // If possible, we want to pick a reduce operand of the fusion root,
         // because it has the most constraints.
         for (const auto* inst : fused_expression_root->operands()) {
-          if (inst->opcode() == HloOpcode::kReduce) {
+          if (IsReductionToVector(*inst)) {
             return inst;
           }
         }
@@ -63,7 +64,7 @@ bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
   auto get_element_shape = [&](const HloInstruction* element_instr) {
     // Special handling of kReduce instructions -- the fusion
     // applies to the first operand.
-    if (element_instr->opcode() == HloOpcode::kReduce) {
+    if (IsReductionToVector(*element_instr)) {
       return element_instr->operand(0)->shape();
     }
     return element_instr->shape();
@@ -131,7 +132,7 @@ bool ReduceFriendlyInputLayouts(HloInstruction* instr) {
       max_rank_layout = &param->shape().layout();
     }
   }
-  return c_all_of(params, [&](HloInstruction* param) {
+  return absl::c_all_of(params, [&](HloInstruction* param) {
     return (ShapeUtil::Rank(param->shape()) < max_rank) ||
            (LayoutUtil::Equal(param->shape().layout(), *max_rank_layout));
   });
@@ -140,10 +141,15 @@ bool ReduceFriendlyInputLayouts(HloInstruction* instr) {
 }  // namespace
 
 bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) {
-  // We can fuse reduces and loop fusions.
-  return IsInputFusibleReduction(instr) ||
-         (instr->opcode() == HloOpcode::kFusion &&
-          instr->fusion_kind() == HloInstruction::FusionKind::kLoop);
+  // We can fuse reduces and loop fusions. Elementwise instructions can be fused
+  // with any other instruction.
+  // TODO(b/112957171): This should use the same isFusible logic as
+  // instruction_fusion.
+  return instr->IsFusable() &&
+         (IsInputFusibleReduction(instr) ||
+          (instr->opcode() == HloOpcode::kFusion &&
+           instr->fusion_kind() == HloInstruction::FusionKind::kLoop) ||
+          instr->IsElementwise());
 }
 
 int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1,
@@ -177,11 +183,12 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1,
   // merge into bigger loop fusions and input (reduce) fusions become fusions
   // with multiple reduce outputs. We could fuse reduce and loop fusions
   // together too (the result being an input fusion) if we find cases where this
-  // improves things.
+  // improves things. Also disable fusing standalone input-fusible reduces into
+  // loop fusions.
   CHECK(instr1->opcode() == HloOpcode::kFusion);
   if ((instr2->opcode() == HloOpcode::kFusion &&
        instr1->fusion_kind() != instr2->fusion_kind()) ||
-      (instr2->opcode() != HloOpcode::kFusion &&
+      (IsReductionToVector(*instr2) &&
        instr1->fusion_kind() == HloInstruction::FusionKind::kLoop)) {
     return false;
   }
@@ -248,7 +255,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
       }
       // Do not fuse a producer if the other operands of the fusion are
       // reachable from the producer, this would create a cycle.
-      if (c_any_of(consumer_operands, [&](HloInstruction* operand) {
+      if (absl::c_any_of(consumer_operands, [&](HloInstruction* operand) {
             return producer != operand &&
                    reachability()->IsReachable(producer, operand);
           })) {
@@ -268,7 +275,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
   for (auto& fusion_pair : potential_fusion_list) {
     HloInstruction* producer = fusion_pair.first;
     HloInstruction* consumer = fusion_pair.second;
-    if (!c_any_of(consumer->operands(), [&](HloInstruction* operand) {
+    if (!absl::c_any_of(consumer->operands(), [&](HloInstruction* operand) {
           return producer != operand &&
                  reachability()->IsReachable(producer, operand);
         })) {
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index 14f157a5e5..c822c94f1b 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -15,19 +15,19 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-
-namespace op = xla::testing::opcode_matchers;
 
 namespace xla {
 namespace gpu {
 
+namespace op = xla::testing::opcode_matchers;
+
 using MultiOutputFusionTest = HloTestBase;
 
 const char kModulePrefix[] = R"(
@@ -47,7 +47,7 @@ const char kModulePrefix[] = R"(
 TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
   // Fusion with reduce instruction root and a sibling reduce instruction
   // sharing the same input param.
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_computation {
       p1.1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
       mul = f32[128,512,28,28]{3,2,1,0} multiply(p1.1, p1.1)
@@ -74,7 +74,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
 }
 
 TEST_F(MultiOutputFusionTest, MultiOutputFusionDifferentReduceInputShapes) {
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_computation_1 {
       p1.1 = f32[6400]{0} parameter(1)
       mul = f32[6400]{0} multiply(p1.1, p1.1)
@@ -101,7 +101,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDifferentReduceInputShapes) {
 }
 
 TEST_F(MultiOutputFusionTest, MultiOutputFusionDifferentReduceOutputShapes) {
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_computation_1 {
       p1.1 = f32[10,10]{1,0} parameter(1)
       mul = f32[10,10]{1,0} multiply(p1.1, p1.1)
@@ -130,7 +130,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDifferentReduceOutputShapes) {
 TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingReduceFusions) {
   // Two sibling fusions with reduce instruction roots sharing the same input
   // param.
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_computation_1 {
       p1.1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
       mul = f32[128,512,28,28]{3,2,1,0} multiply(p1.1, p1.1)
@@ -165,7 +165,7 @@ TEST_F(MultiOutputFusionTest,
        MultiOutputFusionSiblingReduceAndReduceMultiOutputFusion) {
   // Multi-output fusion with two reduce instructions root and a sibling reduce
   // instruction sharing the same input param.
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_computation (p0: f32[128,512,28,28]) -> (f32[512], f32[512]) {
       const.1 = f32[] constant(1)
       p0.1 = f32[128,512,28,28]{3,2,1,0} parameter(0)
@@ -198,7 +198,7 @@ TEST_F(MultiOutputFusionTest,
        MultiOutputFusionSiblingFusionCheckAgainstReduceOperand) {
   // Verify that if we already have a multi-output fusion that we prefer to pick
   // a reduce op from its operands for checking shape compatibility.
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_computation_1 {
       p1.1 = f32[10,10]{1,0} parameter(1)
       mul = f32[10,10]{1,0} multiply(p1.1, p1.1)
@@ -228,7 +228,7 @@ TEST_F(MultiOutputFusionTest,
 }
 
 TEST_F(MultiOutputFusionTest, MultiOutputFusionTwoLoops) {
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_computation_1 {
       p0.1 = f32[6400]{0} parameter(0)
       ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
@@ -256,8 +256,136 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionTwoLoops) {
               op::Tuple(op::Multiply(), op::Divide()));
 }
 
-TEST_F(MultiOutputFusionTest, ProducerConsumerFusionElementwiseAndReduce) {
+TEST_F(MultiOutputFusionTest, MultiOutputFusionLoopReduceToInputFusion) {
+  // Fusing a reduce into a loop fusion would require changing the fusion kind.
+  // That's not supported yet.
   auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      const.2 = f32[] constant(0)
+      reduce = f32[] reduce(p0, const.2), dimensions={0}, to_apply=scalar_add_computation
+      ROOT root = (f32[6400]{0}, f32[]) tuple(fusion.1, reduce)
+    })"))
+                    .ValueOrDie();
+  ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+}
+
+TEST_F(MultiOutputFusionTest, MultiOutputFusionLoopElementwise) {
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      const.2 = f32[] constant(1)
+      div = f32[6400]{0} divide(p0, const.2)
+      ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, div)
+    })"))
+                    .ValueOrDie();
+  ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  ASSERT_TRUE(fusion->IsMultiOutputFusion());
+  EXPECT_THAT(fusion->fused_expression_root(),
+              op::Tuple(op::Multiply(), op::Divide()));
+}
+
+TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingLoopsDifferentShapes) {
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      ROOT mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      const.2 = f32[] constant(0)
+      ROOT reduce = f32[8,1,5,1,1]{4,3,2,1,0} reduce(p0.2, const.2), dimensions={3}, to_apply=scalar_add_computation
+    }
+
+    ENTRY entry {
+      p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      fusion.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[8,1,5,1,1]{4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,1,1]{4,3,2,1,0}) tuple(fusion.1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+}
+
+TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingLoopAndMultiOutputLoop) {
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1)
+      exp = f32[8,1,5,16,1,1]{5,4,3,2,1,0} exponential(p0.1)
+      ROOT tuple = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(mul, exp)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      const.2 = f32[] constant(0)
+      ROOT add = f32[8,1,5,16,1,1]{5,4,3,2,1,0} add(p0.2, const.2)
+    }
+
+    ENTRY entry {
+      p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      fusion.1 = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      gte0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=0
+      gte1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=1
+      ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(gte0, gte1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  ASSERT_TRUE(fusion->IsMultiOutputFusion());
+  EXPECT_THAT(fusion->fused_expression_root(),
+              op::Tuple(op::Multiply(), op::Exp(), op::Add()));
+}
+
+TEST_F(MultiOutputFusionTest,
+       MultiOutputFusionSiblingLoopAndMultiOutputLoopDifferentShapes) {
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1)
+      exp = f32[8,1,5,16,1,1]{5,4,3,2,1,0} exponential(p0.1)
+      ROOT tuple = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(mul, exp)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      const.2 = f32[] constant(0)
+      ROOT reduce = f32[8,1,5,1,1]{4,3,2,1,0} reduce(p0.2, const.2), dimensions={3}, to_apply=scalar_add_computation
+    }
+
+    ENTRY entry {
+      p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      fusion.1 = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[8,1,5,1,1]{4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      gte0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=0
+      gte1 =  f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=1
+      ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,1,1]{4,3,2,1,0}) tuple(gte0, gte1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+}
+
+TEST_F(MultiOutputFusionTest, ProducerConsumerFusionElementwiseAndReduce) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     ENTRY reduce {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
       c0 = f32[] constant(0)
@@ -277,7 +405,7 @@ TEST_F(MultiOutputFusionTest, ProducerConsumerFusionElementwiseAndReduce) {
 }
 
 TEST_F(MultiOutputFusionTest, ProducerConsumerFusionLoopFusionAndReduce) {
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_add {
       p0.1 = f32[2,2,2]{2,1,0} parameter(0)
       p1.1 = f32[2,2,2]{2,1,0} parameter(1)
@@ -304,7 +432,7 @@ TEST_F(MultiOutputFusionTest, ProducerConsumerFusionLoopFusionAndReduce) {
 }
 
 TEST_F(MultiOutputFusionTest, ProducerConsumerFusionLoopFusionAndReduceFusion) {
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_select {
       p1.1 = f32[2,2,2]{2,1,0} parameter(1)
       c0 = f32[] constant(0)
@@ -345,7 +473,7 @@ TEST_F(MultiOutputFusionTest, ProducerConsumerFusionLoopFusionAndReduceFusion) {
 }
 
 TEST_F(MultiOutputFusionTest, ProducerConsumerFusionDoNotFuseLoopReduceFusion) {
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_element_wise {
       p0.1 = f32[2,2,2]{2,1,0} parameter(0)
       p1.1 = f32[2,2,2]{2,1,0} parameter(1)
@@ -372,7 +500,7 @@ TEST_F(MultiOutputFusionTest, ProducerConsumerFusionDoNotFuseLoopReduceFusion) {
 
 TEST_F(MultiOutputFusionTest,
        ProducerConsumerFusionFp16LoopFusionAndReduceFusion) {
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_select {
       p1.1 = f16[2,2,2]{2,1,0} parameter(1)
       c0 = f16[] constant(0)
@@ -413,7 +541,7 @@ TEST_F(MultiOutputFusionTest,
 
 TEST_F(MultiOutputFusionTest,
        ProducerConsumerFusionReduceUnfriendlyLoopFusion) {
-  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     mixed_input_layouts_computation {
       p0.1 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
       p1.1 = f16[128,1024,32,32]{3,2,1,0} parameter(1)
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index 76c9b6ab33..695feadb11 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -21,19 +21,22 @@ limitations under the License.
 #include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
+#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
@@ -72,6 +75,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/scatter_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -83,7 +87,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cuda_libdevice_path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -130,11 +133,16 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) {
 }
 
 // Runs optimization passes on the given HLO module.
+//
+// It takes a compiler pointer, as passes may compile and execute HLOs on the
+// fly for cuDNN verification or other purposes.
 Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
-                         DeviceMemoryAllocator* device_allocator) {
+                         DeviceMemoryAllocator* device_allocator,
+                         Compiler* compiler) {
   {
     HloPassPipeline pipeline("optimization");
-    pipeline.AddInvariantChecker<HloVerifier>();
+    pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
+                                              /*allow_mixed_precision=*/false);
     pipeline.AddPass<GpuHloSupportChecker>();
     ReducePrecisionInsertion::AddPasses(
         &pipeline, hlo_module->config().debug_options(),
@@ -150,7 +158,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     {
       auto& pass =
           pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
-      pass.AddInvariantChecker<HloVerifier>();
+      pass.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
+                                            /*allow_mixed_precision=*/false);
 
       // If cudnn batchnorms are enabled, rewrite batchnorm HLOs to cudnn calls
       // where possible.  Not every batchnorm op can be implemented as a call to
@@ -167,6 +176,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
 
+      pipeline.AddPass<ScatterExpander>();
+
       pass.AddPass<AlgebraicSimplifier>(
           /*is_layout_sensitive=*/false,
           [](const Shape&, const Shape&) { return false; });
@@ -195,7 +206,10 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     // Convert convolutions into CustomCalls to cudnn, then canonicalize them
     // (PadInsertion).
     HloPassPipeline pipeline("conv_canonicalization");
-    pipeline.AddInvariantChecker<HloVerifier>();
+    pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
+                                              /*allow_mixed_precision=*/false);
+    // TODO(b/31709653): Directly use the grouped convolution support of Cudnn.
+    pipeline.AddPass<ConvolutionFeatureGroupConverter>();
     pipeline.AddPass<CudnnConvolutionRewriter>();
     pipeline.AddPass<PadInsertion>();
     if (IsVoltaOrLater(*stream_exec)) {
@@ -208,9 +222,22 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
   }
 
   {
-    HloPassPipeline pipeline("layout_assignment");
+    // Run layout assignment in a separate pipeline from
+    // "post-layout-assignment" because we want everything after layout
+    // assignment to have a layout-sensitive invariant-checker, but
+    // HloPassPipeline also runs its invariant checker before any passes are
+    // run, meaning, the pipeline that contains layout assignment cannot contain
+    // a layout-sensitive verifier!
+    HloPassPipeline pipeline("layout assignment");
     pipeline.AddPass<GpuLayoutAssignment>(
         hlo_module->mutable_entry_computation_layout(), stream_exec);
+    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+  }
+
+  {
+    HloPassPipeline pipeline("post-layout_assignment");
+    pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
+                                              /*allow_mixed_precision=*/false);
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
@@ -245,8 +272,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     // the gte(customcall, 0) would probably already be into a fusion node.  We
     // can't simplify across HloComputation boundaries, so in this case we
     // wouldn't be able to simplify away the new_tuple bits.
-    pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(stream_exec,
-                                                      device_allocator);
+    pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(
+        stream_exec, device_allocator, compiler);
     // Clean up new_tuple described above.
     pipeline.AddPass<TupleSimplifier>();
 
@@ -256,17 +283,20 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
   {
     HloPassFix<HloPassPipeline> fusion("fusion");
-    fusion.AddInvariantChecker<HloVerifier>();
+    fusion.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
+                                            /*allow_mixed_precision=*/false);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
     fusion.AddPass<FusionMerger>();
     fusion.AddPass<GpuMultiOutputFusion>();
     fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,
                            /*only_fusion_computations=*/true);
+    fusion.AddPass<HloDCE>();
     TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
 
     HloPassPipeline reduce_pipeline("reduce-precision");
-    reduce_pipeline.AddInvariantChecker<HloVerifier>();
+    reduce_pipeline.AddInvariantChecker<HloVerifier>(
+        /*is_layout_sensitive=*/true, /*allow_mixed_precision=*/false);
     ReducePrecisionInsertion::AddPasses(
         &reduce_pipeline, hlo_module->config().debug_options(),
         ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
@@ -292,7 +322,8 @@ Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   // (b/27180329). Therefore, in that case, we set the output to be a copy of
   // the parameter.
   HloPassPipeline pipeline("GPU-ir-emit-prepare");
-  pipeline.AddInvariantChecker<HloVerifier>();
+  pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
+                                            /*allow_mixed_precision=*/false);
 
   // Copy insertion should be performed immediately before IR emission to avoid
   // inserting unnecessary copies (later pass adds an instruction which
@@ -342,9 +373,9 @@ void WarnIfBadPtxasVersion(const string& ptxas_path) {
   string vmaj_str, vmin_str, vdot_str;
   if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
                          &vmin_str, &vdot_str) ||
-      !tensorflow::strings::safe_strto64(vmaj_str, &vmaj) ||
-      !tensorflow::strings::safe_strto64(vmin_str, &vmin) ||
-      !tensorflow::strings::safe_strto64(vdot_str, &vdot)) {
+      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
+      !absl::SimpleAtoi(vmin_str, &vmin) ||
+      !absl::SimpleAtoi(vdot_str, &vdot)) {
     LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
                  << " --version:\n"
                  << out;
@@ -456,7 +487,7 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
   tensorflow::SubProcess ptxas_info_dumper;
   std::vector<string> ptxas_args = {
       ptxas_path, ptx_path, "-o", cubin_path,
-      tensorflow::strings::StrCat("-arch=sm_", cc_major, cc_minor)};
+      absl::StrCat("-arch=sm_", cc_major, cc_minor)};
   if (VLOG_IS_ON(2)) {
     ptxas_args.push_back("-v");
   }
@@ -492,11 +523,15 @@ NVPTXCompiler::NVPTXCompiler()
 StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* device_allocator) {
+  // We dump the post-optimization HLO in RunBackend so no need to dump it here.
+  VLOG(2) << "*** HLO Before Optimization";
+  XLA_VLOG_LINES(2, module->ToString());
+
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
   tracing::ScopedActivity activity("HLO Transforms", module->name(),
                                    /*is_expensive=*/true);
   TF_RETURN_IF_ERROR(
-      OptimizeHloModule(module.get(), stream_exec, device_allocator));
+      OptimizeHloModule(module.get(), stream_exec, device_allocator, this));
   return std::move(module);
 }
 
@@ -548,6 +583,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   // include headers, so no need for us to print them ourselves.
   XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
   XLA_VLOG_LINES(2, buffer_assignment->ToString());
+  VLOG(2) << "*** HLO After Optimization";
   XLA_VLOG_LINES(2, module->ToString());
   const string xla_dump_optimized_hlo_proto_to =
       module->config().debug_options().xla_dump_optimized_hlo_proto_to();
@@ -659,7 +695,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   // Write PTX to IR dump directory, if IR dumping was requested.
   if (!ir_dump_directory.empty()) {
     const string ptx_outfile = tensorflow::io::JoinPath(
-        ir_dump_directory, tensorflow::strings::StrCat(module->name(), ".ptx"));
+        ir_dump_directory, absl::StrCat(module->name(), ".ptx"));
     auto status = [&] {
       auto* env = tensorflow::Env::Default();
       TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(ir_dump_directory));
@@ -675,7 +711,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   const std::vector<uint8> cubin =
       CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor);
 
-  auto thunk_schedule = MakeUnique<ThunkSchedule>(
+  auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
       hlo_schedule->ThunkLaunchOrder());
   VLOG(2) << "Printing the thunk schedule...";
@@ -689,7 +725,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     cost_analysis.set_bytes_per_second(
         stream_exec->GetDeviceDescription().memory_bandwidth());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
-    profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+    profile_index_map = absl::make_unique<HloProfileIndexMap>(*module);
     profile_printer =
         CreateHloProfilePrinterData(*profile_index_map, cost_analysis);
   }
@@ -798,7 +834,7 @@ se::Platform::Id NVPTXCompiler::PlatformId() const {
 static bool InitModule() {
   xla::Compiler::RegisterCompilerFactory(
       stream_executor::cuda::kCudaPlatformId,
-      []() { return xla::MakeUnique<xla::gpu::NVPTXCompiler>(); });
+      []() { return absl::make_unique<xla::gpu::NVPTXCompiler>(); });
   return true;
 }
 static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
index d4d2909f1b..08ef6ef56c 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
@@ -20,13 +20,13 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_compiler.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_manager.cc b/tensorflow/compiler/xla/service/gpu/outfeed_manager.cc
index 4aaf0c9e14..2fa170964e 100644
--- a/tensorflow/compiler/xla/service/gpu/outfeed_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/outfeed_manager.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h
index 192359f026..11dc56a64f 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h
+++ b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h
@@ -32,9 +32,7 @@ namespace gpu {
 // TODO(jlebar): Also pad dots.
 class PadForTensorCores : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override {
-    return "pad for tensor cores";
-  }
+  absl::string_view name() const override { return "pad for tensor cores"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc
index 99e7580b82..104af48c82 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc
@@ -29,7 +29,12 @@ namespace {
 namespace op = xla::testing::opcode_matchers;
 using ::testing::_;
 
-using PadForTensorCoresTest = HloVerifiedTestBase;
+class PadForTensorCoresTest : public HloVerifiedTestBase {
+ public:
+  PadForTensorCoresTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+};
 
 TEST_F(PadForTensorCoresTest, PadF16ForwardConvInputChannels) {
   ParseAndVerifyModule(R"(
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index b22040eee1..98cc21ccac 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@@ -69,7 +70,7 @@ HloInstruction* MaybePaddedAndSlicedInput(
     PrimitiveType element_type = input->shape().element_type();
     HloInstruction* padding =
         computation->AddInstruction(HloInstruction::CreateConstant(
-            MakeUnique<Literal>(LiteralUtil::Zero(element_type))));
+            absl::make_unique<Literal>(LiteralUtil::Zero(element_type))));
     input = MakePadHlo(input, padding, padding_config).ValueOrDie();
   }
 
@@ -126,7 +127,7 @@ HloInstruction* MaybePaddedKernel(const Window& conv_window,
   PrimitiveType element_type = kernel->shape().element_type();
   HloInstruction* padding =
       computation->AddInstruction(HloInstruction::CreateConstant(
-          MakeUnique<Literal>(LiteralUtil::Zero(element_type))));
+          absl::make_unique<Literal>(LiteralUtil::Zero(element_type))));
   return MakePadHlo(kernel, padding, padding_config).ValueOrDie();
 }
 }  // namespace
@@ -236,7 +237,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
   HloComputation* computation = backward_conv->parent();
   HloInstruction* output = backward_conv->mutable_operand(1);
   HloInstruction* padding = computation->AddInstruction(
-      HloInstruction::CreateConstant(MakeUnique<Literal>(
+      HloInstruction::CreateConstant(absl::make_unique<Literal>(
           LiteralUtil::Zero(input->shape().element_type()))));
   HloInstruction* padded_input =
       MakePadHlo(input, padding, input_padding_config).ValueOrDie();
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.h b/tensorflow/compiler/xla/service/gpu/pad_insertion.h
index 67e51509e4..a622e894ed 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.h
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.h
@@ -26,7 +26,7 @@ namespace gpu {
 // padding, so that they can be lowered to cuDNN convolution.
 class PadInsertion : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "pad insertion"; }
+  absl::string_view name() const override { return "pad insertion"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 3838fee674..ca57cacb98 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -57,8 +57,8 @@ ParallelLoopEmitter::ParallelLoopEmitter(
       unroll_factor_(unroll_factor) {}
 
 std::vector<llvm_ir::IrArray::Index>
-ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
-    tensorflow::StringPiece loop_name, llvm::Type* index_type) {
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name,
+                                                   llvm::Type* index_type) {
   // Emit the following code in LLVM IR:
   //   linear_index = blockIdx.x * blockDim.x + threadIdx.x;
   //   if (linear_index < num_elements) {
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index b82a23419d..cc7da2e73b 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -58,7 +58,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ~ParallelLoopEmitter() override = default;
 
   std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
-      tensorflow::StringPiece loop_name, llvm::Type* index_type) override;
+      absl::string_view loop_name, llvm::Type* index_type) override;
 
  private:
   // The thread and block dimension to parallelize the loop on.
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index d3fd0544fb..c927c5ee16 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include <ostream>
 #include <string>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index 0806dd5161..5b6cf2c04d 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
@@ -119,7 +119,7 @@ int ComputeStreamToAssign(
 }  // namespace
 
 std::unique_ptr<StreamAssignment> AssignStreams(const HloModule& module) {
-  auto stream_assignment = MakeUnique<StreamAssignment>();
+  auto stream_assignment = absl::make_unique<StreamAssignment>();
   const HloComputation& computation = *module.entry_computation();
   std::unique_ptr<HloReachabilityMap> reachability =
       computation.ComputeReachability();
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
index 6f4bb0580e..3f75d8b559 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -33,7 +34,7 @@ class StreamAssignmentTest : public HloTestBase {
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
     config.set_debug_options(debug_options);
-    return MakeUnique<HloModule>("test_module", config);
+    return absl::make_unique<HloModule>("test_module", config);
   }
 
   // Pre-canned shapes.
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
index 4fad3f46cf..db4a33dc56 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -35,13 +35,13 @@ cc_library(
         "requires-gpu-sm35",
     ],
     deps = [
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:gpu_plugin",
         "//tensorflow/compiler/xla/service/gpu:gpu_executable",
         "//tensorflow/compiler/xla/tests:filecheck",
         "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -60,6 +60,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -94,6 +95,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -150,6 +152,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -168,6 +171,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
index 4b8415fe91..0e84ec7e62 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/tests/filecheck.h"
 #include "tensorflow/core/platform/logging.h"
@@ -32,7 +32,7 @@ std::unique_ptr<HloModule> GpuCodegenTest::CreateNewModuleWithFTZ(bool ftz) {
   debug_options.add_xla_disable_hlo_passes("constant_folding");
   config.set_debug_options(debug_options);
 
-  return MakeUnique<HloModule>(TestName(), config);
+  return absl::make_unique<HloModule>(TestName(), config);
 }
 
 void GpuCodegenTest::CompileAndVerifyPtx(std::unique_ptr<HloModule> hlo_module,
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
index ce69e058e6..4550f36fdf 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
index e5958165ef..a06576df7b 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
index cca35316f0..15d1e269cc 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -27,13 +27,22 @@ namespace {
 
 class GpuKernelTilingTest : public GpuCodegenTest {
  protected:
-  GpuKernelTilingTest() {
+  GpuKernelTilingTest() {}
+
+  // Most tests in this file want to skip layout assignment, but a few need it
+  // enabled.
+  HloModuleConfig ConfigWithLayoutAssignment() {
+    return GetModuleConfigForTest();
+  }
+
+  HloModuleConfig ConfigWithoutLayoutAssignment() {
+    HloModuleConfig config;
     auto debug_options = HloTestBase::GetDebugOptionsForTest();
-    config_.set_debug_options(debug_options);
     // Disable layout_assignment to use the preassigned layouts.
-    debug_options.add_xla_disable_hlo_passes("layout_assignment");
+    debug_options.add_xla_disable_hlo_passes("layout-assignment");
+    config.set_debug_options(debug_options);
+    return config;
   }
-  HloModuleConfig config_;
 };
 
 TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
@@ -46,7 +55,13 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
     })";
 
   // Check that a call to llvm.nvvm.barrier0 is generated.
-  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  //
+  // We must enable layout assignment in order for this test to work correctly.
+  // AlgebraicSimplifier removes copy1; it's added back by layout assignment,
+  // which respects the module's entry computation layout.  But if we don't run
+  // layout assignment...well, nobody else adds the copy back.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithLayoutAssignment()).ValueOrDie();
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @copy
@@ -68,8 +83,11 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
       ROOT copy1 = f16[2,3,64]{1,0,2} copy(para0)
     })";
 
-  // Check that a call to llvm.nvvm.barrier0 is not generated.
-  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  // Check that a call to llvm.nvvm.barrier0 is not generated.  As in
+  // UnnestedTransposeWithProperDimensionsTiled, we must run layout assignment
+  // here.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithLayoutAssignment()).ValueOrDie();
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @copy
@@ -95,7 +113,8 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
     })";
 
   // Check that a call to llvm.nvvm.barrier0 is generated.
-  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
@@ -128,7 +147,8 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
     })";
 
   // Check that a call to llvm.nvvm.barrier0 is generated.
-  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
@@ -162,7 +182,8 @@ TEST_F(GpuKernelTilingTest,
     })";
 
   // Check that a call to llvm.nvvm.barrier0 is not generated.
-  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
index 6c9ae7bada..6a9ecd9dae 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
index c42e5704a4..15198865bd 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
index 9622936306..0f2d5568ca 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
@@ -138,6 +138,9 @@ TEST_F(GpuUnrollingTest, UnrollMultiOutputFusion) {
   HloModuleConfig config;
   auto debug_options = HloTestBase::GetDebugOptionsForTest();
   debug_options.set_xla_gpu_max_kernel_unroll_factor(2);
+  // Disable layout assignment for this test.  Layout assignment does not expect
+  // fusions to be present, and so it does the wrong thing.
+  debug_options.add_xla_disable_hlo_passes("layout-assignment");
   config.set_debug_options(debug_options);
 
   const char *const kMultiOutputFusionModule = R"(
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index bdb062837c..141f321938 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -144,16 +144,15 @@ const std::list<const Thunk*>& ThunkSchedule::DependsOn(
 string ThunkSchedule::ToString() const {
   string result = "Total order:\n";
   for (Thunk* thunk : thunk_total_order_) {
-    tensorflow::strings::StrAppend(&result, "\t",
-                                   thunk->hlo_instruction()->ToString(), "\n");
+    absl::StrAppend(&result, "\t", thunk->hlo_instruction()->ToString(), "\n");
   }
-  tensorflow::strings::StrAppend(&result, "Dependencies:\n");
+  absl::StrAppend(&result, "Dependencies:\n");
   for (const auto& entry : depends_on_) {
     const Thunk* dependent = entry.first;
     for (const Thunk* dependency : entry.second) {
-      tensorflow::strings::StrAppend(
-          &result, "\t", dependent->hlo_instruction()->name(), " depends on ",
-          dependency->hlo_instruction()->name(), "\n");
+      absl::StrAppend(&result, "\t", dependent->hlo_instruction()->name(),
+                      " depends on ", dependency->hlo_instruction()->name(),
+                      "\n");
     }
   }
   return result;
diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
index 8579b1545f..989b542ff4 100644
--- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -25,7 +26,7 @@ Status TupleThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
                                    se::Stream* stream,
                                    HloExecutionProfiler* profiler) {
   auto size = tuple_element_buffers_.size();
-  auto tuple_element_buffer_addresses = MakeUnique<void*[]>(size);
+  auto tuple_element_buffer_addresses = absl::make_unique<void*[]>(size);
   for (int i = 0; i != size; ++i) {
     tuple_element_buffer_addresses[i] =
         buffer_allocations.GetDeviceAddress(tuple_element_buffers_[i]).opaque();
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index d81d87e7dc..828fc2884b 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/while_thunk.h"
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -34,9 +34,9 @@ WhileThunk::WhileThunk(
       // and body_thunk_sequence_ constructors because these SequentialThunks
       // are logically "part of" this WhileThunk, and shouldn't be profiled
       // separately from it.
-      condition_thunk_sequence_(MakeUnique<SequentialThunk>(
+      condition_thunk_sequence_(absl::make_unique<SequentialThunk>(
           std::move(*condition_thunk_sequence), nullptr)),
-      body_thunk_sequence_(MakeUnique<SequentialThunk>(
+      body_thunk_sequence_(absl::make_unique<SequentialThunk>(
           std::move(*body_thunk_sequence), nullptr)) {}
 
 Status WhileThunk::Initialize(const GpuExecutable& executable,
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index c5f3906356..40183de96e 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -118,7 +118,8 @@ class WhileTransformerTest : public HloTestBase {
   }
 
   void RunCopyInsertionPass() {
-    HloVerifier verifier;
+    HloVerifier verifier(/*layout_sensitive=*/false,
+                         /*allow_mixed_precision=*/false);
     TF_ASSERT_OK(verifier.Run(module_.get()).status());
     CopyInsertion copy_insertion;
     TF_ASSERT_OK(copy_insertion.Run(module_.get()).status());
diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc
index aa89567ee8..a2be89511b 100644
--- a/tensorflow/compiler/xla/service/graphviz_example.cc
+++ b/tensorflow/compiler/xla/service/graphviz_example.cc
@@ -22,9 +22,10 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -33,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -43,8 +43,7 @@ namespace {
 // Adds a computation to the given HLO module which adds a scalar constant to
 // its parameter and returns the result.
 HloComputation* AddScalarConstantComputation(int64 addend, HloModule* module) {
-  auto builder =
-      HloComputation::Builder(tensorflow::strings::StrCat("add_", addend));
+  auto builder = HloComputation::Builder(absl::StrCat("add_", addend));
   auto x_value = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "x_value"));
   auto half = builder.AddInstruction(
@@ -84,7 +83,7 @@ HloComputation* CallForwardingComputation(HloComputation* computation,
 // the module.
 std::unique_ptr<HloModule> MakeBigGraph() {
   HloModuleConfig config;
-  auto module = MakeUnique<HloModule>("BigGraph", config);
+  auto module = absl::make_unique<HloModule>("BigGraph", config);
 
   auto builder = HloComputation::Builder("TestBigGraphvizGraph");
 
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 4005fc0d11..38c3982ebf 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -45,7 +46,7 @@ StatusOr<int64> HeapSimulator::MinimumMemoryForModule(
   // bound, by minimizing the liveness of sub-computations.
   TF_ASSIGN_OR_RETURN(
       HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), *module,
+      HeapSimulator::Run(absl::make_unique<NoFragmentationStatsHeap>(), *module,
                          module_sequence, *points_to_analysis, size_function));
   return result.heap_size;
 }
@@ -60,9 +61,10 @@ StatusOr<int64> HeapSimulator::MinimumMemoryForComputation(
         memory_by_computation) {
   TF_ASSIGN_OR_RETURN(
       HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
-                         sequence, points_to_analysis, size_function,
-                         HeapSimulator::Options(), memory_by_computation));
+      HeapSimulator::Run(absl::make_unique<NoFragmentationStatsHeap>(),
+                         computation, sequence, points_to_analysis,
+                         size_function, HeapSimulator::Options(),
+                         memory_by_computation));
   return result.heap_size;
 }
 
@@ -142,7 +144,7 @@ Status HeapSimulator::RunComputation(
         }
       } else {
         // A GetTupleElement doesn't need to keep all of its operand's buffers
-        // alive. It only needs the buffers that relate to the element its
+        // alive. It only needs the buffers that relate to the element it's
         // extracting, and the tuple it's extracting from, but not the buffers
         // for the other elements.
         for (const BufferValue* buffer : points_to.element({})) {
@@ -275,13 +277,13 @@ Status HeapSimulator::RunComputation(
                                                  *memory_by_computation_);
     }
 
-    // If the whole module is sequential, we can save memory by running the
-    // heap-simulation for sub-computations inline. E.g. the buffers for the
-    // condition and body of a kWhile instruction are only live for the duration
-    // of the instruction itself.
+    // If all computations in the module have been scheduled, we can save memory
+    // by running the heap-simulation for sub-computations inline. E.g. the
+    // buffers for the condition and body of a kWhile instruction are only live
+    // for the duration of the instruction itself.
     //
     // The order that the sub-computations are simulated does not affect
-    // correctness; since the whole module is sequential, we know that the
+    // correctness; since the whole module has been scheduled, we know that the
     // sub-computations will never be run concurrently.
     if (module_sequence_ != nullptr) {
       if (instruction->opcode() == HloOpcode::kCall ||
@@ -344,7 +346,7 @@ HeapSimulator::HeapSimulator(
     const SequentialHloOrdering::HloModuleSequence* module_sequence,
     const tensorflow::gtl::FlatMap<const HloComputation*, int64>*
         memory_by_computation)
-    : no_fragmentation_stats_(MakeUnique<NoFragmentationStatsHeap>()),
+    : no_fragmentation_stats_(absl::make_unique<NoFragmentationStatsHeap>()),
       algorithm_(std::move(algorithm)),
       size_fn_(size_fn),
       options_(options),
@@ -378,9 +380,10 @@ void HeapSimulator::Alloc(const BufferValue* buffer,
 
   allocated_buffers_.insert(buffer);
   const int64 size = size_fn_(*buffer);
-  algorithm_->Alloc(buffer, size);
-  no_fragmentation_stats_->Alloc(buffer, size);
-
+  const HloInstruction* instruction_to_calc_aliasing =
+      memory_by_computation_ == nullptr ? nullptr : instruction;
+  algorithm_->Alloc(buffer, size, instruction_to_calc_aliasing);
+  no_fragmentation_stats_->Alloc(buffer, size, instruction_to_calc_aliasing);
   FillDebugTrace(HeapSimulatorTrace::Event::ALLOC, buffer, instruction,
                  nullptr);
 }
@@ -518,6 +521,18 @@ void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size) {
   }
 }
 
+void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size,
+                                     const HloInstruction* instruction) {
+  // The output buffer of while/call/conditional is always aliased with the
+  // output buffer of the root instruction in the body. Don't double count.
+  if (instruction == nullptr ||
+      (instruction->opcode() != HloOpcode::kWhile &&
+       instruction->opcode() != HloOpcode::kCall &&
+       instruction->opcode() != HloOpcode::kConditional)) {
+    Alloc(buffer, size);
+  }
+}
+
 void NoFragmentationStatsHeap::AccountForSubcomputationMemory(
     const HloInstruction* instruction,
     const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index 811a6042df..af05bedee7 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -36,6 +36,7 @@ namespace xla {
 
 // Forward declare classes defined below.
 class HeapAlgorithm;
+class NoFragmentationStatsHeap;
 
 // HeapSimulator assigns buffer offsets by running a simulation of a regular
 // memory heap with Alloc and Free calls.  It only works for completely
@@ -161,7 +162,10 @@ class HeapSimulator {
                       const HloInstruction* instruction,
                       const BufferValue* shared_with_canonical);
 
-  const std::unique_ptr<HeapAlgorithm> no_fragmentation_stats_;
+  // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap,
+  // in which case we are calculating the same allocs/frees twice in the
+  // simulation.
+  const std::unique_ptr<NoFragmentationStatsHeap> no_fragmentation_stats_;
   const std::unique_ptr<HeapAlgorithm> algorithm_;
   const BufferValue::SizeFunction size_fn_;
   const Options options_;
@@ -216,6 +220,21 @@ class HeapAlgorithm {
   // Alloc allocates a buffer of 'size' bytes.
   virtual void Alloc(const BufferValue* buffer, int64 size) = 0;
 
+  // NoFragmentationStatsHeap overrides this method.
+  virtual void Alloc(const BufferValue* buffer, int64 size,
+                     const HloInstruction* instruction) {
+    Alloc(buffer, size);
+  }
+
+  // Takes memory usage of subcomputations into account when calculating the
+  // memory usage of a computation. Currently, we don't handle buffer aliasing
+  // between computations entirely correctly. We are careful to not double count
+  // for the output buffers of whiles/conds/calls. But we don't take into
+  // account other aliases, such as for the while init. A more thorough solution
+  // would require something like BufferAssignment::BuildColocatedBufferSets.
+  // TODO(b/65835246):
+  // Since TuplePointsToAnalysis is being replaced with a module-aware alias
+  // analysis, it's not worth making major changes to HeapSimulator now.
   virtual void AccountForSubcomputationMemory(
       const HloInstruction* instruction,
       const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
@@ -240,6 +259,9 @@ class NoFragmentationStatsHeap : public HeapAlgorithm {
 
   void Alloc(const BufferValue* buffer, int64 size) override;
 
+  void Alloc(const BufferValue* buffer, int64 size,
+             const HloInstruction* instruction) override;
+
   void AccountForSubcomputationMemory(
       const HloInstruction* instruction,
       const tensorflow::gtl::FlatMap<const HloComputation*, int64>&
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index b41dc66fe9..5f85f14565 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -137,7 +138,7 @@ class HeapSimulatorTracker {
       const string& name, std::unique_ptr<HloComputation> computation,
       const std::vector<const HloInstruction*>& instruction_sequence) {
     HloModuleConfig config;
-    module_ = MakeUnique<HloModule>(name, config);
+    module_ = absl::make_unique<HloModule>(name, config);
     module_->AddEntryComputation(std::move(computation));
     points_to_analysis_ =
         TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
@@ -146,8 +147,8 @@ class HeapSimulatorTracker {
     // the secondary sorting criteria of DecreasingSizeRunsHeap to sort calls by
     // buffer id, for determinism in the tests.
     auto zero_size = [](const BufferValue& buffer) { return 0; };
-    auto algorithm = MakeUnique<DecreasingSizeRunsHeap>(
-        MakeUnique<HeapCallRecorder>(&actual_calls_));
+    auto algorithm = absl::make_unique<DecreasingSizeRunsHeap>(
+        absl::make_unique<HeapCallRecorder>(&actual_calls_));
     result_ = HeapSimulator::Run(
                   std::move(algorithm), *module_->entry_computation(),
                   instruction_sequence, *points_to_analysis_, zero_size)
@@ -156,7 +157,7 @@ class HeapSimulatorTracker {
 
   explicit HeapSimulatorTracker(const string& name) {
     HloModuleConfig config;
-    module_ = MakeUnique<HloModule>(name, config);
+    module_ = absl::make_unique<HloModule>(name, config);
   }
 
   // Similar to the single entry computation constructor above, but runs the
@@ -182,8 +183,8 @@ class HeapSimulatorTracker {
     auto size_fn = [&reverse_position](const BufferValue& buffer) {
       return reverse_position[buffer.instruction()];
     };
-    auto algorithm = MakeUnique<DecreasingSizeRunsHeap>(
-        MakeUnique<HeapCallRecorder>(&actual_calls_));
+    auto algorithm = absl::make_unique<DecreasingSizeRunsHeap>(
+        absl::make_unique<HeapCallRecorder>(&actual_calls_));
     result_ = HeapSimulator::Run(std::move(algorithm), *module_,
                                  module_sequence, *points_to_analysis_, size_fn)
                   .ConsumeValueOrDie();
@@ -675,7 +676,8 @@ class HeapAlgorithmTestBase : public ::testing::Test {
     const BufferValue::Id id = buffers_.size();
     auto const0 = builder_.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
-    buffers_.emplace_back(MakeUnique<HloValue>(id, const0, ShapeIndex{}));
+    buffers_.emplace_back(
+        absl::make_unique<HloValue>(id, const0, ShapeIndex{}));
     return buffers_.back().get();
   }
 
@@ -724,7 +726,8 @@ class DecreasingSizeRunsHeapTest : public HeapAlgorithmTestBase {};
 
 TEST_F(DecreasingSizeRunsHeapTest, Empty) {
   CallSequence call_sequence;
-  DecreasingSizeRunsHeap heap(MakeUnique<HeapCallRecorder>(&call_sequence));
+  DecreasingSizeRunsHeap heap(
+      absl::make_unique<HeapCallRecorder>(&call_sequence));
   heap.Finish();
   EXPECT_EQ(call_sequence, CallSequence({
                                {kFinish, nullptr},
@@ -733,7 +736,8 @@ TEST_F(DecreasingSizeRunsHeapTest, Empty) {
 
 TEST_F(DecreasingSizeRunsHeapTest, Simple) {
   CallSequence call_sequence;
-  DecreasingSizeRunsHeap heap(MakeUnique<HeapCallRecorder>(&call_sequence));
+  DecreasingSizeRunsHeap heap(
+      absl::make_unique<HeapCallRecorder>(&call_sequence));
   heap.Alloc(buffer_a_, 10);
   heap.Alloc(buffer_b_, 20);
   heap.Alloc(buffer_c_, 30);
@@ -760,7 +764,8 @@ TEST_F(DecreasingSizeRunsHeapTest, Simple) {
 
 TEST_F(DecreasingSizeRunsHeapTest, Mixed) {
   CallSequence call_sequence;
-  DecreasingSizeRunsHeap heap(MakeUnique<HeapCallRecorder>(&call_sequence));
+  DecreasingSizeRunsHeap heap(
+      absl::make_unique<HeapCallRecorder>(&call_sequence));
   heap.Alloc(buffer_a_, 10);
   heap.Alloc(buffer_b_, 20);
   heap.Free(buffer_b_, 20);
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index be9098f555..821c599863 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -34,6 +34,7 @@ import "tensorflow/compiler/xla/xla_data.proto";
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
+// Next ID: 52
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -45,6 +46,8 @@ message HloInstructionProto {
   reserved "control_predecessor_names";
   reserved 6;
   reserved "called_computation_names";
+  reserved 44;
+  reserved "replica_group_ids";
 
   string name = 1;
   string opcode = 2;
@@ -74,6 +77,11 @@ message HloInstructionProto {
   // Describes the dimension numbers used for a convolution.
   xla.ConvolutionDimensionNumbers convolution_dimension_numbers = 16;
 
+  // The number of feature groups. Used for a convolution. Must be a divisor of
+  // the input feature dimension and output feature dimension. If not specified,
+  // it will use a default value of 1.
+  int64 feature_group_count = 50;
+
   // Describes the [begin, end) index range and stride for slices.
   message SliceDimensions {
     int64 start = 1;
@@ -133,7 +141,7 @@ message HloInstructionProto {
 
   // Gather dimension numbers.
   xla.GatherDimensionNumbers gather_dimension_numbers = 33;
-  repeated int64 gather_window_bounds = 34;
+  repeated int64 gather_slice_sizes = 34;
 
   // Compute Host.
   string channel_name = 41;
@@ -152,9 +160,6 @@ message HloInstructionProto {
   string backend_config = 43;
 
   // Cross replica op fields.
-  // TODO(b/112107579): remove replica_group_ids field and always use
-  // replica_groups.
-  repeated int64 replica_group_ids = 44;
   repeated ReplicaGroup replica_groups = 49;
   int64 all_reduce_id = 45;
   string cross_replica_sum_barrier = 46;
@@ -165,6 +170,9 @@ message HloInstructionProto {
   bool is_host_transfer = 47;
 
   xla.ScatterDimensionNumbers scatter_dimension_numbers = 48;
+
+  // Precision configuration for the instruction. Has backend-specific meaning.
+  xla.PrecisionConfigProto precision_config = 51;
 }
 
 // Serialization of HloComputation.
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index e8a4b034b4..0986da65cb 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_buffer.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -28,15 +30,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
-using ::tensorflow::str_util::Join;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
 
 // Data structure used to construct the alias analysis. Thrown away after alias
 // analysis is complete. This data structure keeps track of which sets of
@@ -414,7 +412,7 @@ Status HloAliasAnalysis::Verify() const {
 }
 
 string HloAliasAnalysis::ToString() const {
-  string out = StrCat("HloAliasAnalysis, module ", module_->name(), "\n");
+  string out = absl::StrCat("HloAliasAnalysis, module ", module_->name(), "\n");
   StrAppend(&out, "  Buffers at each position:\n");
   for (const HloComputation* computation : module_->computations()) {
     for (const HloInstruction* instruction : computation->instructions()) {
@@ -457,7 +455,7 @@ StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
   VLOG(2) << "HloAliasAnalysis::Run on module " << module->name();
   XLA_VLOG_LINES(2, module->ToString());
 
-  auto alias_analysis = WrapUnique(new HloAliasAnalysis(module));
+  auto alias_analysis = absl::WrapUnique(new HloAliasAnalysis(module));
   TF_ASSIGN_OR_RETURN(alias_analysis->dataflow_analysis_,
                       HloDataflowAnalysis::Run(*module, /*ssa_form=*/true,
                                                /*bitcast_defines_value=*/false,
@@ -537,10 +535,10 @@ bool HloAliasAnalysis::HasLiveRangeInterference(
       if (ordering.MayInterfere(*values[i - 1], *values[i],
                                 dataflow_analysis())) {
         VLOG(1) << "In buffer " << buffer.id() << " containing values:\n  "
-                << Join(values, ", ",
-                        [](string* out, const HloValue* value) {
-                          StrAppend(out, value->ToShortString());
-                        })
+                << absl::StrJoin(values, ", ",
+                                 [](string* out, const HloValue* value) {
+                                   StrAppend(out, value->ToShortString());
+                                 })
 
                 << "\nValue " << values[i - 1]->ToShortString()
                 << " may interfere with value " << values[i]->ToShortString();
diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc
index e16413f361..6c11a073b7 100644
--- a/tensorflow/compiler/xla/service/hlo_buffer.cc
+++ b/tensorflow/compiler/xla/service/hlo_buffer.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -27,15 +29,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
-using ::tensorflow::str_util::Join;
-using ::tensorflow::strings::StrCat;
-
 bool HloBuffer::operator==(const HloBuffer& other) const {
   bool equal = id() == other.id();
   if (equal) {
@@ -59,10 +56,11 @@ std::vector<HloPosition> HloBuffer::ComputePositions() const {
 }
 
 string HloBuffer::ToString() const {
-  return StrCat("HloBuffer ", id_, ", values: ",
-                Join(values_, ", ", [](string* result, const HloValue* value) {
-                  result->append(value->ToShortString());
-                }));
+  return absl::StrCat(
+      "HloBuffer ", id_, ", values: ",
+      absl::StrJoin(values_, ", ", [](string* result, const HloValue* value) {
+        result->append(value->ToShortString());
+      }));
 }
 
 std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 441288da1a..cf95b112d7 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -23,9 +23,13 @@ limitations under the License.
 #include <set>
 #include <sstream>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -36,13 +40,11 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
-using ::tensorflow::strings::StrCat;
+using absl::StrCat;
 
 std::unique_ptr<HloComputation> HloComputation::Builder::Build(
     HloInstruction* root_instruction) {
@@ -56,8 +58,8 @@ std::unique_ptr<HloComputation> HloComputation::Builder::Build(
   HloInstruction* root =
       root_instruction ? root_instruction : last_added_instruction_;
   CHECK_NE(nullptr, root);
-  return WrapUnique(new HloComputation(name_, parameter_count, &instructions_,
-                                       root, fusion_instruction_));
+  return absl::WrapUnique(new HloComputation(
+      name_, parameter_count, &instructions_, root, fusion_instruction_));
 }
 
 HloComputation::HloComputation(
@@ -135,7 +137,7 @@ string RenameFusionParameter(const string& original_name, int64 new_param_no) {
   }
   string after_param = original_name.substr(index + param_underscore.size());
   int64 numeric_suffix;
-  if (tensorflow::strings::safe_strto64(after_param, &numeric_suffix)) {
+  if (absl::SimpleAtoi(after_param, &numeric_suffix)) {
     return StrCat(original_name.substr(0, index + param_underscore.size()),
                   new_param_no);
   }
@@ -320,6 +322,7 @@ void ComputeComputationPostOrder(
 enum State { kVisiting, kVisited };
 
 void ComputeInstructionPostOrder(
+    std::map<int64, std::vector<HloInstruction*>> channel_dependency_map,
     std::vector<HloInstruction*>* post_order, HloInstruction* root,
     tensorflow::gtl::FlatMap<HloInstruction*, State>* visited) {
   std::vector<HloInstruction*> dfs_stack;
@@ -354,12 +357,67 @@ void ComputeInstructionPostOrder(
     for (HloInstruction* op : current->control_predecessors()) {
       dfs_stack.emplace_back(op);
     }
+
+    // Add inputs for send->recv_done dependencies and cross-replica-sum
+    // dependencies.
+    switch (current->opcode()) {
+      case HloOpcode::kRecvDone: {
+        const auto& dependencies =
+            channel_dependency_map[current->channel_id()];
+        for (HloInstruction* op : dependencies) {
+          dfs_stack.emplace_back(op);
+        }
+        break;
+      }
+      case HloOpcode::kCrossReplicaSum: {
+        auto all_reduce_id = current->all_reduce_id();
+        if (all_reduce_id) {
+          const auto& dependencies =
+              channel_dependency_map[all_reduce_id.value()];
+          for (HloInstruction* op : dependencies) {
+            dfs_stack.emplace_back(op);
+          }
+        }
+        break;
+      }
+      default:
+        break;
+    }
   }
 }
 
 }  // namespace
 
+std::map<int64, std::vector<HloInstruction*>>
+HloComputation::ComputeChannelDependencies() const {
+  std::map<int64, std::vector<HloInstruction*>> channel_dependency_map;
+  for (const auto& instruction : instructions_) {
+    switch (instruction->opcode()) {
+      case HloOpcode::kSend: {
+        channel_dependency_map[instruction->channel_id()].push_back(
+            instruction.get());
+        break;
+      }
+      case HloOpcode::kCrossReplicaSum: {
+        auto all_reduce_id = instruction->all_reduce_id();
+        if (all_reduce_id) {
+          auto& dependencies = channel_dependency_map[all_reduce_id.value()];
+          absl::c_copy(instruction->operands(),
+                       std::back_inserter(dependencies));
+          absl::c_copy(instruction->control_predecessors(),
+                       std::back_inserter(dependencies));
+        }
+        break;
+      }
+      default:
+        break;
+    }
+  }
+  return channel_dependency_map;
+}
+
 std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
+  auto channel_dependency_map = ComputeChannelDependencies();
   std::vector<HloInstruction*> post_order;
   post_order.reserve(instruction_count());
   std::vector<HloInstruction*> trace_instructions;
@@ -371,7 +429,8 @@ std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
       // users).
       trace_instructions.push_back(instruction.get());
     } else if (instruction->users().empty()) {
-      ComputeInstructionPostOrder(&post_order, instruction.get(), &visited);
+      ComputeInstructionPostOrder(channel_dependency_map, &post_order,
+                                  instruction.get(), &visited);
     }
   }
   post_order.insert(post_order.end(), trace_instructions.begin(),
@@ -493,9 +552,9 @@ HloComputation::CreateFromProto(
               return to_proto_id[a.get()] < to_proto_id[b.get()];
             });
 
-  return WrapUnique(new HloComputation(proto.name(), parameter_count,
-                                       &instructions, root,
-                                       /*fusion_instruction=*/nullptr));
+  return absl::WrapUnique(new HloComputation(proto.name(), parameter_count,
+                                             &instructions, root,
+                                             /*fusion_instruction=*/nullptr));
 }
 
 void HloComputation::FuseInstructionsInto(
@@ -624,6 +683,9 @@ ProgramShape HloComputation::ComputeProgramShape() const {
 }
 
 bool HloComputation::operator==(const HloComputation& other) const {
+  if (this == &other) {
+    return true;
+  }
   std::set<std::pair<const HloInstruction*, const HloInstruction*>> visited;
   std::function<bool(const HloInstruction*, const HloInstruction*)> eq =
       [&visited, &eq](const HloInstruction* a, const HloInstruction* b) {
@@ -674,13 +736,34 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction,
 std::unique_ptr<HloReachabilityMap> HloComputation::ComputeReachability()
     const {
   const auto& all = MakeInstructionPostOrder();
-  auto result = MakeUnique<HloReachabilityMap>(all);
+  auto result = absl::make_unique<HloReachabilityMap>(all);
+  auto channel_dependency_map = ComputeChannelDependencies();
 
   std::vector<HloInstruction*> inputs;
   for (const HloInstruction* hlo : all) {
     inputs.assign(hlo->operands().begin(), hlo->operands().end());
     inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
                   hlo->control_predecessors().end());
+
+    switch (hlo->opcode()) {
+      case HloOpcode::kRecvDone: {
+        const auto& dependencies = channel_dependency_map[hlo->channel_id()];
+        absl::c_copy(dependencies, std::back_inserter(inputs));
+        break;
+      }
+      case HloOpcode::kCrossReplicaSum: {
+        auto all_reduce_id = hlo->all_reduce_id();
+        if (all_reduce_id) {
+          const auto& dependencies =
+              channel_dependency_map[all_reduce_id.value()];
+          absl::c_copy(dependencies, std::back_inserter(inputs));
+        }
+        break;
+      }
+      default:
+        break;
+    }
+
     result->FastSetReachabilityToUnion(inputs, hlo);
   }
   return result;
@@ -723,11 +806,10 @@ std::vector<HloInstruction*> HloComputation::CollectUnreachableRoots() const {
     }
   }
   VLOG(3) << "Unreachable roots:"
-          << tensorflow::str_util::Join(
-                 unreachable_roots, "\n\t",
-                 [](string* out, const HloInstruction* hlo) {
-                   tensorflow::strings::StrAppend(out, hlo->ToString());
-                 });
+          << absl::StrJoin(unreachable_roots, "\n\t",
+                           [](string* out, const HloInstruction* hlo) {
+                             absl::StrAppend(out, hlo->ToString());
+                           });
   return unreachable_roots;
 }
 
@@ -829,7 +911,7 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     HloCloneContext* context, const string& suffix) {
   std::unique_ptr<HloCloneContext> context_ptr;
   if (context == nullptr) {
-    context_ptr = MakeUnique<HloCloneContext>(parent(), suffix);
+    context_ptr = absl::make_unique<HloCloneContext>(parent(), suffix);
     context = context_ptr.get();
   }
 
@@ -898,12 +980,11 @@ void HloComputation::UniquifyName(NameUniquer* name_uniquer) {
   name_ = name_uniquer->GetUniqueName(name_);
 }
 
-HloInstruction* HloComputation::GetInstructionWithName(
-    tensorflow::StringPiece name) {
+HloInstruction* HloComputation::GetInstructionWithName(absl::string_view name) {
   auto instructions_in_computation = instructions();
-  auto it = c_find_if(instructions_in_computation, [&](HloInstruction* instr) {
-    return instr->name() == name;
-  });
+  auto it = absl::c_find_if(
+      instructions_in_computation,
+      [&](HloInstruction* instr) { return instr->name() == name; });
   return it == instructions_in_computation.end() ? nullptr : *it;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 49ed65910f..8d9b694977 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -367,7 +367,7 @@ class HloComputation {
 
   // Returns the instruction in this computation that has name `name`.  Returns
   // null if there is no such computation.
-  HloInstruction* GetInstructionWithName(tensorflow::StringPiece name);
+  HloInstruction* GetInstructionWithName(absl::string_view name);
 
   int64 unique_id() const { return unique_id_; }
 
@@ -399,6 +399,13 @@ class HloComputation {
   // Internal helper to collect unreachable roots.
   std::vector<HloInstruction*> CollectUnreachableRoots() const;
 
+  // Returns a map from channel-id to directed dependencies of the channel
+  // instructions. For send&recv pairs it means the send instruction and for
+  // cross-replica-sum the union of the dependencies for all participating
+  // instructions.
+  std::map<int64, std::vector<HloInstruction*>> ComputeChannelDependencies()
+      const;
+
   string name_;
   int64 unique_id_;
   HloInstruction* root_instruction_;
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index e4c5470331..f7ed1b0316 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -691,6 +691,27 @@ TEST_F(HloComputationTest, StringificationCanonical) {
   EXPECT_EQ(computation->ToString(options), expected_computation2);
 }
 
-}  // namespace
+TEST_F(HloComputationTest, ChannelReachability) {
+  const Shape shape = ShapeUtil::MakeShape(F32, {5, 7});
+  HloComputation::Builder builder("ChannelReachability");
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto token0 = builder.AddInstruction(HloInstruction::CreateToken());
+  auto send =
+      builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1));
+  auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
+  auto token1 = builder.AddInstruction(HloInstruction::CreateToken());
+  auto recv =
+      builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1));
+  auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
 
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build(recv_done));
+  auto reachability = computation->ComputeReachability();
+  EXPECT_TRUE(reachability->IsReachable(param, recv_done));
+  EXPECT_FALSE(reachability->IsReachable(send, recv));
+  EXPECT_FALSE(reachability->IsReachable(send_done, recv));
+}
+
+}  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 7229031c0c..2ed645c3ae 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@@ -38,7 +39,7 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
   // Limit the constant folding to 0 iterations to skip folding loops. This
   // retains the behavior from before while loop support in HloEvaluator and may
   // be revised.
-  auto evaluator = MakeUnique<HloEvaluator>(/*max_loop_iterations=*/0);
+  auto evaluator = absl::make_unique<HloEvaluator>(/*max_loop_iterations=*/0);
 
   XLA_VLOG_LINES(2,
                  "HloConstantFolding::Run(), before:\n" + module->ToString());
@@ -51,9 +52,7 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
           computation->root_instruction() != instruction) {
         continue;
       }
-      // Skip Constant, Parameter, Reduce, and AfterAll operation.
-      // TODO(b/35975797): Enable Reduce operation once arbitrary computation
-      // are supported by the evaluator.
+      // Skip Constant, Parameter, and AfterAll operation.
       // TODO(b/64407269): Enable Tuple once the timeout issue is resolved.
       // TODO(b/110532604): Enable AfterAll once AfterAll requires at least one
       // operand in which case constant folding will be impossible and this
@@ -61,7 +60,6 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
       if (instruction->opcode() == HloOpcode::kParameter ||
           instruction->opcode() == HloOpcode::kConstant ||
           instruction->opcode() == HloOpcode::kTuple ||
-          instruction->opcode() == HloOpcode::kReduce ||
           instruction->opcode() == HloOpcode::kAfterAll) {
         continue;
       }
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.h b/tensorflow/compiler/xla/service/hlo_constant_folding.h
index 331480bd02..4557983a9c 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.h
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.h
@@ -25,7 +25,7 @@ namespace xla {
 // computation on constants.
 class HloConstantFolding : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "constant_folding"; }
+  absl::string_view name() const override { return "constant_folding"; }
 
   // Run constant folding operations on the given module. Returns whether the
   // module was changed (constant expressions folded).
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 64a42c1efc..7cd1481a8a 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -202,5 +203,45 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
   EXPECT_TRUE(matched);
 }
 
+const char* const kConstantFoldReduce = R"(
+  HloModule ConstantFoldReduce
+
+  add {
+    a = s32[] parameter(0)
+    b = s32[] parameter(1)
+    ROOT add = s32[] add(a, b)
+  }
+
+  ENTRY r {
+    x = s32[3] constant({1, 2, 3})
+    init = s32[] constant(0)
+    ROOT reduce = s32[] reduce(x, init), dimensions={0}, to_apply=add
+  })";
+
+TEST_F(HloConstantFoldingTest, ConstantFoldReduce) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(kConstantFoldReduce));
+  HloConstantFolding const_folder;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  EXPECT_EQ(6, module->entry_computation()
+                   ->root_instruction()
+                   ->literal()
+                   .GetFirstElement<int32>());
+}
+
+TEST_F(HloConstantFoldingTest, ConstantFoldReduceNoLayout) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(kConstantFoldReduce));
+  HloInstruction* add = module->computations().begin()->root_instruction();
+  LayoutUtil::ClearLayout(add->mutable_shape());
+  HloConstantFolding const_folder;
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
+  EXPECT_FALSE(result);
+
+  EXPECT_THAT(module->entry_computation()->root_instruction(), op::Reduce());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 1bbb0ff08e..5add4251ef 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -258,10 +258,6 @@ Status HloCostAnalysis::HandleOutfeed(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleHostCompute(const HloInstruction*) {
-  return Status::OK();
-}
-
 Status HloCostAnalysis::HandleMap(const HloInstruction* map) {
   // Compute properties of the mapped function.
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
@@ -544,15 +540,6 @@ Status HloCostAnalysis::HandleCrossReplicaSum(const HloInstruction* crs) {
 }
 
 Status HloCostAnalysis::HandleAllToAll(const HloInstruction* hlo) {
-  // TODO(b/110096724): Compute correct cost here.
-  double flops = 0.0;
-  ShapeUtil::ForEachSubshape(hlo->shape(),
-                             [&](const Shape& subshape, const ShapeIndex&) {
-                               if (ShapeUtil::IsArray(subshape)) {
-                                 flops += ShapeUtil::ElementsIn(subshape);
-                               }
-                             });
-  current_properties_[kFlopsKey] = flops;
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 193a04bea0..1bf1c4a315 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -74,7 +74,6 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleAllToAll(const HloInstruction* hlo) override;
   Status HandleInfeed(const HloInstruction* infeed) override;
   Status HandleOutfeed(const HloInstruction* outfeed) override;
-  Status HandleHostCompute(const HloInstruction* host_compute) override;
   Status HandleRng(const HloInstruction* random) override;
   Status HandleReverse(const HloInstruction* reverse) override;
   Status HandleSort(const HloInstruction* sort) override;
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index 90d2be118d..0ceb6a2968 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -14,15 +14,17 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
+using absl::StrCat;
 using tensorflow::gtl::ArraySlice;
-using tensorflow::strings::StrCat;
 
 StatusOr<HloInstruction*> MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs,
                                         HloInstruction* rhs) {
@@ -149,13 +151,13 @@ StatusOr<HloInstruction*> MakeConcatHlo(ArraySlice<HloInstruction*> operands,
   CHECK_GT(operands.size(), 0);
 
   HloComputation* computation = operands[0]->parent();
-  CHECK(c_all_of(operands, [&](HloInstruction* instr) {
+  CHECK(absl::c_all_of(operands, [&](HloInstruction* instr) {
     return instr->parent() == computation;
   }));
 
   std::vector<const Shape*> operand_shapes;
-  c_transform(operands, std::back_inserter(operand_shapes),
-              [](HloInstruction* instr) { return &instr->shape(); });
+  absl::c_transform(operands, std::back_inserter(operand_shapes),
+                    [](HloInstruction* instr) { return &instr->shape(); });
 
   TF_ASSIGN_OR_RETURN(Shape concat_shape, ShapeInference::InferConcatOpShape(
                                               operand_shapes, dimension));
@@ -174,6 +176,29 @@ StatusOr<HloInstruction*> MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs,
       HloInstruction::CreateDot(dot_shape, lhs, rhs, dim_numbers));
 }
 
+StatusOr<HloInstruction*> MakeMapHlo(
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloComputation* map_computation) {
+  CHECK(!operands.empty()) << "Map Hlo requires at least one operand.";
+  HloComputation* computation = operands.front()->parent();
+  std::vector<const Shape*> operand_shapes;
+  int64 max_operand_rank = 0;
+  for (const HloInstruction* operand : operands) {
+    CHECK_EQ(computation, operand->parent());
+    operand_shapes.push_back(&operand->shape());
+    max_operand_rank =
+        std::max(max_operand_rank, ShapeUtil::Rank(operand->shape()));
+  }
+  std::vector<int64> map_dims(max_operand_rank);
+  std::iota(map_dims.begin(), map_dims.end(), 0);
+  TF_ASSIGN_OR_RETURN(
+      Shape map_shape,
+      ShapeInference::InferMapShape(
+          operand_shapes, map_computation->ComputeProgramShape(), map_dims));
+  return computation->AddInstruction(
+      HloInstruction::CreateMap(map_shape, operands, map_computation));
+}
+
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   CHECK_GT(n, 0);
 
@@ -205,7 +230,7 @@ StatusOr<HloInstruction*> PrependDegenerateDims(HloInstruction* operand,
   const Shape& operand_shape = operand->shape();
   new_shape_dims.reserve(n + operand_shape.dimensions_size());
   new_shape_dims.insert(new_shape_dims.begin(), n, 1);
-  c_copy(operand_shape.dimensions(), std::back_inserter(new_shape_dims));
+  absl::c_copy(operand_shape.dimensions(), std::back_inserter(new_shape_dims));
   return MakeReshapeHlo(new_shape_dims, operand);
 }
 
@@ -217,7 +242,7 @@ StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
   std::vector<int64> expanded_shape_dim_bounds;
   expanded_shape_dim_bounds.reserve(expanded_dims.size() +
                                     operand->shape().dimensions_size() - 1);
-  c_copy(expanded_dims, std::back_inserter(expanded_shape_dim_bounds));
+  absl::c_copy(expanded_dims, std::back_inserter(expanded_shape_dim_bounds));
   std::copy(operand->shape().dimensions().begin() + 1,
             operand->shape().dimensions().end(),
             std::back_inserter(expanded_shape_dim_bounds));
@@ -228,7 +253,7 @@ StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
 
 StatusOr<HloInstruction*> ElideDegenerateDims(HloInstruction* operand,
                                               ArraySlice<int64> dims_to_elide) {
-  CHECK(c_is_sorted(dims_to_elide));
+  CHECK(absl::c_is_sorted(dims_to_elide));
 
   const Shape& input_shape = operand->shape();
   // First accumulate in reverse
@@ -245,12 +270,44 @@ StatusOr<HloInstruction*> ElideDegenerateDims(HloInstruction* operand,
     }
   }
 
-  c_reverse(new_shape_dim_bounds);
+  absl::c_reverse(new_shape_dim_bounds);
   Shape output_shape =
       ShapeUtil::MakeShape(input_shape.element_type(), new_shape_dim_bounds);
   return MakeReshapeHlo(output_shape, operand);
 }
 
+StatusOr<HloInstruction*> InsertDegenerateDims(
+    HloInstruction* operand, ArraySlice<int64> dims_to_insert) {
+  CHECK(absl::c_is_sorted(dims_to_insert));
+
+  const Shape& operand_shape = operand->shape();
+  int64 output_shape_rank =
+      operand_shape.dimensions_size() + dims_to_insert.size();
+  for (auto dim_to_insert : dims_to_insert) {
+    CHECK_LT(dim_to_insert, output_shape_rank);
+  }
+
+  std::vector<int64> output_shape_dim_bounds;
+  output_shape_dim_bounds.reserve(output_shape_rank);
+  int64 operand_dims_idx = 0;
+  int64 dims_to_insert_idx = 0;
+  for (int64 i = 0; i < output_shape_rank; ++i) {
+    if (dims_to_insert_idx < dims_to_insert.size() &&
+        i == dims_to_insert[dims_to_insert_idx]) {
+      output_shape_dim_bounds.push_back(1);
+      ++dims_to_insert_idx;
+    } else {
+      output_shape_dim_bounds.push_back(
+          operand_shape.dimensions(operand_dims_idx));
+      ++operand_dims_idx;
+    }
+  }
+
+  Shape output_shape = ShapeUtil::MakeShape(operand_shape.element_type(),
+                                            output_shape_dim_bounds);
+  return MakeReshapeHlo(output_shape, operand);
+}
+
 StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
                                              int64 zeros_to_prepend,
                                              int64 zeros_to_append) {
@@ -263,7 +320,7 @@ StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
   *padding_config.add_dimensions() = padding_config_dim;
 
   HloInstruction* zero = computation->AddInstruction(
-      HloInstruction::CreateConstant(MakeUnique<Literal>(
+      HloInstruction::CreateConstant(absl::make_unique<Literal>(
           LiteralUtil::Zero(operand->shape().element_type()))));
   return MakePadHlo(operand, zero, padding_config);
 }
@@ -273,14 +330,14 @@ StatusOr<HloInstruction*> BroadcastZeros(
     ArraySlice<int64> broadcast_dimensions) {
   HloInstruction* zero =
       computation->AddInstruction(HloInstruction::CreateConstant(
-          MakeUnique<Literal>(LiteralUtil::Zero(element_type))));
+          absl::make_unique<Literal>(LiteralUtil::Zero(element_type))));
   return MakeBroadcastHlo(zero, /*broadcast_dimensions=*/{},
                           /*result_shape_bounds=*/broadcast_dimensions);
 }
 
 StatusOr<std::unique_ptr<HloComputation>> CreateComputationWithSignature(
     ArraySlice<const Shape*> domain, const Shape& range,
-    tensorflow::StringPiece name) {
+    absl::string_view name) {
   HloComputation::Builder b{std::string(name)};
   int64 param_idx = 0;
   for (const Shape* param_shape : domain) {
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index 49b1402d68..1bc6d09b45 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -102,6 +102,12 @@ StatusOr<HloInstruction*> MakeConcatHlo(
 StatusOr<HloInstruction*> MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs,
                                      const DotDimensionNumbers& dim_numbers);
 
+// Creates a Map HLO instruction and adds it to the computation containing the
+// operands. All operands must be in the same computation.
+StatusOr<HloInstruction*> MakeMapHlo(
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloComputation* map_computation);
+
 // -----------------------------------------------------------------------------
 // Some other miscellaneous helpers to generate common HLO patterns.  All of
 // these add all the instructions they generate into the computation containing
@@ -144,6 +150,16 @@ StatusOr<HloInstruction*> ExpandFirstDimIntoNDims(
 StatusOr<HloInstruction*> ElideDegenerateDims(
     HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> dims_to_elide);
 
+// Inserts (via reshape) a set of degenerate dimensions (dimensions containing
+// exactly one element), `dims_to_insert` into `operand`. The dimensions in
+// `dims_to_insert` refer to the dimensions in the result, and hence should be
+// less than the rank of the result. Also, `dims_to_insert` must be sorted.
+//
+// For example, if `operand` is of shape f32[12,21,8,34] and dims_to_insert is
+// {0, 2}, then the result is `operand` reshaped to [1,12,1,21,8,34].
+StatusOr<HloInstruction*> InsertDegenerateDims(
+    HloInstruction* operand, tensorflow::gtl::ArraySlice<int64> dims_to_insert);
+
 // Pads `operand` (which must have rank 1) with `zeros_to_prepend` zeros in the
 // front and `zeros_to_append` zeros in the back.
 StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
@@ -161,7 +177,7 @@ StatusOr<HloInstruction*> BroadcastZeros(
 // a value of type `range`.
 StatusOr<std::unique_ptr<HloComputation>> CreateComputationWithSignature(
     tensorflow::gtl::ArraySlice<const Shape*> domain, const Shape& range,
-    tensorflow::StringPiece name);
+    absl::string_view name);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
index 60d3e71757..a8de285d16 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -28,7 +28,7 @@ using tensorflow::gtl::ArraySlice;
 
 class HloCreationUtilsTest : public HloTestBase {
  protected:
-  static std::unique_ptr<HloModule> CreateModuleWithProgramShape(
+  std::unique_ptr<HloModule> CreateModuleWithProgramShape(
       PrimitiveType primitive_type, ArraySlice<int64> input_shape_dims,
       ArraySlice<int64> output_shape_dims, HloInstruction** param,
       HloComputation** entry_computation) {
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index 06484f4012..cb367adf5e 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/hash/hash.h"
 
 namespace xla {
 
@@ -103,6 +104,9 @@ int64 CseHash(const HloInstruction* instruction) {
   for (auto operand : instruction->operands()) {
     hash = tensorflow::Hash64Combine(hash, operand->unique_id());
   }
+  if (instruction->opcode() == HloOpcode::kConstant) {
+    hash = tensorflow::Hash64Combine(hash, instruction->literal().Hash());
+  }
   return hash;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cse.h b/tensorflow/compiler/xla/service/hlo_cse.h
index 5e2b348bdd..a28c03599a 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.h
+++ b/tensorflow/compiler/xla/service/hlo_cse.h
@@ -34,7 +34,7 @@ class HloCSE : public HloPassInterface {
       : is_layout_sensitive_(is_layout_sensitive),
         only_fusion_computations_(only_fusion_computations) {}
   ~HloCSE() override = default;
-  tensorflow::StringPiece name() const override { return "cse"; }
+  absl::string_view name() const override { return "cse"; }
 
   // Run CSE on the given module. Returns whether the module was changed (common
   // subexpressions were found and eliminated).
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 90fbaa37c5..406d712ec6 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -20,9 +20,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index bbfb0c253f..1d35757b42 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -19,8 +19,10 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -29,8 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -78,8 +78,8 @@ bool MultiDynamicSliceUseShareSameIndices(
 
 }  // namespace
 
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 HloDataflowAnalysis::HloDataflowAnalysis(
     const HloModule& module, bool ssa_form, bool bitcast_defines_value,
@@ -93,7 +93,7 @@ HloDataflowAnalysis::HloDataflowAnalysis(
 bool HloDataflowAnalysis::AreTransitiveUsesElementwiseOrTuple(
     const HloInstruction* inst) {
   tensorflow::gtl::FlatSet<const HloInstruction*> visited;
-  tensorflow::gtl::InlinedVector<const HloInstruction*, 4> stack;
+  absl::InlinedVector<const HloInstruction*, 4> stack;
   stack.push_back(inst);
   while (!stack.empty()) {
     const HloInstruction* current = stack.back();
@@ -886,7 +886,7 @@ StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
   VLOG(1) << "HloDataflowAnalysis::Run on module " << module.name();
   XLA_VLOG_LINES(2, module.ToString());
 
-  auto dataflow_analysis = WrapUnique(new HloDataflowAnalysis(
+  auto dataflow_analysis = absl::WrapUnique(new HloDataflowAnalysis(
       module, ssa_form, bitcast_defines_value, fusion_can_share_buffer));
 
   TF_RETURN_IF_ERROR(dataflow_analysis->InitializeInstructionValueSets());
@@ -976,28 +976,22 @@ Status HloDataflowAnalysis::Verify() const {
 bool HloDataflowAnalysis::DoesNotUseOperandBuffer(
     const HloInstruction* operand, const ShapeIndex& index,
     const HloInstruction* user) const {
-  CHECK(user->IsUserOf(operand))
-      << "user: " << user->ToString() << " operand: " << operand->ToString();
-  if (user->opcode() == HloOpcode::kFusion &&
-      user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
-    // Find fusion parameter associated with 'operand'.
-    HloInstruction* fusion_param =
-        user->fused_parameter(user->operand_index(operand));
-    // Iterate through all users of all uses of the fusion parameter value.
-    // Return false if any uses are detected, returns true otherwise.
-    const HloValue& value = GetValueDefinedAt(fusion_param, index);
-    return value.uses().empty();
-  } else {
-    // Return false if no value at 'operand' and 'index' is used at 'user'.
-    for (const HloValue* value : GetValueSet(operand, index).values()) {
-      for (const HloUse& use : value->uses()) {
-        if (use.instruction == user) {
-          return false;
+  // Return false if no value at 'operand' and 'index' is used at 'user'.
+  for (const HloValue* value : GetValueSet(operand, index).values()) {
+    for (const HloUse& use : value->uses()) {
+      if (use.instruction == user) {
+        if (user->opcode() == HloOpcode::kFusion &&
+            user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
+          HloInstruction* fusion_param =
+              user->fused_parameter(use.operand_number);
+          const HloValue& value =
+              GetValueDefinedAt(fusion_param, use.operand_index);
+          return value.uses().empty();
         }
+        return false;
       }
     }
   }
-
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index f4abc7a7c7..a1678d4943 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -138,7 +138,8 @@ class HloDataflowAnalysis {
   // Returns true if 'user' cannot possibly use the buffer at 'index' in
   // 'operand'. Returns false otherwise.
   //
-  // REQUIRES: 'operand' is an operand of 'user'.
+  // 'operand' does not have to be an operand of 'user'. This can be the case
+  // with indirect uses.
   bool DoesNotUseOperandBuffer(const HloInstruction* operand,
                                const ShapeIndex& index,
                                const HloInstruction* user) const;
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 4755c4a0cf..d1a96c10f8 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -1963,6 +1963,54 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
   EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {1}, fusion));
 }
 
+// Similar to FusedDynamicUpdateSlice above, but tests indirect uses of the
+// parameter tuple.
+TEST_F(DoesNotUseOperandBufferTest, IndirectUses) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  auto tuple_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple"));
+  auto t0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple_param, 0));
+  auto t1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple_param, 1));
+  // Swap the tuple elements.
+  auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({t1, t0}));
+
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 1));
+
+  // Create a DynamicUpdateSlice instruction of tuple element 1.
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+  auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
+  auto dynamic_update_slice =
+      builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+          data_shape, gte1, update, starts));
+  builder.AddInstruction(
+      HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {dynamic_update_slice, starts, update, gte1},
+      HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  // The fusion instruction never uses tuple element 0, but does use element 1.
+  EXPECT_TRUE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {0}, fusion));
+  EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {1}, fusion));
+  // The same holds for the parameter tuple, except that the tuple elements are
+  // swapped in 'tuple'.
+  EXPECT_TRUE(
+      dataflow_analysis_->DoesNotUseOperandBuffer(tuple_param, {1}, fusion));
+  EXPECT_FALSE(
+      dataflow_analysis_->DoesNotUseOperandBuffer(tuple_param, {0}, fusion));
+}
+
 class CanShareOperandBufferWithUserTest : public HloDataflowAnalysisTestBase {};
 
 TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) {
diff --git a/tensorflow/compiler/xla/service/hlo_dce.h b/tensorflow/compiler/xla/service/hlo_dce.h
index 4e244494d6..1fe69b1395 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.h
+++ b/tensorflow/compiler/xla/service/hlo_dce.h
@@ -36,7 +36,7 @@ namespace xla {
 class HloDCE : public HloPassInterface {
  public:
   ~HloDCE() override {}
-  tensorflow::StringPiece name() const override { return "dce"; }
+  absl::string_view name() const override { return "dce"; }
 
   // Run the pass on the given module. Returns whether the module was changed
   // (instructions were removed).
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index 26e3736e01..3b5cde2996 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
index 78955db0da..72185698c9 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
@@ -31,31 +31,10 @@ class HloDomainIsolator::RunContext {
   StatusOr<bool> Run();
 
  private:
-  // Inserts a kDomain instruction between parent and operand, in case
-  // the attribute (ie, sharding) values change between instruction and operand.
-  // Returns the newly inserted kDomain instruction, or nullptr if no kDomain
-  // instruction was necessary.
-  StatusOr<HloInstruction*> CreateDomain(HloInstruction* instruction,
-                                         HloInstruction* parent,
-                                         HloInstruction* operand);
-
   HloModule* module_;
   HloDomainIsolator* isolator_;
 };
 
-StatusOr<HloInstruction*> HloDomainIsolator::RunContext::CreateDomain(
-    HloInstruction* instruction, HloInstruction* parent,
-    HloInstruction* operand) {
-  HloInstruction* domain = nullptr;
-  std::unique_ptr<HloInstruction> domain_instruction =
-      isolator_->creator_(instruction, operand);
-  if (domain_instruction != nullptr) {
-    domain = operand->parent()->AddInstruction(std::move(domain_instruction));
-    TF_RETURN_IF_ERROR(operand->ReplaceUseWith(parent, domain));
-  }
-  return domain;
-}
-
 StatusOr<bool> HloDomainIsolator::RunContext::Run() {
   hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Isolator");
 
@@ -71,16 +50,16 @@ StatusOr<bool> HloDomainIsolator::RunContext::Run() {
         // When applying multiple domains, we could end up stacking more than
         // one in one edge, so here we want to build the effective
         // (kDomain-less) instruction->operand edge.
-        HloInstruction* parent = instruction;
-        while (operand->opcode() == HloOpcode::kDomain) {
-          parent = operand;
-          operand = operand->mutable_operand(0);
+        HloInstruction* root = operand;
+        while (root->opcode() == HloOpcode::kDomain) {
+          root = root->mutable_operand(0);
         }
         // Check whether a kDomain is necessary between instruction and operand.
-        TF_ASSIGN_OR_RETURN(HloInstruction * domain,
-                            CreateDomain(instruction, parent, operand));
+        HloInstruction* domain =
+            isolator_->creator_(instruction, root, operand);
         if (domain != nullptr) {
           VLOG(4) << "New domain: " << domain->ToString();
+          TF_RETURN_IF_ERROR(operand->ReplaceUseWith(instruction, domain));
           ++added_domains;
         }
       }
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.h b/tensorflow/compiler/xla/service/hlo_domain_isolator.h
index eded3e78ee..d36631fc2f 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_isolator.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.h
@@ -34,14 +34,16 @@ class HloDomainIsolator : public HloPassInterface {
  public:
   // Creates a new kDomain instruction for the edge between the use instruction
   // (the first HloInstruction argument), and the operand instruction (the
-  // second HloInstruction argument).
+  // third HloInstruction argument) if the interesting attribute of the
+  // instruction differes from the attribute of the root (the second
+  // HloInstruction argument).
   // Returns nullptr in case no domain separation is necessary.
-  using DomainCreator = std::function<std::unique_ptr<HloInstruction>(
-      HloInstruction*, HloInstruction*)>;
+  using DomainCreator = std::function<HloInstruction*(
+      HloInstruction*, HloInstruction*, HloInstruction*)>;
 
   explicit HloDomainIsolator(DomainCreator creator);
 
-  tensorflow::StringPiece name() const override { return "domain_isolator"; }
+  absl::string_view name() const override { return "domain_isolator"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index 9e096320db..edf0073f30 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -25,14 +26,14 @@ namespace xla {
 
 /* static */ StatusOr<std::unique_ptr<HloDomainMap>> HloDomainMap::Create(
     HloComputation* computation, string domain_kind) {
-  auto domain_map = WrapUnique(new HloDomainMap(std::move(domain_kind)));
+  auto domain_map = absl::WrapUnique(new HloDomainMap(std::move(domain_kind)));
   TF_RETURN_IF_ERROR(domain_map->Populate(computation));
   return std::move(domain_map);
 }
 
 /* static */ StatusOr<std::unique_ptr<HloDomainMap>> HloDomainMap::Create(
     HloModule* module, string domain_kind) {
-  auto domain_map = WrapUnique(new HloDomainMap(std::move(domain_kind)));
+  auto domain_map = absl::WrapUnique(new HloDomainMap(std::move(domain_kind)));
   for (HloComputation* computation : module->computations()) {
     TF_RETURN_IF_ERROR(domain_map->Populate(computation));
   }
@@ -56,14 +57,14 @@ Status HloDomainMap::TryProcessEmptyDomain(HloInstruction* instruction) {
   // both sides.
   for (HloInstruction* operand : instruction->unique_operands()) {
     if (IsDomainInstruction(operand)) {
-      auto domain = MakeUnique<DomainMetadata::Domain>();
+      auto domain = absl::make_unique<DomainMetadata::Domain>();
       domain->enter_domains.insert(operand);
       domain->exit_domains.insert(instruction);
       TF_RETURN_IF_ERROR(InsertDomain(std::move(domain)));
     }
   }
   if (instruction == instruction->parent()->root_instruction()) {
-    auto domain = MakeUnique<DomainMetadata::Domain>();
+    auto domain = absl::make_unique<DomainMetadata::Domain>();
     domain->enter_domains.insert(instruction);
     TF_RETURN_IF_ERROR(InsertDomain(std::move(domain)));
   }
@@ -143,7 +144,7 @@ Status HloDomainMap::ExpandDomain(HloInstruction* instruction,
 
 StatusOr<std::unique_ptr<DomainMetadata::Domain>> HloDomainMap::CreateDomain(
     HloInstruction* instruction) const {
-  auto domain = MakeUnique<DomainMetadata::Domain>();
+  auto domain = absl::make_unique<DomainMetadata::Domain>();
   TF_RETURN_IF_ERROR(ExpandDomain(instruction, domain.get()));
   domain->instructions = MakeNonDomainInstructions(domain->reach_set);
   return std::move(domain);
diff --git a/tensorflow/compiler/xla/service/hlo_domain_metadata.h b/tensorflow/compiler/xla/service/hlo_domain_metadata.h
index f855f2a1fc..575149c8b8 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_metadata.h
@@ -20,10 +20,10 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace xla {
@@ -63,7 +63,7 @@ class DomainMetadata {
 
   // Returns the metadata type. A unique identifier which describes the real
   // metadata type.
-  virtual tensorflow::StringPiece Kind() const = 0;
+  virtual absl::string_view Kind() const = 0;
 
   // Compares the metadata object with another one and returns true if the
   // two matches.
diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.h b/tensorflow/compiler/xla/service/hlo_domain_remover.h
index c859e05f02..97bc8ef604 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_remover.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_remover.h
@@ -35,13 +35,13 @@ class HloDomainRemover : public HloPassInterface {
   // instructions in it with the same attributes (ie, sharding), a normalizer
   // function is tasked at applying attribute normalization on the instructions
   // within such domain.
-  HloDomainRemover(tensorflow::StringPiece kind,
+  HloDomainRemover(absl::string_view kind,
                    std::function<Status(const DomainMetadata::Domain&,
                                         const DomainMetadata* metadata)>
                        normalizer)
-      : kind_(kind.ToString()), normalizer_(std::move(normalizer)) {}
+      : kind_(kind), normalizer_(std::move(normalizer)) {}
 
-  tensorflow::StringPiece name() const override { return "domain_remover"; }
+  absl::string_view name() const override { return "domain_remover"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc
index 70271be304..79e78ee2d0 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_isolator.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
@@ -28,6 +29,11 @@ namespace xla {
 namespace {
 
 class HloDomainTest : public HloVerifiedTestBase {
+ public:
+  HloDomainTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+
  protected:
   bool FindUserViaDomainPath(HloInstruction* instruction,
                              HloInstruction* operand) const {
@@ -45,9 +51,8 @@ class HloDomainTest : public HloVerifiedTestBase {
 
   // Checks whether there is a kDomain instruction in the edge between the
   // instruction and the operand.
-  bool HasDomainEdge(HloModule* module,
-                     tensorflow::StringPiece instruction_name,
-                     tensorflow::StringPiece operand_name) {
+  bool HasDomainEdge(HloModule* module, absl::string_view instruction_name,
+                     absl::string_view operand_name) {
     HloInstruction* instruction = FindInstruction(module, instruction_name);
     HloInstruction* operand = FindInstruction(module, operand_name);
     CHECK_NE(instruction, nullptr);
@@ -65,7 +70,7 @@ class HloDomainTest : public HloVerifiedTestBase {
     return false;
   }
 
-  StatusOr<HloModule*> ParseModule(tensorflow::StringPiece hlo_string) {
+  StatusOr<HloModule*> ParseModule(absl::string_view hlo_string) {
     HloModuleConfig config;
     config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
     ParseAndVerifyModule(hlo_string, config);
@@ -80,10 +85,10 @@ class OpNameMetadata : public DomainMetadata {
   explicit OpNameMetadata(string opname) : opname_(std::move(opname)) {}
 
   std::unique_ptr<DomainMetadata> Clone() const override {
-    return MakeUnique<OpNameMetadata>(opname_);
+    return absl::make_unique<OpNameMetadata>(opname_);
   }
 
-  tensorflow::StringPiece Kind() const override { return KindName(); }
+  absl::string_view Kind() const override { return KindName(); }
 
   bool Matches(const DomainMetadata& other) const override {
     const OpNameMetadata* other_ptr =
@@ -97,25 +102,26 @@ class OpNameMetadata : public DomainMetadata {
 
   string ToString() const override { return opname_; }
 
-  static tensorflow::StringPiece KindName() { return "opname"; }
+  static absl::string_view KindName() { return "opname"; }
 
  private:
   string opname_;
 };
 
 // Creator function for OpNameMetadata domains.
-std::unique_ptr<HloInstruction> OpNameDomainCreator(HloInstruction* instruction,
-                                                    HloInstruction* operand) {
-  if (instruction->metadata().op_name() == operand->metadata().op_name()) {
+HloInstruction* OpNameDomainCreator(HloInstruction* instruction,
+                                    HloInstruction* root,
+                                    HloInstruction* operand) {
+  if (instruction->metadata().op_name() == root->metadata().op_name()) {
     return nullptr;
   }
   std::unique_ptr<DomainMetadata> operand_side_metadata =
-      MakeUnique<OpNameMetadata>(operand->metadata().op_name());
+      absl::make_unique<OpNameMetadata>(root->metadata().op_name());
   std::unique_ptr<DomainMetadata> user_side_metadata =
-      MakeUnique<OpNameMetadata>(instruction->metadata().op_name());
-  return HloInstruction::CreateDomain(operand->shape(), operand,
-                                      std::move(operand_side_metadata),
-                                      std::move(user_side_metadata));
+      absl::make_unique<OpNameMetadata>(instruction->metadata().op_name());
+  return operand->parent()->AddInstruction(HloInstruction::CreateDomain(
+      operand->shape(), operand, std::move(operand_side_metadata),
+      std::move(user_side_metadata)));
 }
 
 Status OpNameDomainNormalizer(const DomainMetadata::Domain& domain,
@@ -142,7 +148,7 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(CreateShardingDomain);
+  HloDomainIsolator isolator(ShardingDomainCreator{});
   TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
   EXPECT_TRUE(isolator_changed);
 
@@ -184,7 +190,7 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(CreateShardingDomain);
+  HloDomainIsolator isolator(ShardingDomainCreator{});
   TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
   EXPECT_TRUE(!isolator_changed);
 }
@@ -211,7 +217,7 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(CreateShardingDomain);
+  HloDomainIsolator isolator(ShardingDomainCreator{});
   TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
   EXPECT_TRUE(isolator_changed);
 
@@ -248,7 +254,7 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(CreateShardingDomain);
+  HloDomainIsolator isolator(ShardingDomainCreator{});
   TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
   EXPECT_FALSE(isolator_changed);
 }
@@ -302,7 +308,7 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator sharding_isolator(CreateShardingDomain);
+  HloDomainIsolator sharding_isolator(ShardingDomainCreator{});
   TF_ASSERT_OK_AND_ASSIGN(bool sharding_isolator_changed,
                           sharding_isolator.Run(module));
   EXPECT_TRUE(sharding_isolator_changed);
@@ -356,7 +362,7 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(CreateShardingDomain);
+  HloDomainIsolator isolator(ShardingDomainCreator{});
   TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
   EXPECT_TRUE(isolator_changed);
 
@@ -445,7 +451,7 @@ ENTRY entry {
 
   TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
 
-  HloDomainIsolator isolator(CreateShardingDomain);
+  HloDomainIsolator isolator(ShardingDomainCreator{});
   TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
   EXPECT_TRUE(isolator_changed);
 
@@ -474,8 +480,8 @@ ENTRY entry {
 TEST_F(HloDomainTest, DumpParseNullSharding) {
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {});
-  auto sharding_md_0 = MakeUnique<ShardingMetadata>(nullptr);
-  auto sharding_md_1 = MakeUnique<ShardingMetadata>(nullptr);
+  auto sharding_md_0 = absl::make_unique<ShardingMetadata>(nullptr);
+  auto sharding_md_1 = absl::make_unique<ShardingMetadata>(nullptr);
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p"));
   HloInstruction* domain = builder.AddInstruction(HloInstruction::CreateDomain(
@@ -504,7 +510,7 @@ ENTRY entry {
 
   TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
 
-  HloDomainIsolator isolator(CreateShardingDomain);
+  HloDomainIsolator isolator(ShardingDomainCreator{});
   TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
   EXPECT_TRUE(isolator_changed);
 
@@ -523,5 +529,64 @@ ENTRY entry {
             tpl->sharding());
 }
 
+TEST_F(HloDomainTest, MultiDomainMultiUser) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+ENTRY %entry (p0: (f32[4], f32[4])) -> (f32[4], f32[4], f32[4]) {
+  %p0 = (f32[4], f32[4]) parameter(0)
+  %a = f32[4]{0} get-tuple-element(%p0), index=0
+  %domain = f32[4] domain(%a),
+    domain={kind="sharding", entry={maximal device=1}, exit={maximal device=0}}
+  %b = f32[4] get-tuple-element(%p0), index=1
+  %domain.1 = f32[4] domain(%b),
+    domain={kind="sharding", entry={maximal device=1}, exit={maximal device=0}}
+  %c = f32[4] add(%domain, %domain.1), sharding={maximal device=1}
+  %domain.2 = f32[4] domain(%c),
+    domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}}
+  %d = f32[4] subtract(%domain, %c),
+    sharding={maximal device=1}, metadata={op_name="D"}
+  %domain.3 = f32[4] domain(%d),
+    domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}}
+  %e = f32[4] multiply(%c, %d),
+    sharding={maximal device=1}, metadata={op_name="D"}
+  %f = f32[4] add(f32[4]{0} %e, f32[4]{0} %c), sharding={maximal device=1}
+  %domain.4 = f32[4]{0} domain(%f),
+    domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}}
+  ROOT %g = (f32[4], f32[4], f32[4]) tuple(%domain.2, %domain.3, %domain.4)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  LOG(INFO) << "Original module:\n" << module->ToString();
+
+  HloDomainIsolator opname_isolator(OpNameDomainCreator);
+  TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed,
+                          opname_isolator.Run(module));
+  EXPECT_TRUE(opname_isolator_changed);
+
+  EXPECT_TRUE(HasDomainEdge(module, "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module, "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module, "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module, "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+
+  HloDomainRemover sharding_remover(ShardingMetadata::KindName(),
+                                    ShardingMetadata::NormalizeShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed,
+                          sharding_remover.Run(module));
+  EXPECT_TRUE(sharding_remover_changed);
+
+  HloDomainRemover opname_remover(OpNameMetadata::KindName(),
+                                  OpNameDomainNormalizer);
+  TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed,
+                          opname_remover.Run(module));
+  EXPECT_TRUE(opname_remover_changed);
+
+  EXPECT_FALSE(HasDomainEdge(module, "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module, "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module, "d", "c"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_domain_verifier.cc b/tensorflow/compiler/xla/service/hlo_domain_verifier.cc
index 751fc677e2..dc514ae3e5 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_verifier.cc
@@ -52,7 +52,7 @@ Status HloDomainVerifier::RunContext::PopulateDomainKinds() {
           TF_RET_CHECK(instruction->user_side_metadata().Kind() ==
                        instruction->operand_side_metadata().Kind())
               << instruction->ToString();
-          kinds.insert(instruction->user_side_metadata().Kind().ToString());
+          kinds.insert(string(instruction->user_side_metadata().Kind()));
         }
       }
     }
diff --git a/tensorflow/compiler/xla/service/hlo_domain_verifier.h b/tensorflow/compiler/xla/service/hlo_domain_verifier.h
index 8e53cf97f8..81d6d69a8c 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_verifier.h
@@ -33,7 +33,7 @@ class HloDomainVerifier : public HloPassInterface {
  public:
   HloDomainVerifier(std::vector<string> kinds) : kinds_(std::move(kinds)) {}
 
-  tensorflow::StringPiece name() const override { return "domain_verifier"; }
+  absl::string_view name() const override { return "domain_verifier"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.h b/tensorflow/compiler/xla/service/hlo_element_type_converter.h
index 2b109225d0..44ded2c2fa 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.h
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.h
@@ -32,9 +32,7 @@ class HloElementTypeConverter : public HloPassInterface {
   HloElementTypeConverter(PrimitiveType eliminate_type,
                           PrimitiveType replace_with_type);
 
-  tensorflow::StringPiece name() const override {
-    return "element_type_converter";
-  }
+  absl::string_view name() const override { return "element_type_converter"; }
 
   // Returns the pass on the module and returns whether the module was modified.
   StatusOr<bool> Run(HloModule* module) override;
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 51353eea6e..ca1c4dd0e9 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -23,13 +23,15 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -43,7 +45,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -95,7 +96,7 @@ StatusOr<std::unique_ptr<Literal>> Compare(const Shape& shape, HloOpcode opcode,
                  << HloOpcodeString(opcode);
   }
 
-  auto result = MakeUnique<Literal>(shape);
+  auto result = absl::make_unique<Literal>(shape);
   TF_RETURN_IF_ERROR(result->Populate<bool>([&](ArraySlice<int64> multi_index) {
     return compare_op(lhs_literal.Get<OperandT>(multi_index),
                       rhs_literal.Get<OperandT>(multi_index));
@@ -125,7 +126,7 @@ StatusOr<std::unique_ptr<Literal>> Compare<complex64>(
                  << HloOpcodeString(opcode);
   }
 
-  auto result = MakeUnique<Literal>(shape);
+  auto result = absl::make_unique<Literal>(shape);
   TF_RETURN_IF_ERROR(result->Populate<bool>([&](ArraySlice<int64> multi_index) {
     return compare_op(lhs_literal.Get<complex64>(multi_index),
                       rhs_literal.Get<complex64>(multi_index));
@@ -138,44 +139,57 @@ StatusOr<std::unique_ptr<Literal>> Compare<complex64>(
 
 HloEvaluator::HloEvaluator(int64 max_loop_iterations)
     : max_loop_iterations_(max_loop_iterations) {
-  typed_visitors_[PRED] = MakeUnique<HloEvaluatorTypedVisitor<bool>>(this);
-  typed_visitors_[U8] = MakeUnique<HloEvaluatorTypedVisitor<uint8>>(this);
-  typed_visitors_[U16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented(
-        "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
-        "U16.");
-  });
-  typed_visitors_[U32] = MakeUnique<HloEvaluatorTypedVisitor<uint32>>(this);
-  typed_visitors_[U64] = MakeUnique<HloEvaluatorTypedVisitor<uint64>>(this);
-  typed_visitors_[S8] = MakeUnique<HloEvaluatorTypedVisitor<int8>>(this);
-  typed_visitors_[S16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented(
-        "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
-        "S16.");
-  });
-  typed_visitors_[S32] = MakeUnique<HloEvaluatorTypedVisitor<int32>>(this);
-  typed_visitors_[S64] = MakeUnique<HloEvaluatorTypedVisitor<int64>>(this);
+  typed_visitors_[PRED] =
+      absl::make_unique<HloEvaluatorTypedVisitor<bool>>(this);
+  typed_visitors_[U8] =
+      absl::make_unique<HloEvaluatorTypedVisitor<uint8>>(this);
+  typed_visitors_[U16] =
+      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
+        return Unimplemented(
+            "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
+            "U16.");
+      });
+  typed_visitors_[U32] =
+      absl::make_unique<HloEvaluatorTypedVisitor<uint32>>(this);
+  typed_visitors_[U64] =
+      absl::make_unique<HloEvaluatorTypedVisitor<uint64>>(this);
+  typed_visitors_[S8] = absl::make_unique<HloEvaluatorTypedVisitor<int8>>(this);
+  typed_visitors_[S16] =
+      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
+        return Unimplemented(
+            "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
+            "S16.");
+      });
+  typed_visitors_[S32] =
+      absl::make_unique<HloEvaluatorTypedVisitor<int32>>(this);
+  typed_visitors_[S64] =
+      absl::make_unique<HloEvaluatorTypedVisitor<int64>>(this);
   typed_visitors_[F16] =
-      MakeUnique<HloEvaluatorTypedVisitor<Eigen::half, float>>(this);
-  typed_visitors_[F32] = MakeUnique<HloEvaluatorTypedVisitor<float>>(this);
-  typed_visitors_[F64] = MakeUnique<HloEvaluatorTypedVisitor<double>>(this);
-  typed_visitors_[C64] = MakeUnique<HloEvaluatorTypedVisitor<complex64>>(this);
+      absl::make_unique<HloEvaluatorTypedVisitor<Eigen::half, float>>(this);
+  typed_visitors_[F32] =
+      absl::make_unique<HloEvaluatorTypedVisitor<float>>(this);
+  typed_visitors_[F64] =
+      absl::make_unique<HloEvaluatorTypedVisitor<double>>(this);
+  typed_visitors_[C64] =
+      absl::make_unique<HloEvaluatorTypedVisitor<complex64>>(this);
 
   // Most of the evaluator computations we use don't support BF16 (e.g.,
   // std::ceil, std::tanh). To make evaluator work with BF16, we set all
   // elementwise computations to be done in F32 and do BF16<->F32 conversion
   // around the input and the output of the computations.
   typed_visitors_[BF16] =
-      MakeUnique<HloEvaluatorTypedVisitor<bfloat16, float>>(this);
-
-  typed_visitors_[TUPLE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented(
-        "HloEvaluatorTypedVisitor: unhandled primitive type: TUPLE.");
-  });
-  typed_visitors_[OPAQUE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented(
-        "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE.");
-  });
+      absl::make_unique<HloEvaluatorTypedVisitor<bfloat16, float>>(this);
+
+  typed_visitors_[TUPLE] =
+      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
+        return Unimplemented(
+            "HloEvaluatorTypedVisitor: unhandled primitive type: TUPLE.");
+      });
+  typed_visitors_[OPAQUE] =
+      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
+        return Unimplemented(
+            "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE.");
+      });
 }
 
 template <typename LiteralPtr>
@@ -216,7 +230,6 @@ template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     HloInstruction* instruction, ArraySlice<LiteralPtr> arg_literals) {
   TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape()));
 
   evaluated_.clear();
   arg_literals_.clear();
@@ -253,7 +266,6 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     return tensorflow::errors::FailedPrecondition(
         "Not all operands are constants.");
   }
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape()));
 
   arg_literals_.clear();
   evaluated_.clear();
@@ -555,43 +567,41 @@ Status HloEvaluator::HandleTuple(HloInstruction* tuple) {
   return Status::OK();
 }
 
-// Returns an ShapeUtil::IndexIterationSpace that iterates over the output
-// gather dimensions while keeping the rest of the output dimensions clamped to
-// 0.
-ShapeUtil::IndexIterationSpace IterationSpaceForOutputGatherIndices(
+// Returns an ShapeUtil::IndexIterationSpace that iterates over the output batch
+// dimensions while keeping the rest of the output dimensions clamped to 0.
+ShapeUtil::IndexIterationSpace IterationSpaceForOutputBatchIndices(
     const Shape& output_shape, const GatherDimensionNumbers& dim_numbers) {
   int64 output_rank = output_shape.dimensions_size();
   std::vector<int64> index_base(output_rank, 0);
   std::vector<int64> index_count;
   index_count.reserve(output_rank);
   for (int64 i = 0; i < output_rank; i++) {
-    bool is_output_gather_dim =
-        !c_binary_search(dim_numbers.output_window_dims(), i);
-    index_count.push_back(is_output_gather_dim ? output_shape.dimensions(i)
-                                               : 1);
+    bool is_output_batch_dim =
+        !absl::c_binary_search(dim_numbers.offset_dims(), i);
+    index_count.push_back(is_output_batch_dim ? output_shape.dimensions(i) : 1);
   }
 
   return {std::move(index_base), std::move(index_count),
           std::vector<int64>(output_rank, 1)};
 }
 
-// Return an ShapeUtil::IndexIterationSpace that iterates over the output window
+// Return an ShapeUtil::IndexIterationSpace that iterates over the output slice
 // dimensions while keeping the rest of the output dimensions clamped to 0.
-ShapeUtil::IndexIterationSpace IterationSpaceForOutputWindowIndices(
-    int64 output_rank, ArraySlice<int64> window_bounds,
+ShapeUtil::IndexIterationSpace IterationSpaceForOutputOffsetIndices(
+    int64 output_rank, ArraySlice<int64> slice_sizes,
     const GatherDimensionNumbers& dim_numbers) {
   std::vector<int64> index_base(output_rank, 0);
   std::vector<int64> index_count(output_rank, 1);
-  int64 window_bounds_idx = 0;
+  int64 slice_sizes_idx = 0;
   for (int64 i = 0; i < output_rank; i++) {
     bool is_output_window_dim =
-        c_binary_search(dim_numbers.output_window_dims(), i);
+        absl::c_binary_search(dim_numbers.offset_dims(), i);
     if (is_output_window_dim) {
-      while (c_binary_search(dim_numbers.elided_window_dims(),
-                             window_bounds_idx)) {
-        window_bounds_idx++;
+      while (absl::c_binary_search(dim_numbers.collapsed_slice_dims(),
+                                   slice_sizes_idx)) {
+        slice_sizes_idx++;
       }
-      index_count[i] = window_bounds[window_bounds_idx++];
+      index_count[i] = slice_sizes[slice_sizes_idx++];
     }
   }
 
@@ -599,30 +609,30 @@ ShapeUtil::IndexIterationSpace IterationSpaceForOutputWindowIndices(
           std::vector<int64>(output_rank, 1)};
 }
 
-// This functor computes the contribution of gather_indices to an input index
+// This functor computes the contribution of start_indices to an input index
 // corresponding to an output index.  That is, given an output index I, it picks
-// out the gather output indices in I and uses them to look up a gather index,
-// G, from the gather indices tensor, and expands G into the input space
-// according to gather_dims_to_operand_dims.
-class OutputGatherIndexToInputIndex {
+// out the batch indices in I and uses them to look up a starting index, G, from
+// the start indices tensor, and expands G into the input space according to
+// start_index_map.
+class OutputBatchIndexToInputIndex {
  public:
   // The constructor does some setup work that is amortized across all
   // iterations.
-  explicit OutputGatherIndexToInputIndex(
+  explicit OutputBatchIndexToInputIndex(
       const GatherDimensionNumbers* dim_numbers, const Shape& input_shape,
-      const Shape& output_shape, const Literal* gather_indices)
-      : dim_numbers_(*dim_numbers), gather_indices_(*gather_indices) {
+      const Shape& output_shape, const Literal* start_indices)
+      : dim_numbers_(*dim_numbers), start_indices_(*start_indices) {
     for (int64 i = 0; i < output_shape.dimensions_size(); i++) {
-      output_dim_is_gather_dims_.push_back(
-          !c_binary_search(dim_numbers_.output_window_dims(), i));
+      output_dim_is_batch_dims_.push_back(
+          !absl::c_binary_search(dim_numbers_.offset_dims(), i));
     }
 
     for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
       int64 index_of_input_dim_in_index_vector =
-          std::distance(dim_numbers_.gather_dims_to_operand_dims().begin(),
-                        c_find(dim_numbers_.gather_dims_to_operand_dims(), i));
+          std::distance(dim_numbers_.start_index_map().begin(),
+                        absl::c_find(dim_numbers_.start_index_map(), i));
       if (index_of_input_dim_in_index_vector ==
-          dim_numbers_.gather_dims_to_operand_dims_size()) {
+          dim_numbers_.start_index_map_size()) {
         input_dim_value_to_index_vector_.push_back(-1);
       } else {
         input_dim_value_to_index_vector_.push_back(
@@ -630,14 +640,14 @@ class OutputGatherIndexToInputIndex {
       }
     }
 
-    index_vector_index_.resize(gather_indices_.shape().dimensions_size());
+    index_vector_index_.resize(start_indices_.shape().dimensions_size());
     input_index_.resize(input_shape.dimensions_size());
     int64 index_vector_size =
-        gather_indices_.shape().dimensions(dim_numbers_.index_vector_dim());
+        start_indices_.shape().dimensions(dim_numbers_.index_vector_dim());
     index_vector_.resize(index_vector_size);
   }
 
-  // Returns the contribution of gather_indices to the input index corresponding
+  // Returns the contribution of start_indices to the input index corresponding
   // to output_index.  See gather_inner_loop_body.
   //
   // This is conceptually  a stateless transformation from output_index to the
@@ -659,7 +669,7 @@ class OutputGatherIndexToInputIndex {
   }
 
  private:
-  // Propagates the gather index dimensions from the output index into
+  // Propagates the batch dimensions from the output index into
   // index_vector_index_ by mutating index_vector_index_ in place.  Does not
   // update the dim_numbers.index_vector_dim() dimension -- that's the dimension
   // we iterate over in FetchIndexVector.
@@ -667,7 +677,7 @@ class OutputGatherIndexToInputIndex {
       ArraySlice<int64> output_index) {
     int64 index_vector_index_i = 0;
     for (int64 i = 0, e = output_index.size(); i < e; i++) {
-      if (!output_dim_is_gather_dims_[i]) {
+      if (!output_dim_is_batch_dims_[i]) {
         continue;
       }
 
@@ -679,14 +689,14 @@ class OutputGatherIndexToInputIndex {
     }
   }
 
-  // Populates index_vector_ by iterating over gather_indices_ according to
+  // Populates index_vector_ by iterating over start_indices_ according to
   // index_vector_index_.
   Status FetchIndexVector() {
     int64 index_vector_dim = dim_numbers_.index_vector_dim();
     for (int64 i = 0, e = index_vector_.size(); i < e; i++) {
       index_vector_index_[index_vector_dim] = i;
-      TF_ASSIGN_OR_RETURN(index_vector_[i], gather_indices_.GetIntegralAsS64(
-                                                index_vector_index_));
+      TF_ASSIGN_OR_RETURN(index_vector_[i],
+                          start_indices_.GetIntegralAsS64(index_vector_index_));
     }
     return Status::OK();
   }
@@ -708,15 +718,15 @@ class OutputGatherIndexToInputIndex {
   // PropagateIndexVectorToInputIndex.
   std::vector<int64> input_dim_value_to_index_vector_;
 
-  // output_dim_is_gather_dims_[i] is true iff the output index i is a gather
+  // output_dim_is_batch_dims_[i] is true iff the output index i is a gather
   // dimension.
-  std::vector<bool> output_dim_is_gather_dims_;
+  std::vector<bool> output_dim_is_batch_dims_;
 
-  // The buffer into which we construct an index into gather_indices_ to fetch
+  // The buffer into which we construct an index into start_indices_ to fetch
   // the index vector.
   std::vector<int64> index_vector_index_;
 
-  // The index vector fetched from gather_indices_.
+  // The index vector fetched from start_indices_.
   std::vector<int64> index_vector_;
 
   // The result computed by this functor.  operator() returns an ArraySlice into
@@ -724,24 +734,23 @@ class OutputGatherIndexToInputIndex {
   std::vector<int64> input_index_;
 
   const GatherDimensionNumbers& dim_numbers_;
-  const Literal& gather_indices_;
+  const Literal& start_indices_;
 };
 
-// This functor computes the contribution of the window indices in an output
+// This functor computes the contribution of the offset indices in an output
 // index to an input index.  That is, given an output index I it picks out the
-// output window indices in I and expands it into a window index into the input
-// shape.
-class OutputWindowIndexToInputIndex {
+// output offset indices in I and expands it into an index into the input shape.
+class OutputOffsetIndexToInputIndex {
  public:
   // The constructor does some setup work that is amortized across all
   // iterations.
-  explicit OutputWindowIndexToInputIndex(
+  explicit OutputOffsetIndexToInputIndex(
       const GatherDimensionNumbers& dim_numbers, const Shape& input_shape,
       const Shape& output_shape) {
     std::vector<int64> window_index_to_output_index;
     int64 output_index_count = 0;
     for (int64 i = 0; i < output_shape.dimensions_size(); i++) {
-      if (c_binary_search(dim_numbers.output_window_dims(), i)) {
+      if (absl::c_binary_search(dim_numbers.offset_dims(), i)) {
         window_index_to_output_index.push_back(output_index_count++);
       } else {
         output_index_count++;
@@ -750,7 +759,7 @@ class OutputWindowIndexToInputIndex {
 
     int64 window_dim_count = 0;
     for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
-      if (c_binary_search(dim_numbers.elided_window_dims(), i)) {
+      if (absl::c_binary_search(dim_numbers.collapsed_slice_dims(), i)) {
         input_dim_value_to_output_index_.push_back(-1);
       } else {
         input_dim_value_to_output_index_.push_back(
@@ -808,20 +817,20 @@ class OutputWindowIndexToInputIndex {
 
 // Rehapes the gather indices input to have a trailing degenerate `1` dimension
 // if necessary.  Hands over the ownership of the newly created literal (if
-// there is one) to `reshaped_gather_indices`.
+// there is one) to `reshaped_start_indices`.
 static StatusOr<std::reference_wrapper<const Literal>> ReshapedGatherIndices(
-    int64 index_vector_dim, const Literal& gather_indices,
-    std::unique_ptr<Literal>* reshaped_gather_indices) {
-  if (gather_indices.shape().dimensions_size() != index_vector_dim) {
-    return std::cref(gather_indices);
+    int64 index_vector_dim, const Literal& start_indices,
+    std::unique_ptr<Literal>* reshaped_start_indices) {
+  if (start_indices.shape().dimensions_size() != index_vector_dim) {
+    return std::cref(start_indices);
   }
 
-  std::vector<int64> new_shape(gather_indices.shape().dimensions().begin(),
-                               gather_indices.shape().dimensions().end());
+  std::vector<int64> new_shape(start_indices.shape().dimensions().begin(),
+                               start_indices.shape().dimensions().end());
   new_shape.push_back(1);
-  TF_ASSIGN_OR_RETURN(*reshaped_gather_indices,
-                      gather_indices.Reshape(new_shape));
-  return std::cref(**reshaped_gather_indices);
+  TF_ASSIGN_OR_RETURN(*reshaped_start_indices,
+                      start_indices.Reshape(new_shape));
+  return std::cref(**reshaped_start_indices);
 }
 
 Status HloEvaluator::HandleGather(HloInstruction* gather) {
@@ -830,34 +839,33 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) {
   const GatherDimensionNumbers& dim_numbers =
       gather->gather_dimension_numbers();
   const Literal& operand = GetEvaluatedLiteralFor(gather->operand(0));
-  std::unique_ptr<Literal> reshaped_gather_indices;
+  std::unique_ptr<Literal> reshaped_start_indices;
   TF_ASSIGN_OR_RETURN(
-      const Literal& gather_indices,
+      const Literal& start_indices,
       ReshapedGatherIndices(dim_numbers.index_vector_dim(),
                             GetEvaluatedLiteralFor(gather->operand(1)),
-                            &reshaped_gather_indices));
+                            &reshaped_start_indices));
 
   // We iterate over the gather dimensions in the output shape in an outer loop
   // nest, and iterate over the window dimensions in the output shape in an
   // inner loop nest.
 
-  ShapeUtil::IndexIterationSpace gather_indices_iteration_space =
-      IterationSpaceForOutputGatherIndices(shape, dim_numbers);
-  ShapeUtil::IndexIterationSpace window_indices_iteration_space =
-      IterationSpaceForOutputWindowIndices(
-          shape.dimensions_size(), gather->gather_window_bounds(), dim_numbers);
+  ShapeUtil::IndexIterationSpace start_indices_iteration_space =
+      IterationSpaceForOutputBatchIndices(shape, dim_numbers);
+  ShapeUtil::IndexIterationSpace offset_indices_iteration_space =
+      IterationSpaceForOutputOffsetIndices(
+          shape.dimensions_size(), gather->gather_slice_sizes(), dim_numbers);
 
   // Scratch buffers that hold an index in the output shape and the
   // corresponding index in the input shape.
   std::vector<int64> input_index(operand.shape().dimensions_size());
   std::vector<int64> output_index(gather->shape().dimensions_size());
-  std::vector<int64> input_gather_index_clamped(
-      operand.shape().dimensions_size());
+  std::vector<int64> input_index_clamped(operand.shape().dimensions_size());
 
-  OutputGatherIndexToInputIndex output_gather_index_to_input_index(
+  OutputBatchIndexToInputIndex output_batch_index_to_input_index(
       &gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(),
-      /*output_shape=*/shape, &gather_indices);
-  OutputWindowIndexToInputIndex output_window_index_to_input_index(
+      /*output_shape=*/shape, &start_indices);
+  OutputOffsetIndexToInputIndex output_offset_index_to_input_index(
       gather->gather_dimension_numbers(), /*input_shape=*/operand.shape(),
       /*output_shape=*/shape);
 
@@ -869,29 +877,29 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) {
           ArraySlice<int64> output_gather_index) -> StatusOr<bool> {
     TF_ASSIGN_OR_RETURN(
         ArraySlice<int64> input_window_index,
-        output_window_index_to_input_index(output_window_index));
+        output_offset_index_to_input_index(output_window_index));
     for (int i = 0, e = output_index.size(); i < e; i++) {
       output_index[i] = output_gather_index[i] + output_window_index[i];
       DCHECK_LT(output_index[i], shape.dimensions(i));
     }
     for (int i = 0, e = input_gather_index.size(); i < e; i++) {
       int64 output_dim =
-          output_window_index_to_input_index.input_dim_value_to_output_index(i);
+          output_offset_index_to_input_index.input_dim_value_to_output_index(i);
       // If 'output_dim' is -1, it means 'i' is an elided window dim. This means
       // we set the iteration index to 0, so for the purpose of the following
       // calculations we can consider the output dimension size to be 1.
       int64 output_dim_size =
           output_dim == -1 ? 1 : shape.dimensions(output_dim);
       // Clamp the gather index so that the gather region fits in the operand.
-      // input_gather_index_clamped[i] = clamp(input_gather_index[i], 0,
+      // input_index_clamped[i] = clamp(input_gather_index[i], 0,
       //                                       operand_shape.dimensions(i) -
       //                                       output_dim_size);
-      input_gather_index_clamped[i] =
+      input_index_clamped[i] =
           std::min(operand_shape.dimensions(i) - output_dim_size,
                    std::max(0LL, input_gather_index[i]));
     }
     for (int i = 0, e = input_index.size(); i < e; i++) {
-      input_index[i] = input_gather_index_clamped[i] + input_window_index[i];
+      input_index[i] = input_index_clamped[i] + input_window_index[i];
       DCHECK_GE(input_index[i], 0);
       DCHECK_LT(input_index[i], operand_shape.dimensions(i));
     }
@@ -902,18 +910,17 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) {
 
   auto gather_outer_loop_body =
       [&](ArraySlice<int64> output_gather_index) -> StatusOr<bool> {
-    TF_ASSIGN_OR_RETURN(
-        ArraySlice<int64> input_gather_index,
-        output_gather_index_to_input_index(output_gather_index));
+    TF_ASSIGN_OR_RETURN(ArraySlice<int64> input_gather_index,
+                        output_batch_index_to_input_index(output_gather_index));
     TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-        shape, window_indices_iteration_space,
+        shape, offset_indices_iteration_space,
         std::bind(gather_inner_loop_body, std::placeholders::_1,
                   input_gather_index, output_gather_index)));
     return true;
   };
 
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-      shape, gather_indices_iteration_space, gather_outer_loop_body));
+      shape, start_indices_iteration_space, gather_outer_loop_body));
   evaluated_[gather] = std::move(result);
   return Status::OK();
 }
@@ -960,7 +967,7 @@ Status HloEvaluator::HandleGetTupleElement(HloInstruction* get_tuple_element) {
 
   const Literal& operand_tuple_literal = GetEvaluatedLiteralFor(operand);
 
-  evaluated_[get_tuple_element] = MakeUnique<Literal>(
+  evaluated_[get_tuple_element] = absl::make_unique<Literal>(
       ShapeUtil::GetTupleElementShape(operand->shape(), index));
   return evaluated_[get_tuple_element]->CopyFrom(operand_tuple_literal,
                                                  /*dest_shape_index=*/{},
@@ -1162,10 +1169,11 @@ StatusOr<std::unique_ptr<Literal>> EvaluateSortInternal(
       result_keys.push_back(key_value.first);
       result_values.push_back(key_value.second);
     }
-    auto result_keys_literal = MakeUnique<Literal>(keys_literal.shape());
+    auto result_keys_literal = absl::make_unique<Literal>(keys_literal.shape());
     result_keys_literal->PopulateR1(
         tensorflow::gtl::ArraySlice<KeyType>(result_keys));
-    auto result_values_literal = MakeUnique<Literal>(values_literal.shape());
+    auto result_values_literal =
+        absl::make_unique<Literal>(values_literal.shape());
     result_values_literal->PopulateR1(
         tensorflow::gtl::ArraySlice<ValueType>(result_values));
     return std::make_pair(std::move(result_keys_literal),
@@ -1180,8 +1188,9 @@ StatusOr<std::unique_ptr<Literal>> EvaluateSortInternal(
   } else {
     // For R2 sort, the desired semantics are to sort each matrix row
     // independently.
-    auto keys_result_literal = MakeUnique<Literal>(keys_literal.shape());
-    auto values_result_literal = MakeUnique<Literal>(values_literal.shape());
+    auto keys_result_literal = absl::make_unique<Literal>(keys_literal.shape());
+    auto values_result_literal =
+        absl::make_unique<Literal>(values_literal.shape());
     int64 r1_length = keys_literal.shape().dimensions(1);
     for (int64 row = 0; row < keys_literal.shape().dimensions(0); ++row) {
       TF_ASSIGN_OR_RETURN(auto keys_r1_slice,
@@ -1274,7 +1283,7 @@ Status HloEvaluator::HandleSort(HloInstruction* sort) {
 
 Status HloEvaluator::Preprocess(HloInstruction* hlo) {
   VLOG(2) << "About to visit HLO: " << hlo->ToString();
-  return Status::OK();
+  return ShapeUtil::ValidateShape(hlo->shape());
 }
 
 Status HloEvaluator::Postprocess(HloInstruction* hlo) {
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index a4c37ef328..7588916de5 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -226,7 +226,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
           ShapeUtil::HumanString(operand->shape()).c_str());
     }
 
-    auto result = MakeUnique<Literal>(shape);
+    auto result = absl::make_unique<Literal>(shape);
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
           return unary_op(operand_literal.Get<NativeT>(multi_index));
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 3ac6d68df3..c3af15c6a8 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -51,8 +52,11 @@ static std::array<bool, 2> use_bf16_params{true, false};
 class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
                          public HloVerifiedTestBase {
  protected:
-  HloEvaluatorTest() : use_bfloat16_(GetParam()) {
-    evaluator_ = MakeUnique<HloEvaluator>();
+  HloEvaluatorTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false),
+        use_bfloat16_(GetParam()) {
+    evaluator_ = absl::make_unique<HloEvaluator>();
   }
 
   std::unique_ptr<Literal> Evaluate(
@@ -523,7 +527,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
 
   std::unique_ptr<Literal> result = Evaluate();
 
-  auto expected_array = MakeUnique<Array4D<float>>(8, 5, 1, 1);
+  auto expected_array = absl::make_unique<Array4D<float>>(8, 5, 1, 1);
   expected_array->Fill(kPadValue);
   (*expected_array)(1, 0, 0, 0) = 1.0f;
   (*expected_array)(1, 2, 0, 0) = 2.0f;
@@ -547,7 +551,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) {
   //  { 9, 10, 11 },
   //  { 13, 14, 15 },
   // }
-  auto input_array = MakeUnique<Array2D<float>>(4, 3);
+  auto input_array = absl::make_unique<Array2D<float>>(4, 3);
   input_array->FillUnique(1.0f);
   auto input = LiteralUtil::CreateR2FromArray2D<float>(*input_array);
   HloInstruction* input_instruction =
@@ -568,7 +572,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) {
   std::unique_ptr<Literal> result = Evaluate();
 
   // f32[1,5] { 7.0, 2.718, 2.718, 2.718, 2.718 }
-  auto expected_array = MakeUnique<Array2D<float>>(1, 5);
+  auto expected_array = absl::make_unique<Array2D<float>>(1, 5);
   (*expected_array)(0, 0) = 7.0f;
   (*expected_array)(0, 1) = 2.718f;
   (*expected_array)(0, 2) = 2.718f;
@@ -588,7 +592,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   //  { 9, 10, 11 },
   //  { 13, 14, 15 },
   // }
-  auto input_array = MakeUnique<Array2D<float>>(4, 3);
+  auto input_array = absl::make_unique<Array2D<float>>(4, 3);
   input_array->FillUnique(1.0f);
   auto input = LiteralUtil::CreateR2FromArray2D<float>(*input_array);
   HloInstruction* input_instruction =
@@ -612,7 +616,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
 
   std::unique_ptr<Literal> result = Evaluate();
 
-  auto expected_array = MakeUnique<Array2D<float>>(0, 9);
+  auto expected_array = absl::make_unique<Array2D<float>>(0, 9);
   auto expected = LiteralUtil::CreateR2FromArray2D<float>(*expected_array);
 
   EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
@@ -628,7 +632,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   //  { 3 },
   //  { 4 },
   // }
-  auto lhs_array = MakeUnique<Array2D<float>>(4, 1);
+  auto lhs_array = absl::make_unique<Array2D<float>>(4, 1);
   lhs_array->FillUnique(1.0f);
   auto lhs_literal = LiteralUtil::CreateR2FromArray2D<float>(*lhs_array);
   HloInstruction* lhs_instruction =
@@ -679,7 +683,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
   //  { 3, 4 },
   //  { 5, 6 },
   // }
-  auto rhs_array = MakeUnique<Array2D<float>>(3, 2);
+  auto rhs_array = absl::make_unique<Array2D<float>>(3, 2);
   rhs_array->FillUnique(1.0f);
   auto rhs_literal = LiteralUtil::CreateR2FromArray2D<float>(*rhs_array);
   HloInstruction* rhs_instruction =
@@ -710,7 +714,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   //  { 9, 10, 11 },
   //  { 13, 14, 15 },
   // }
-  auto lhs_array = MakeUnique<Array2D<float>>(4, 3);
+  auto lhs_array = absl::make_unique<Array2D<float>>(4, 3);
   lhs_array->FillUnique(1.0f);
   auto lhs_literal = LiteralUtil::CreateR2FromArray2D<float>(*lhs_array);
   HloInstruction* lhs_instruction =
@@ -722,7 +726,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   //  { 3, 4 },
   //  { 5, 6 },
   // }
-  auto rhs_array = MakeUnique<Array2D<float>>(3, 2);
+  auto rhs_array = absl::make_unique<Array2D<float>>(3, 2);
   rhs_array->FillUnique(1.0f);
   auto rhs_literal = LiteralUtil::CreateR2FromArray2D<float>(*rhs_array);
   HloInstruction* rhs_instruction =
@@ -1215,7 +1219,12 @@ TEST_P(HloEvaluatorTest,
   EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result));
 }
 
-class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {};
+class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {
+ public:
+  HloEvaluatorPreciseReduceTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+};
 
 // Tests that Reduce doesn't lose precision when adding many numbers (because
 // it accumulates its result in a double).
@@ -1297,7 +1306,7 @@ TEST_P(HloEvaluatorTest, ReduceAdd) {
   //  { 1, 2, 3 },
   //  { 5, 6, 7 },
   // }
-  auto arg_array = MakeUnique<Array2D<float>>(2, 3);
+  auto arg_array = absl::make_unique<Array2D<float>>(2, 3);
   arg_array->FillUnique(1.0f);
   auto arg_literal = LiteralUtil::CreateR2FromArray2D<float>(*arg_array);
 
@@ -1339,7 +1348,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   //  { 1, 2, 3 },
   //  { 5, 6, 7 },
   // }
-  auto arg_array = MakeUnique<Array2D<float>>(2, 3);
+  auto arg_array = absl::make_unique<Array2D<float>>(2, 3);
   arg_array->FillUnique(1.0f);
   auto arg_literal = LiteralUtil::CreateR2FromArray2D<float>(*arg_array);
 
@@ -1390,7 +1399,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   //  { 1, 2, 3 },
   //  { 5, 6, 7 },
   // }
-  auto arg_array = MakeUnique<Array2D<float>>(2, 3);
+  auto arg_array = absl::make_unique<Array2D<float>>(2, 3);
   arg_array->FillUnique(1.0f);
   auto arg_literal = LiteralUtil::CreateR2FromArray2D<float>(*arg_array);
 
@@ -1511,7 +1520,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) {
   //  { 9, 10, 11, 12, 13 },
   //  { 17, 18, 19, 20, 21 },
   // }
-  auto operand_array = MakeUnique<Array2D<float>>(3, 5);
+  auto operand_array = absl::make_unique<Array2D<float>>(3, 5);
   operand_array->FillUnique(1.0f);
   auto operand_literal =
       LiteralUtil::CreateR2FromArray2D<float>(*operand_array);
@@ -1544,7 +1553,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
   //  { 1, 2, 3, 4 },
   //  { 5, 6, 7, 8 },
   // }
-  auto operand_array = MakeUnique<Array2D<float>>(2, 4);
+  auto operand_array = absl::make_unique<Array2D<float>>(2, 4);
   operand_array->FillUnique(1.0f);
   auto operand_literal =
       LiteralUtil::CreateR2FromArray2D<float>(*operand_array);
@@ -1580,7 +1589,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   //  { 1, 2, 3, 4 },
   //  { 5, 6, 7, 8 },
   // }
-  auto operand_array = MakeUnique<Array2D<float>>(2, 4);
+  auto operand_array = absl::make_unique<Array2D<float>>(2, 4);
   operand_array->FillUnique(1.0f);
   auto operand_literal =
       LiteralUtil::CreateR2FromArray2D<float>(*operand_array);
@@ -1614,7 +1623,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   //  { 1, 2, 3 },
   //  { 5, 6, 7 },
   // }
-  auto operand_array = MakeUnique<Array2D<double>>(2, 3);
+  auto operand_array = absl::make_unique<Array2D<double>>(2, 3);
   operand_array->FillUnique(1.0);
   auto operand_literal =
       LiteralUtil::CreateR2FromArray2D<double>(*operand_array);
@@ -1651,7 +1660,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) {
   //  { 1, 2, 3 },
   //  { 5, 6, 7 },
   // }
-  auto operand_array = MakeUnique<Array2D<double>>(2, 3);
+  auto operand_array = absl::make_unique<Array2D<double>>(2, 3);
   operand_array->FillUnique(1.0);
   auto operand_literal2 =
       LiteralUtil::CreateR2FromArray2D<double>(*operand_array);
@@ -1687,7 +1696,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
   //  { 1, 2, 3 },
   //  { 5, 6, 7 },
   // }
-  auto operand_array = MakeUnique<Array2D<double>>(2, 3);
+  auto operand_array = absl::make_unique<Array2D<double>>(2, 3);
   operand_array->FillUnique(1.0);
 
   HloInstruction* operand2 = b.AddInstruction(HloInstruction::CreateConstant(
@@ -1826,21 +1835,20 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[2,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 3}
+      slice_sizes={1, 3}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *LiteralUtil::CreateR2<int32>({{1, 2, 3}, {7, 8, 9}}),
-      *Evaluate({operand.get(), gather_indices.get()})));
+      *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
@@ -1851,21 +1859,20 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *LiteralUtil::CreateR2<int32>({{1, 3}, {4, 6}, {7, 9}}),
-      *Evaluate({operand.get(), gather_indices.get()})));
+      *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
@@ -1876,22 +1883,22 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,3,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *LiteralUtil::CreateR3<int32>(
           {{{1, 3}, {4, 6}, {7, 9}}, {{3, 2}, {6, 5}, {9, 8}}}),
-      *Evaluate({operand.get(), gather_indices.get()})));
+      *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
@@ -1902,11 +1909,11 @@ ENTRY main {
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
 }
 )";
   ParseAndVerifyModule(hlo_text);
@@ -1914,11 +1921,11 @@ ENTRY main {
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{-1, 1}, {-4, 4}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest,
@@ -1930,11 +1937,11 @@ ENTRY main {
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
 }
 )";
   ParseAndVerifyModule(hlo_text);
@@ -1942,11 +1949,11 @@ ENTRY main {
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{-2, 2}, {-1, 1}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
@@ -1957,21 +1964,20 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[1,1] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={0,1},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({1, 1});
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({1, 1});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{5}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
@@ -1982,21 +1988,21 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,1,1] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR3<int32>({{{8}}, {{5}}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
@@ -2007,20 +2013,19 @@ ENTRY main {
   operand = s32[3,0] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[2,0] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 0}
+      slice_sizes={1, 0}
 }
 )";
   ParseAndVerifyModule(hlo_text);
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{}, {}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
@@ -2031,21 +2036,21 @@ ENTRY main {
   operand = s32[3] parameter(0)
   indices = s32[2,2,1] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1}
+      slice_sizes={1}
 }
 )";
   ParseAndVerifyModule(hlo_text);
 
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR3<int32>({{{0}, {1}}, {{2}, {1}}});
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*LiteralUtil::CreateR2<int32>({{0, 1}, {2, 1}}),
-                             *Evaluate({operand.get(), gather_indices.get()})));
+                             *Evaluate({operand.get(), start_indices.get()})));
 }
 
 TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
@@ -2517,6 +2522,31 @@ TEST_P(HloEvaluatorTest, DoesCompareBF16) {
                std::move(rhs));
 }
 
+TEST_P(HloEvaluatorTest, Bf16Reduction) {
+  const string hlo_text = R"(
+HloModule Bf16Reduction
+
+add_bf16 (lhs: bf16[], rhs: bf16[]) -> bf16[] {
+  lhs = bf16[] parameter(0)
+  rhs = bf16[] parameter(1)
+  ROOT add = bf16[] add(bf16[] lhs, bf16[] rhs)
+}
+
+ENTRY main {
+  arg0 = bf16[4]{0} parameter(0)
+  init = bf16[] constant(0)
+  ROOT %reduce = bf16[] reduce(arg0, init), dimensions={0}, to_apply=add_bf16
+}
+)";
+  ParseAndVerifyModule(hlo_text);
+
+  std::unique_ptr<Literal> arg = LiteralUtil::CreateR1<bfloat16>(
+      {bfloat16(1.0f), bfloat16(3.0f), bfloat16(-2.0f), bfloat16(42.0f)});
+  std::unique_ptr<Literal> expected =
+      LiteralUtil::CreateR0<bfloat16>(bfloat16(44.0f));
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *Evaluate({arg.get()})));
+}
+
 INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
                         ::testing::ValuesIn(use_bf16_params));
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index 084b49b478..2da2cc2d71 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -16,11 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_
 
+#include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/memory/memory.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/core/lib/core/casts.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
 
@@ -86,6 +89,29 @@ bool SafeLess(const NativeT& a, const NativeT& b) {
 // of this class.
 template <typename ReturnT, typename ElementwiseT = ReturnT>
 class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
+ private:
+  // Get the value in the given literal static_cast as a double.
+  template <
+      typename NativeT,
+      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  double GetAsDouble(const Literal& literal,
+                     tensorflow::gtl::ArraySlice<int64> input_index) {
+    return static_cast<double>(literal.Get<NativeT>(input_index));
+  }
+
+  // Specialization for complex types. In this case it is not possible to
+  // static_cast value to a double so just CHECK fail. This method is not used
+  // at run-time, but must be available at compile-time to keep the compiler
+  // happy.
+  template <
+      typename NativeT,
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
+  double GetAsDouble(const Literal& literal,
+                     tensorflow::gtl::ArraySlice<int64> input_index) {
+    LOG(FATAL) << "Trying to get complex literal as double: "
+               << literal.ToString();
+  }
+
  public:
   explicit HloEvaluatorTypedVisitor(HloEvaluator* p) : parent_(p) {}
 
@@ -525,7 +551,11 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  Status HandleDivide(HloInstruction* divide) override {
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_floating_point<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleDivide(HloInstruction* divide) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide],
                         ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem,
                                                        ElementwiseT rhs_elem) {
@@ -535,6 +565,46 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   template <typename NativeT,
+            typename std::enable_if<std::is_signed<NativeT>::value &&
+                                    std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleDivide(HloInstruction* divide) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[divide],
+        ElementWiseBinaryOp(
+            divide,
+            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) -> ElementwiseT {
+              if (rhs_elem == 0) {
+                return static_cast<ElementwiseT>(-1);
+              }
+              if (rhs_elem == -1 &&
+                  lhs_elem == std::numeric_limits<ElementwiseT>::min()) {
+                return lhs_elem;
+              }
+              return lhs_elem / rhs_elem;
+            }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
+                nullptr>
+  Status HandleDivide(HloInstruction* divide) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide],
+                        ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem,
+                                                       ElementwiseT rhs_elem) {
+                          return rhs_elem == 0
+                                     ? std::numeric_limits<ElementwiseT>::max()
+                                     : (lhs_elem / rhs_elem);
+                        }));
+    return Status::OK();
+  }
+
+  Status HandleDivide(HloInstruction* divide) {
+    return HandleDivide<ElementwiseT>(divide);
+  }
+
+  template <typename NativeT,
             typename std::enable_if<std::is_integral<NativeT>::value>::type* =
                 nullptr>
   Status HandleMaximum(HloInstruction* maximum) {
@@ -620,9 +690,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
   Status HandleRemainder(HloInstruction* remainder) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder],
                         ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el,
@@ -632,6 +701,40 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  template <typename NativeT,
+            typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
+                nullptr>
+  Status HandleRemainder(HloInstruction* remainder) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder],
+                        ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el,
+                                                          ElementwiseT rhs_el) {
+                          return rhs_el == 0 ? lhs_el : (lhs_el % rhs_el);
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_signed<NativeT>::value &&
+                                    std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleRemainder(HloInstruction* remainder) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[remainder],
+        ElementWiseBinaryOp(
+            remainder,
+            [](ElementwiseT lhs_el, ElementwiseT rhs_el) -> ElementwiseT {
+              if (rhs_el == 0) {
+                return lhs_el;
+              }
+              if (rhs_el == -1 &&
+                  lhs_el == std::numeric_limits<ElementwiseT>::min()) {
+                return 0;
+              }
+              return lhs_el % rhs_el;
+            }));
+    return Status::OK();
+  }
+
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
@@ -873,7 +976,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         << ShapeUtil::HumanString(inferred_return_shape);
 
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    auto result = MakeUnique<Literal>(result_shape);
+    auto result = absl::make_unique<Literal>(result_shape);
 
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> out_index) {
@@ -1030,7 +1133,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       return static_cast<ReturnT>(result_val);
     };
 
-    auto result = MakeUnique<Literal>(result_shape);
+    auto result = absl::make_unique<Literal>(result_shape);
     TF_RETURN_IF_ERROR(result->PopulateParallel<ReturnT>(func));
 
     parent_->evaluated_[conv] = std::move(result);
@@ -1078,7 +1181,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     // result_index_locations[i] contains one or two pointers to the locations
     // in lhs_index or rhs_index where the i'th result index should go.
-    tensorflow::gtl::InlinedVector<std::pair<int64*, int64*>, kInlineRank>
+    absl::InlinedVector<std::pair<int64*, int64*>, kInlineRank>
         result_index_locations;
     result_index_locations.reserve(lhs_rank + rhs_rank - 2);
 
@@ -1104,7 +1207,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       }
     }
 
-    auto result = MakeUnique<Literal>(dot->shape());
+    auto result = absl::make_unique<Literal>(dot->shape());
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> result_index) {
           ElementwiseT result_val = static_cast<ElementwiseT>(0);
@@ -1153,7 +1256,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     // Create new HLO of padded shape with padding value.
     ReturnT scalar =
         parent_->GetEvaluatedLiteralFor(pad->operand(1)).Get<ReturnT>({});
-    auto result = MakeUnique<Literal>(pad->shape());
+    auto result = absl::make_unique<Literal>(pad->shape());
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&scalar](tensorflow::gtl::ArraySlice<int64> multi_index) {
           return scalar;
@@ -1318,7 +1421,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     auto operands = map->operands();
     HloComputation* computation = map->to_apply();
 
-    auto result = MakeUnique<Literal>(map->shape());
+    auto result = absl::make_unique<Literal>(map->shape());
 
     HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
@@ -1432,7 +1535,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                 [](const ReturnT& a, const ReturnT& b) {
                   return SafeLess<ReturnT>(a, b);
                 });
-      auto result_literal = MakeUnique<Literal>(keys_literal.shape());
+      auto result_literal = absl::make_unique<Literal>(keys_literal.shape());
       result_literal->PopulateR1(
           tensorflow::gtl::ArraySlice<ReturnT>(result_data));
       VLOG(3) << "HandleSort result_literal: " << result_literal->ToString();
@@ -1444,7 +1547,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     } else {
       // For R2 sort, the desired semantics are to sort each matrix row
       // independently.
-      auto result_literal = MakeUnique<Literal>(keys_literal.shape());
+      auto result_literal = absl::make_unique<Literal>(keys_literal.shape());
       int64 r1_length = keys->shape().dimensions(1);
       for (int64 row = 0; row < keys->shape().dimensions(0); ++row) {
         TF_ASSIGN_OR_RETURN(auto r1_slice,
@@ -1518,11 +1621,15 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     }
 
     HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
-    auto result = MakeUnique<Literal>(reduce->shape());
+    auto result = absl::make_unique<Literal>(reduce->shape());
+    Status eval_status;
     // For each resulting dimension, calculate and assign computed value.
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
           ReturnT result_val = init_scalar;
+          if (!eval_status.ok()) {
+            return result_val;
+          }
 
           std::vector<int64> base(arg_dimensions.size());
           for (int64 i = 0; i < multi_index.size(); ++i) {
@@ -1536,14 +1643,15 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
               IsScalarAdd(function)) {
             double computed_result = 0;
             auto func = [&](tensorflow::gtl::ArraySlice<int64> input_index) {
-              computed_result += arg_literal.Get<float>(input_index);
+              computed_result += GetAsDouble<ReturnT>(arg_literal, input_index);
               return true;
             };
             ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
                                     arg_dim_steps, func);
             return static_cast<ReturnT>(computed_result);
           }
-          auto func = [&](tensorflow::gtl::ArraySlice<int64> input_index) {
+          auto func = [&](tensorflow::gtl::ArraySlice<int64> input_index)
+              -> StatusOr<bool> {
             auto curr_val = arg_literal.Get<ReturnT>(input_index);
 
             // Evaluate computation with specified literal operands.
@@ -1551,12 +1659,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             auto result_val_literal =
                 LiteralUtil::CreateR0<ReturnT>(result_val);
 
-            std::unique_ptr<Literal> computed_result =
-                embedded_evaluator
-                    .Evaluate<const Literal*>(
-                        *function,
-                        {result_val_literal.get(), curr_val_literal.get()})
-                    .ConsumeValueOrDie();
+            TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> computed_result,
+                                embedded_evaluator.Evaluate<const Literal*>(
+                                    *function, {result_val_literal.get(),
+                                                curr_val_literal.get()}));
             // Clear visit states so that we can use the evaluator again on
             // the same computation.
             embedded_evaluator.ResetVisitStates();
@@ -1566,13 +1672,13 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           };
           // Computes one element of the result, reducing all dimensions that
           // contribute to that element.
-          ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
-                                  arg_dim_steps, func);
+          eval_status = ShapeUtil::ForEachIndexWithStatus(
+              arg_literal.shape(), base, arg_dim_counts, arg_dim_steps, func);
           return result_val;
         }));
 
     parent_->evaluated_[reduce] = std::move(result);
-    return Status::OK();
+    return eval_status;
   }
 
   bool IsScalarAdd(HloComputation* computation) {
@@ -1599,7 +1705,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
     auto init_scalar = init_literal.Get<ReturnT>({});
 
-    auto result = MakeUnique<Literal>(select_and_scatter->shape());
+    auto result = absl::make_unique<Literal>(select_and_scatter->shape());
 
     // Initialize result array with the init value.
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
@@ -1643,8 +1749,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       // 2. Using the selected index, scatter value from `source` to result. We
       // do this by iterating through the window, and compare each index with
       // the selected index.
-      tensorflow::gtl::optional<ReturnT> selected_val;
-      tensorflow::gtl::optional<std::vector<int64>> selected_index;
+      absl::optional<ReturnT> selected_val;
+      absl::optional<std::vector<int64>> selected_index;
 
       IterateThroughWindow(
           window_shape, window, operand_literal.shape(), source_index,
@@ -1735,7 +1841,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape()));
 
     HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
-    auto result = MakeUnique<Literal>(reduce_window->shape());
+    auto result = absl::make_unique<Literal>(reduce_window->shape());
     // For each resulting dimension, calculate and assign computed value.
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> output_index) {
@@ -1802,7 +1908,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     std::vector<int64> index_count(updates_rank, 1);
     for (int64 i = 0; i < updates_rank; i++) {
       bool is_update_scatter_dim =
-          !c_binary_search(dim_numbers.update_window_dims(), i);
+          !absl::c_binary_search(dim_numbers.update_window_dims(), i);
       if (is_update_scatter_dim) {
         index_count[i] = updates_shape.dimensions(i);
       }
@@ -1821,7 +1927,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     std::vector<int64> index_count(updates_rank, 1);
     for (int64 i = 0; i < updates_rank; i++) {
       bool is_update_window_dim =
-          c_binary_search(dim_numbers.update_window_dims(), i);
+          absl::c_binary_search(dim_numbers.update_window_dims(), i);
       if (is_update_window_dim) {
         index_count[i] = updates_shape.dimensions(i);
       }
@@ -1848,7 +1954,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         : dim_numbers_(*dim_numbers), scatter_indices_(*scatter_indices) {
       for (int64 i = 0; i < updates_shape.dimensions_size(); i++) {
         update_dim_is_scatter_dims_.push_back(
-            !c_binary_search(dim_numbers_.update_window_dims(), i));
+            !absl::c_binary_search(dim_numbers_.update_window_dims(), i));
       }
 
       for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
@@ -1978,7 +2084,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       std::vector<int64> window_index_to_update_index;
       int64 update_index_count = 0;
       for (int64 i = 0; i < updates_shape.dimensions_size(); i++) {
-        if (c_binary_search(dim_numbers.update_window_dims(), i)) {
+        if (absl::c_binary_search(dim_numbers.update_window_dims(), i)) {
           window_index_to_update_index.push_back(update_index_count++);
         } else {
           update_index_count++;
@@ -1987,7 +2093,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
       int64 window_dim_count = 0;
       for (int64 i = 0; i < input_shape.dimensions_size(); i++) {
-        if (c_binary_search(dim_numbers.inserted_window_dims(), i)) {
+        if (absl::c_binary_search(dim_numbers.inserted_window_dims(), i)) {
           input_dim_value_to_update_index_.push_back(-1);
         } else {
           input_dim_value_to_update_index_.push_back(
@@ -2388,7 +2494,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                 std::is_same<NativeT, int32>::value ||
                 std::is_same<NativeT, uint32>::value>::type* = nullptr>
   Status HandleIota(HloInstruction* iota) {
-    auto result = MakeUnique<Literal>(iota->shape());
+    auto result = absl::make_unique<Literal>(iota->shape());
     auto data = result->data<ReturnT>();
     std::iota(data.begin(), data.end(), 0);
     parent_->evaluated_[iota] = std::move(result);
@@ -2470,7 +2576,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     }
 
     std::vector<int64> operand_indices(start.size());
-    auto result = MakeUnique<Literal>(result_shape);
+    auto result = absl::make_unique<Literal>(result_shape);
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
           for (int64 i = 0; i < operand_indices.size(); ++i) {
@@ -2556,7 +2662,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
-    auto result = MakeUnique<Literal>(shape);
+    auto result = absl::make_unique<Literal>(shape);
 
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
@@ -2594,7 +2700,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
     const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs);
 
-    auto result = MakeUnique<Literal>(shape);
+    auto result = absl::make_unique<Literal>(shape);
 
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index c3ccbf0f0c..de3d7a1677 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
@@ -49,7 +51,7 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
   size_t profile_counters_size = hlo_profile_index_map.total_count();
 
   std::unique_ptr<HloProfilePrinterData> profile_printer_data =
-      MakeUnique<HloProfilePrinterData>();
+      absl::make_unique<HloProfilePrinterData>();
   profile_printer_data->set_profile_counters_size(profile_counters_size);
   profile_printer_data->mutable_computation_infos()->Reserve(
       hlo_profile_index_map.computation_count());
@@ -67,11 +69,11 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
 
   // The profile indices were computed deterministically in
   // HloProfileIndexMap::HloProfileIndexMap.
-  c_sort(computation_and_profile_idx_list,
-         [](const std::pair<const HloComputation*, int64>& left,
-            const std::pair<const HloComputation*, int64>& right) {
-           return left.second < right.second;
-         });
+  absl::c_sort(computation_and_profile_idx_list,
+               [](const std::pair<const HloComputation*, int64>& left,
+                  const std::pair<const HloComputation*, int64>& right) {
+                 return left.second < right.second;
+               });
 
   for (const auto& pair : computation_and_profile_idx_list) {
     CHECK_LT(pair.second, profile_counters_size);
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index eba80c0f19..460ae2b5ec 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -14,15 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace {
 
-using tensorflow::strings::StrCat;
+using absl::StrCat;
 using ::testing::AllOf;
 using ::testing::ContainsRegex;
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 1efa6eb5bd..59c628e945 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -26,6 +26,11 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_replace.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
@@ -37,30 +42,26 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/regexp.h"
 
-using ::tensorflow::Env;
-using ::tensorflow::WriteStringToFile;
-using ::tensorflow::gtl::nullopt;
-using ::tensorflow::gtl::optional;
-using ::tensorflow::io::JoinPath;
-using ::tensorflow::str_util::Join;
-using ::tensorflow::str_util::StringReplace;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
-
 namespace xla {
 namespace hlo_graph_dumper {
 namespace {
 
+using absl::nullopt;
+using absl::optional;
+using absl::StrAppend;
+using absl::StrCat;
+using absl::StrJoin;
+using tensorflow::Env;
+using tensorflow::WriteStringToFile;
+using tensorflow::io::JoinPath;
+
 // Helpers for Printf and Appendf.
 template <typename T>
 struct PrintfConvert {
@@ -217,9 +218,8 @@ string NodeColorAttributes(ColorScheme color) {
 
 // Replaces <> with &lt;&gt;, so that this string is safe(er) for use in a
 // graphviz HTML-like string.
-string HtmlLikeStringSanitize(tensorflow::StringPiece s) {
-  return StringReplace(StringReplace(s, "<", "&lt;", /*replace_all=*/true), ">",
-                       "&gt;", /*replace_all=*/true);
+string HtmlLikeStringSanitize(absl::string_view s) {
+  return absl::StrReplaceAll(s, {{"<", "&lt;"}, {">", "&gt;"}});
 }
 
 // Tries to generates a human-readable one-word description of the given
@@ -322,7 +322,7 @@ optional<string> MatchTrivialComputation(const HloComputation* computation) {
 // Encapsulates logic for dumping an HLO module to DOT (i.e. graphviz syntax).
 class HloDotDumper {
  public:
-  HloDotDumper(const HloComputation* computation, tensorflow::StringPiece label,
+  HloDotDumper(const HloComputation* computation, absl::string_view label,
                const DebugOptions& debug_options, bool show_backend_config,
                const HloExecutionProfile* profile, NodeFilter filter)
       : computation_(computation),
@@ -457,7 +457,7 @@ labelloc = t;
 tooltip = " ";
 // DOT graphs accept a stylesheet as a URI.  So naturally, an inline
 // stylesheet is a data URI!
-stylesheet="
+stylesheet=<
   data:text/css,
   @import url(https://fonts.googleapis.com/css?family=Roboto:400,700);
   svg text {
@@ -466,7 +466,7 @@ stylesheet="
   }
 
   %s
-"
+>
 
 )";
 
@@ -559,10 +559,10 @@ stylesheet="
     }
   }
 
-  return Printf(fmt, graph_label, Join(edge_css_rules, "\n"));
+  return Printf(fmt, graph_label, StrJoin(edge_css_rules, "\n"));
 }
 
-string HloDotDumper::Footer() { return StrCat(Join(edges_, "\n"), "\n}"); }
+string HloDotDumper::Footer() { return StrCat(StrJoin(edges_, "\n"), "\n}"); }
 
 bool HloDotDumper::ShouldShowFusionSubcomputation(const HloInstruction* instr) {
   CHECK_EQ(instr->opcode(), HloOpcode::kFusion);
@@ -854,7 +854,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
 
     // Otherwise, print e.g. "%constant.42 (s32[100])".
     string constant_name;
-    if (tensorflow::str_util::StartsWith(constant->name(), "constant")) {
+    if (absl::StartsWith(constant->name(), "constant")) {
       constant_name = constant->name();
     } else {
       constant_name = StrCat("constant ", constant->name());
@@ -896,7 +896,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
       }
     }
   }
-  return Join(lines, "<br/>");
+  return StrJoin(lines, "<br/>");
 }
 
 ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
@@ -1059,7 +1059,6 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kCustomCall:
-    case HloOpcode::kHostCompute:
     case HloOpcode::kWhile:
       return kDarkGreen;
     case HloOpcode::kConstant:
@@ -1085,8 +1084,7 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) {
 
   // The HLO instruction name contains usually the opcode, e.g. "%add.42" is
   // an add instruction.  In this case we render just the name.
-  if (tensorflow::str_util::StartsWith(instr->name(),
-                                       HloOpcodeString(instr->opcode()))) {
+  if (absl::StartsWith(instr->name(), HloOpcodeString(instr->opcode()))) {
     return Printf("<b>%s</b>", HtmlLikeStringSanitize(instr->name()));
   }
   string extended_opcode =
@@ -1114,7 +1112,7 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) {
                            instr->metadata().source_line()));
   }
 
-  return Join(lines, "<br/>");
+  return StrJoin(lines, "<br/>");
 }
 
 string HloDotDumper::GetInstructionNodeBackendConfig(
@@ -1161,8 +1159,7 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
     constexpr int kMaxShapeLen = 64;
     if (instr_shape.length() > kMaxShapeLen) {
       instr_shape = StrCat(
-          tensorflow::StringPiece(instr_shape).substr(0, kMaxShapeLen - 3),
-          "...");
+          absl::string_view(instr_shape).substr(0, kMaxShapeLen - 3), "...");
     }
     lines.push_back(instr_shape);
   }
@@ -1179,7 +1176,7 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
                  100 * hlo_cycles_executed / total_cycles_executed));
     }
   }
-  return Join(lines, "<br/>");
+  return StrJoin(lines, "<br/>");
 }
 
 // Gets the total number of array elements in the given shape.  For tuples, this
@@ -1272,7 +1269,7 @@ string HloDotDumper::GetInstructionTrivialComputationStr(
                              HtmlLikeStringSanitize(*computation_type)));
     }
   }
-  return Join(lines, "<br/>");
+  return StrJoin(lines, "<br/>");
 }
 
 const HloInstruction* HloDotDumper::GetNodeForEdge(
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
index 1d7a062c55..064c53252c 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -23,12 +24,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace {
 
-using ::tensorflow::strings::StrCat;
+using absl::StrCat;
 using ::testing::HasSubstr;
 
 string TestName() {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 8690f2cdaa..2bb9de686f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -21,10 +21,17 @@ limitations under the License.
 #include <unordered_set>
 #include <utility>
 
+#include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -39,17 +46,15 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/human_readable_json.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
-using tensorflow::str_util::CEscape;
-using ::tensorflow::str_util::Join;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::CEscape;
+using absl::StrAppend;
+using absl::StrCat;
+using absl::StrJoin;
 
 /* static */
 StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
@@ -224,7 +229,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                             Literal::CreateFromProto(proto.literal()));
         instruction = CreateConstant(std::move(literal));
       } else {
-        instruction = MakeUnique<HloConstantInstruction>(proto.shape());
+        instruction = absl::make_unique<HloConstantInstruction>(proto.shape());
       }
       break;
     }
@@ -281,41 +286,28 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     case HloOpcode::kInfeed: {
       const Shape& data_shape =
           ShapeUtil::GetTupleElementShape(proto.shape(), 0);
-      if (proto.operand_ids_size() == 0) {
-        // TODO(b/80000000): Remove this when all uses of infeed are
-        // converted to take tokens.
-        instruction = CreateInfeed(data_shape, proto.infeed_config());
-      } else {
-        CHECK_EQ(proto.operand_ids_size(), 1);
-        instruction =
-            CreateInfeed(data_shape, operands(0), proto.infeed_config());
-      }
+      TF_RET_CHECK(proto.operand_ids_size() == 1);
+      instruction =
+          CreateInfeed(data_shape, operands(0), proto.infeed_config());
     } break;
     case HloOpcode::kOutfeed:
-      if (proto.operand_ids_size() == 1) {
-        // TODO(b/80000000): Remove this when all uses of outfeed are
-        // converted to take tokens.
-        instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
-                                    proto.outfeed_config());
-      } else {
-        CHECK_EQ(proto.operand_ids_size(), 2);
-        instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
-                                    operands(1), proto.outfeed_config());
-      }
+      TF_RET_CHECK(proto.operand_ids_size() == 2);
+      instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
+                                  operands(1), proto.outfeed_config());
       break;
     case HloOpcode::kCrossReplicaSum: {
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "CrossReplicaSum should have 1 called computation but sees "
           << proto.called_computation_ids_size();
-      tensorflow::gtl::optional<int64> all_reduce_id;
+      absl::optional<int64> all_reduce_id;
       if (proto.all_reduce_id() > 0) {
         all_reduce_id = proto.all_reduce_id();
       }
       instruction = CreateCrossReplicaSum(
           proto.shape(), all_operands(), computations(0),
-          /*replica_group_ids=*/
-          std::vector<int64>(proto.replica_group_ids().begin(),
-                             proto.replica_group_ids().end()),
+          /*replica_groups=*/
+          std::vector<ReplicaGroup>(proto.replica_groups().begin(),
+                                    proto.replica_groups().end()),
           /*barrier=*/proto.cross_replica_sum_barrier(),
           /*all_reduce_id=*/all_reduce_id);
       break;
@@ -325,8 +317,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           proto.shape(), all_operands(),
           /*replica_groups=*/
           std::vector<ReplicaGroup>(proto.replica_groups().begin(),
-                                    proto.replica_groups().end()),
-          /*barrier=*/proto.cross_replica_sum_barrier());
+                                    proto.replica_groups().end()));
       break;
     }
     case HloOpcode::kConvolution:
@@ -335,9 +326,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_window());
       TF_RET_CHECK(proto.has_convolution_dimension_numbers());
-      instruction =
-          CreateConvolve(proto.shape(), operands(0), operands(1),
-                         proto.window(), proto.convolution_dimension_numbers());
+      instruction = CreateConvolve(
+          proto.shape(), operands(0), operands(1), proto.window(),
+          proto.convolution_dimension_numbers(),
+          std::max(static_cast<int64>(proto.feature_group_count()), 1LL));
       break;
     case HloOpcode::kReduceWindow:
       TF_RET_CHECK(proto.operand_ids_size() == 2)
@@ -373,11 +365,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                 proto.convolution_dimension_numbers());
       }
       break;
-    case HloOpcode::kHostCompute:
-      instruction =
-          CreateHostCompute(proto.shape(), all_operands(), proto.channel_name(),
-                            proto.cost_estimate_ns());
-      break;
     case HloOpcode::kPad:
       TF_RET_CHECK(proto.operand_ids_size() == 2)
           << "Pad instruction should have 2 operands but sees "
@@ -391,7 +378,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "DynamicSlice instruction should have 2 operands but sees "
           << proto.operand_ids_size();
       std::vector<int64> slice_sizes(proto.dynamic_slice_sizes_size());
-      c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin());
+      absl::c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin());
       instruction = CreateDynamicSlice(proto.shape(), operands(0), operands(1),
                                        slice_sizes);
       break;
@@ -403,14 +390,14 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.has_gather_dimension_numbers())
           << "Gather instruction should have GatherDimensionNumbers set.";
       std::unique_ptr<GatherDimensionNumbers> gather_dimension_numbers =
-          MakeUnique<GatherDimensionNumbers>(proto.gather_dimension_numbers());
-      std::vector<int64> gather_window_bounds;
-      for (int64 bound : proto.gather_window_bounds()) {
-        gather_window_bounds.push_back(bound);
+          absl::make_unique<GatherDimensionNumbers>(
+              proto.gather_dimension_numbers());
+      std::vector<int64> gather_slice_sizes;
+      for (int64 bound : proto.gather_slice_sizes()) {
+        gather_slice_sizes.push_back(bound);
       }
-      instruction =
-          CreateGather(proto.shape(), operands(0), operands(1),
-                       *gather_dimension_numbers, gather_window_bounds);
+      instruction = CreateGather(proto.shape(), operands(0), operands(1),
+                                 *gather_dimension_numbers, gather_slice_sizes);
       break;
     }
     case HloOpcode::kScatter: {
@@ -422,15 +409,16 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "Scatter instruction should have 1 called computation but sees "
           << proto.called_computation_ids_size();
-      auto scatter_dimension_numbers = MakeUnique<ScatterDimensionNumbers>(
-          proto.scatter_dimension_numbers());
+      auto scatter_dimension_numbers =
+          absl::make_unique<ScatterDimensionNumbers>(
+              proto.scatter_dimension_numbers());
       instruction =
           CreateScatter(proto.shape(), operands(0), operands(1), operands(2),
                         computations(0), *scatter_dimension_numbers);
       break;
     }
     default: {
-      instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
+      instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape()));
       for (const int64 operand_id : proto.operand_ids()) {
         TF_RET_CHECK(ContainsKey(instruction_map, operand_id))
             << "No instruction with id " << operand_id;
@@ -458,10 +446,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   instruction->SetAndSanitizeName(proto.name());
   instruction->metadata_ = proto.metadata();
   instruction->backend_config_ = proto.backend_config();
+  instruction->precision_config_ = proto.precision_config();
 
   if (proto.has_dot_dimension_numbers()) {
     instruction->dot_dimension_numbers_ =
-        MakeUnique<DotDimensionNumbers>(proto.dot_dimension_numbers());
+        absl::make_unique<DotDimensionNumbers>(proto.dot_dimension_numbers());
   }
 
   if (proto.has_sharding()) {
@@ -475,34 +464,36 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateParameter(
     int64 parameter_number, const Shape& shape, const string& name) {
-  return MakeUnique<HloParameterInstruction>(parameter_number, shape, name);
+  return absl::make_unique<HloParameterInstruction>(parameter_number, shape,
+                                                    name);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTrace(
     const string& tag, HloInstruction* operand) {
-  return MakeUnique<HloTraceInstruction>(tag, operand);
+  return absl::make_unique<HloTraceInstruction>(tag, operand);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConstant(
     std::unique_ptr<Literal> literal) {
-  return MakeUnique<HloConstantInstruction>(std::move(literal));
+  return absl::make_unique<HloConstantInstruction>(std::move(literal));
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateIota(
     const Shape& shape) {
-  return WrapUnique(new HloInstruction(HloOpcode::kIota, shape));
+  return absl::WrapUnique(new HloInstruction(HloOpcode::kIota, shape));
 }
 
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateGetTupleElement(const Shape& shape,
                                       HloInstruction* operand, int64 index) {
-  return MakeUnique<HloGetTupleElementInstruction>(shape, operand, index);
+  return absl::make_unique<HloGetTupleElementInstruction>(shape, operand,
+                                                          index);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRng(
     const Shape& shape, RandomDistribution distribution,
     tensorflow::gtl::ArraySlice<HloInstruction*> parameters) {
-  return MakeUnique<HloRngInstruction>(shape, distribution, parameters);
+  return absl::make_unique<HloRngInstruction>(shape, distribution, parameters);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateNary(
@@ -512,7 +503,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     // It is impossible to copy an opaque shape, we don't know how big it is.
     CHECK(!ShapeUtil::IsOpaque(shape));
   }
-  auto instruction = WrapUnique(new HloInstruction(opcode, shape));
+  auto instruction = absl::WrapUnique(new HloInstruction(opcode, shape));
   for (auto operand : operands) {
     instruction->AppendOperand(operand);
   }
@@ -617,31 +608,33 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateMap(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     HloComputation* map_computation) {
-  return MakeUnique<HloMapInstruction>(shape, operands, map_computation);
+  return absl::make_unique<HloMapInstruction>(shape, operands, map_computation);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConvolve(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
-    const Window& window,
-    const ConvolutionDimensionNumbers& dimension_numbers) {
-  return MakeUnique<HloConvolutionInstruction>(shape, lhs, rhs, window,
-                                               dimension_numbers);
+    const Window& window, const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count) {
+  return absl::make_unique<HloConvolutionInstruction>(
+      shape, lhs, rhs, window, dimension_numbers, feature_group_count);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFft(
     const Shape& shape, HloInstruction* operand, FftType fft_type,
     tensorflow::gtl::ArraySlice<int64> fft_length) {
-  return MakeUnique<HloFftInstruction>(shape, operand, fft_type, fft_length);
+  return absl::make_unique<HloFftInstruction>(shape, operand, fft_type,
+                                              fft_length);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDot(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
     const DotDimensionNumbers& dimension_numbers) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
+  auto instruction =
+      absl::WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
   instruction->AppendOperand(lhs);
   instruction->AppendOperand(rhs);
   instruction->dot_dimension_numbers_ =
-      MakeUnique<DotDimensionNumbers>(dimension_numbers);
+      absl::make_unique<DotDimensionNumbers>(dimension_numbers);
   return instruction;
 }
 
@@ -650,10 +643,12 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
   CHECK_EQ(ShapeUtil::Rank(lhs->shape()), 2);
   CHECK_EQ(ShapeUtil::Rank(rhs->shape()), 2);
 
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
+  auto instruction =
+      absl::WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
   instruction->AppendOperand(lhs);
   instruction->AppendOperand(rhs);
-  instruction->dot_dimension_numbers_ = MakeUnique<DotDimensionNumbers>();
+  instruction->dot_dimension_numbers_ =
+      absl::make_unique<DotDimensionNumbers>();
   instruction->dot_dimension_numbers_->add_lhs_contracting_dimensions(1);
   instruction->dot_dimension_numbers_->add_rhs_contracting_dimensions(0);
   return instruction;
@@ -664,7 +659,7 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
                                       HloInstruction* operand,
                                       const int exponent_bits,
                                       const int mantissa_bits) {
-  return MakeUnique<HloReducePrecisionInstruction>(
+  return absl::make_unique<HloReducePrecisionInstruction>(
       shape, operand, exponent_bits, mantissa_bits);
 }
 
@@ -672,52 +667,39 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
 HloInstruction::CreateCrossReplicaSum(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     HloComputation* reduce_computation,
-    tensorflow::gtl::ArraySlice<int64> replica_group_ids,
-    tensorflow::StringPiece barrier,
-    const tensorflow::gtl::optional<int64>& all_reduce_id) {
-  return MakeUnique<HloAllReduceInstruction>(
-      shape, operands, reduce_computation, replica_group_ids, barrier,
+    const std::vector<ReplicaGroup>& replica_groups, absl::string_view barrier,
+    const absl::optional<int64>& all_reduce_id) {
+  return absl::make_unique<HloAllReduceInstruction>(
+      shape, operands, reduce_computation, replica_groups, barrier,
       all_reduce_id);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateAllToAll(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    const std::vector<ReplicaGroup>& replica_groups,
-    tensorflow::StringPiece barrier) {
-  return MakeUnique<HloAllToAllInstruction>(shape, operands, replica_groups,
-                                            barrier);
+    const std::vector<ReplicaGroup>& replica_groups) {
+  return absl::make_unique<HloAllToAllInstruction>(shape, operands,
+                                                   replica_groups);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
     const Shape& infeed_shape, HloInstruction* token_operand,
     const string& config) {
-  return MakeUnique<HloInfeedInstruction>(infeed_shape, token_operand, config);
-}
-
-/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
-    const Shape& infeed_shape, const string& config) {
-  return MakeUnique<HloInfeedInstruction>(infeed_shape, config);
-}
-
-/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateOutfeed(
-    const Shape& outfeed_shape, HloInstruction* operand,
-    HloInstruction* token_operand, tensorflow::StringPiece outfeed_config) {
-  return MakeUnique<HloOutfeedInstruction>(outfeed_shape, operand,
-                                           token_operand, outfeed_config);
+  return absl::make_unique<HloInfeedInstruction>(infeed_shape, token_operand,
+                                                 config);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateOutfeed(
     const Shape& outfeed_shape, HloInstruction* operand,
-    tensorflow::StringPiece outfeed_config) {
-  return MakeUnique<HloOutfeedInstruction>(outfeed_shape, operand,
-                                           outfeed_config);
+    HloInstruction* token_operand, absl::string_view outfeed_config) {
+  return absl::make_unique<HloOutfeedInstruction>(
+      outfeed_shape, operand, token_operand, outfeed_config);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSend(
     HloInstruction* operand, HloInstruction* token, int64 channel_id,
     bool is_host_transfer) {
-  return MakeUnique<HloSendInstruction>(operand, token, channel_id,
-                                        is_host_transfer);
+  return absl::make_unique<HloSendInstruction>(operand, token, channel_id,
+                                               is_host_transfer);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSendDone(
@@ -725,14 +707,15 @@ HloInstruction::CreateCrossReplicaSum(
   auto send_operand = DynCast<HloSendInstruction>(operand);
   CHECK(send_operand != nullptr)
       << "SendDone must take the context operand from Send";
-  return MakeUnique<HloSendDoneInstruction>(send_operand, is_host_transfer);
+  return absl::make_unique<HloSendDoneInstruction>(send_operand,
+                                                   is_host_transfer);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRecv(
     const Shape& shape, HloInstruction* token, int64 channel_id,
     bool is_host_transfer) {
-  return MakeUnique<HloRecvInstruction>(shape, token, channel_id,
-                                        is_host_transfer);
+  return absl::make_unique<HloRecvInstruction>(shape, token, channel_id,
+                                               is_host_transfer);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRecvDone(
@@ -740,19 +723,20 @@ HloInstruction::CreateCrossReplicaSum(
   auto recv_operand = DynCast<HloRecvInstruction>(operand);
   CHECK(recv_operand != nullptr)
       << "RecvDone must take the context operand from Recv";
-  return MakeUnique<HloRecvDoneInstruction>(recv_operand, is_host_transfer);
+  return absl::make_unique<HloRecvDoneInstruction>(recv_operand,
+                                                   is_host_transfer);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReverse(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return MakeUnique<HloReverseInstruction>(shape, operand, dimensions);
+  return absl::make_unique<HloReverseInstruction>(shape, operand, dimensions);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateAfterAll(
     tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
   CHECK(!operands.empty());
-  auto instruction = WrapUnique(
+  auto instruction = absl::WrapUnique(
       new HloInstruction(HloOpcode::kAfterAll, ShapeUtil::MakeTokenShape()));
   for (auto operand : operands) {
     instruction->AppendOperand(operand);
@@ -761,14 +745,15 @@ HloInstruction::CreateCrossReplicaSum(
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateToken() {
-  return WrapUnique(
+  return absl::WrapUnique(
       new HloInstruction(HloOpcode::kAfterAll, ShapeUtil::MakeTokenShape()));
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateWhile(
     const Shape& shape, HloComputation* condition, HloComputation* body,
     HloInstruction* init) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kWhile, shape));
+  auto instruction =
+      absl::WrapUnique(new HloInstruction(HloOpcode::kWhile, shape));
   instruction->AppendOperand(init);
   // Body comes before condition computation in the vector.
   instruction->called_computations_.push_back(body);
@@ -781,7 +766,7 @@ HloInstruction::CreateCrossReplicaSum(
     HloInstruction* true_computation_arg, HloComputation* true_computation,
     HloInstruction* false_computation_arg, HloComputation* false_computation) {
   auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kConditional, shape));
+      absl::WrapUnique(new HloInstruction(HloOpcode::kConditional, shape));
   instruction->AppendOperand(pred);
   instruction->AppendOperand(true_computation_arg);
   instruction->AppendOperand(false_computation_arg);
@@ -798,15 +783,15 @@ HloInstruction::CreateCrossReplicaSum(
     tensorflow::gtl::ArraySlice<int64> start_indices,
     tensorflow::gtl::ArraySlice<int64> limit_indices,
     tensorflow::gtl::ArraySlice<int64> strides) {
-  return MakeUnique<HloSliceInstruction>(shape, operand, start_indices,
-                                         limit_indices, strides);
+  return absl::make_unique<HloSliceInstruction>(shape, operand, start_indices,
+                                                limit_indices, strides);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDynamicSlice(
     const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
     tensorflow::gtl::ArraySlice<int64> slice_sizes) {
-  return MakeUnique<HloDynamicSliceInstruction>(shape, operand, start_indices,
-                                                slice_sizes);
+  return absl::make_unique<HloDynamicSliceInstruction>(
+      shape, operand, start_indices, slice_sizes);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -814,8 +799,8 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
                                          HloInstruction* operand,
                                          HloInstruction* update,
                                          HloInstruction* start_indices) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kDynamicUpdateSlice, shape));
+  auto instruction = absl::WrapUnique(
+      new HloInstruction(HloOpcode::kDynamicUpdateSlice, shape));
   instruction->AppendOperand(operand);
   instruction->AppendOperand(update);
   instruction->AppendOperand(start_indices);
@@ -825,12 +810,14 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConcatenate(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     int64 dimension) {
-  return MakeUnique<HloConcatenateInstruction>(shape, operands, dimension);
+  return absl::make_unique<HloConcatenateInstruction>(shape, operands,
+                                                      dimension);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConvert(
     const Shape& shape, HloInstruction* operand) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kConvert, shape));
+  auto instruction =
+      absl::WrapUnique(new HloInstruction(HloOpcode::kConvert, shape));
   instruction->AppendOperand(operand);
   return instruction;
 }
@@ -839,7 +826,7 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
 HloInstruction::CreateBitcastConvert(const Shape& shape,
                                      HloInstruction* operand) {
   auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kBitcastConvert, shape));
+      absl::WrapUnique(new HloInstruction(HloOpcode::kBitcastConvert, shape));
   instruction->AppendOperand(operand);
   return instruction;
 }
@@ -848,7 +835,7 @@ HloInstruction::CreateBitcastConvert(const Shape& shape,
     const Shape& shape, HloInstruction* operand, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
     HloComputation* reduce_computation) {
-  auto instruction = WrapUnique(new HloReduceInstruction(
+  auto instruction = absl::WrapUnique(new HloReduceInstruction(
       shape, {operand, init_value}, dimensions_to_reduce, reduce_computation));
   return std::move(instruction);
 }
@@ -862,15 +849,15 @@ HloInstruction::CreateBitcastConvert(const Shape& shape,
   all_args.reserve(operands.size() * 2);
   all_args.insert(all_args.end(), operands.begin(), operands.end());
   all_args.insert(all_args.end(), init_values.begin(), init_values.end());
-  return MakeUnique<HloReduceInstruction>(shape, all_args, dimensions_to_reduce,
-                                          reduce_computation);
+  return absl::make_unique<HloReduceInstruction>(
+      shape, all_args, dimensions_to_reduce, reduce_computation);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReduceWindow(
     const Shape& shape, HloInstruction* operand, HloInstruction* init_value,
     const Window& window, HloComputation* reduce_computation) {
-  return MakeUnique<HloReduceWindowInstruction>(shape, operand, init_value,
-                                                window, reduce_computation);
+  return absl::make_unique<HloReduceWindowInstruction>(
+      shape, operand, init_value, window, reduce_computation);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -879,7 +866,7 @@ HloInstruction::CreateBatchNormTraining(const Shape& shape,
                                         HloInstruction* scale,
                                         HloInstruction* offset, float epsilon,
                                         int64 feature_index) {
-  return MakeUnique<HloBatchNormTrainingInstruction>(
+  return absl::make_unique<HloBatchNormTrainingInstruction>(
       shape, operand, scale, offset, epsilon, feature_index);
 }
 
@@ -888,7 +875,7 @@ HloInstruction::CreateBatchNormInference(
     const Shape& shape, HloInstruction* operand, HloInstruction* scale,
     HloInstruction* offset, HloInstruction* mean, HloInstruction* variance,
     float epsilon, int64 feature_index) {
-  return MakeUnique<HloBatchNormInferenceInstruction>(
+  return absl::make_unique<HloBatchNormInferenceInstruction>(
       shape, operand, scale, offset, mean, variance, epsilon, feature_index);
 }
 
@@ -898,9 +885,9 @@ HloInstruction::CreateBatchNormGrad(const Shape& shape, HloInstruction* operand,
                                     HloInstruction* variance,
                                     HloInstruction* grad_output, float epsilon,
                                     int64 feature_index) {
-  return MakeUnique<HloBatchNormGradInstruction>(shape, operand, scale, mean,
-                                                 variance, grad_output, epsilon,
-                                                 feature_index);
+  return absl::make_unique<HloBatchNormGradInstruction>(
+      shape, operand, scale, mean, variance, grad_output, epsilon,
+      feature_index);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -908,15 +895,15 @@ HloInstruction::CreateSelectAndScatter(
     const Shape& shape, HloInstruction* operand, HloComputation* select,
     const Window& window, HloInstruction* source, HloInstruction* init_value,
     HloComputation* scatter) {
-  return MakeUnique<HloSelectAndScatterInstruction>(
+  return absl::make_unique<HloSelectAndScatterInstruction>(
       shape, operand, select, window, source, init_value, scatter);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateBroadcast(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return MakeUnique<HloBroadcastInstruction>(shape, operand,
-                                             broadcast_dimensions);
+  return absl::make_unique<HloBroadcastInstruction>(shape, operand,
+                                                    broadcast_dimensions);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -974,8 +961,8 @@ HloInstruction::CreateBroadcastSequence(
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreatePad(
     const Shape& shape, HloInstruction* operand, HloInstruction* padding_value,
     const PaddingConfig& padding_config) {
-  return MakeUnique<HloPadInstruction>(shape, operand, padding_value,
-                                       padding_config);
+  return absl::make_unique<HloPadInstruction>(shape, operand, padding_value,
+                                              padding_config);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReshape(
@@ -984,7 +971,8 @@ HloInstruction::CreateBroadcastSequence(
            ShapeUtil::ElementsIn(operand->shape()))
       << "shape: " << ShapeUtil::HumanString(shape)
       << " operand: " << ShapeUtil::HumanString(operand->shape());
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReshape, shape));
+  auto instruction =
+      absl::WrapUnique(new HloInstruction(HloOpcode::kReshape, shape));
   instruction->AppendOperand(operand);
   return instruction;
 }
@@ -992,26 +980,27 @@ HloInstruction::CreateBroadcastSequence(
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTranspose(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return MakeUnique<HloTransposeInstruction>(shape, operand, dimensions);
+  return absl::make_unique<HloTransposeInstruction>(shape, operand, dimensions);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSort(
     const Shape& shape, int64 dimension, HloInstruction* keys,
     HloInstruction* values) {
-  return MakeUnique<HloSortInstruction>(shape, dimension, keys, values);
+  return absl::make_unique<HloSortInstruction>(shape, dimension, keys, values);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
     const Shape& shape, FusionKind fusion_kind, HloInstruction* fused_root) {
-  return MakeUnique<HloFusionInstruction>(shape, fusion_kind, fused_root);
+  return absl::make_unique<HloFusionInstruction>(shape, fusion_kind,
+                                                 fused_root);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
     const Shape& shape, FusionKind fusion_kind,
     tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     HloComputation* fusion_computation) {
-  return MakeUnique<HloFusionInstruction>(shape, fusion_kind, operands,
-                                          fusion_computation);
+  return absl::make_unique<HloFusionInstruction>(shape, fusion_kind, operands,
+                                                 fusion_computation);
 }
 
 void HloInstruction::set_single_sharding(const HloSharding& sharding) {
@@ -1031,6 +1020,7 @@ void HloInstruction::SetupDerivedInstruction(
     derived_instruction->clear_sharding();
   }
   derived_instruction->set_metadata(metadata_);
+  derived_instruction->set_precision_config(precision_config_);
 }
 
 bool HloInstruction::HasSideEffectNoRecurse() const {
@@ -1043,7 +1033,6 @@ bool HloInstruction::HasSideEffectNoRecurse() const {
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
-    case HloOpcode::kHostCompute:
       return true;
     case HloOpcode::kCrossReplicaSum:
       return all_reduce_id().has_value();
@@ -1069,7 +1058,7 @@ bool HloInstruction::HasSideEffect() const {
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     HloComputation* computation) {
   std::unique_ptr<HloInstruction> instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kCall, shape));
+      absl::WrapUnique(new HloInstruction(HloOpcode::kCall, shape));
   for (auto operand : operands) {
     instruction->AppendOperand(operand);
   }
@@ -1079,16 +1068,9 @@ bool HloInstruction::HasSideEffect() const {
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCustomCall(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    tensorflow::StringPiece custom_call_target) {
-  return MakeUnique<HloCustomCallInstruction>(shape, operands,
-                                              custom_call_target);
-}
-
-/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateHostCompute(
-    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    tensorflow::StringPiece channel_name, const int64 cost_estimate_ns) {
-  return MakeUnique<HloHostComputeInstruction>(shape, operands, channel_name,
-                                               cost_estimate_ns);
+    absl::string_view custom_call_target) {
+  return absl::make_unique<HloCustomCallInstruction>(shape, operands,
+                                                     custom_call_target);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTuple(
@@ -1102,11 +1084,11 @@ bool HloInstruction::HasSideEffect() const {
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateGather(
-    const Shape& shape, HloInstruction* operand, HloInstruction* gather_indices,
+    const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
     const GatherDimensionNumbers& gather_dim_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds) {
-  return MakeUnique<HloGatherInstruction>(shape, operand, gather_indices,
-                                          gather_dim_numbers, window_bounds);
+    tensorflow::gtl::ArraySlice<int64> slice_sizes) {
+  return absl::make_unique<HloGatherInstruction>(
+      shape, operand, start_indices, gather_dim_numbers, slice_sizes);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateScatter(
@@ -1114,16 +1096,17 @@ bool HloInstruction::HasSideEffect() const {
     HloInstruction* scatter_indices, HloInstruction* updates,
     HloComputation* update_computation,
     const ScatterDimensionNumbers& scatter_dim_numbers) {
-  return MakeUnique<HloScatterInstruction>(shape, operand, scatter_indices,
-                                           updates, update_computation,
-                                           scatter_dim_numbers);
+  return absl::make_unique<HloScatterInstruction>(
+      shape, operand, scatter_indices, updates, update_computation,
+      scatter_dim_numbers);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDomain(
     const Shape& shape, HloInstruction* operand,
     std::unique_ptr<DomainMetadata> operand_side_metadata,
     std::unique_ptr<DomainMetadata> user_side_metadata) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDomain, shape));
+  auto instruction =
+      absl::WrapUnique(new HloInstruction(HloOpcode::kDomain, shape));
   instruction->operand_side_metadata_ = std::move(operand_side_metadata);
   instruction->user_side_metadata_ = std::move(user_side_metadata);
   instruction->AppendOperand(operand);
@@ -1177,7 +1160,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kCustomCall:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kSelectAndScatter:
-    case HloOpcode::kHostCompute:
     case HloOpcode::kPad:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kSort:
@@ -1299,6 +1281,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       }
       break;
   }
+  // SetupDerivedInstruction will setup the precision_config_ field.
   SetupDerivedInstruction(clone.get());
   clone->set_parent(parent_);
   clone->set_raw_backend_config_string(backend_config_);
@@ -1364,7 +1347,7 @@ std::unique_ptr<HloInstruction> HloInstruction::Clone(
         // If names ends with .suffix[0-9]+ then replace with a suffix with the
         // numeric value incremented.
         int64 numeric_suffix;
-        if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
+        if (absl::SimpleAtoi(after_suffix, &numeric_suffix)) {
           clone->name_ =
               StrCat(name().substr(0, index), dot_suffix, numeric_suffix + 1);
         } else {
@@ -1643,7 +1626,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kCustomCall:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kSelectAndScatter:
-    case HloOpcode::kHostCompute:
     case HloOpcode::kPad:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kGather:
@@ -1837,7 +1819,7 @@ void HloInstruction::set_false_computation(HloComputation* false_computation) {
 
 string HloInstruction::SignatureString() const {
   string operands =
-      Join(operands_, ", ", [](string* out, HloInstruction* operand) {
+      StrJoin(operands_, ", ", [](string* out, HloInstruction* operand) {
         StrAppend(out, ShapeUtil::HumanString(operand->shape()));
       });
   return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape()));
@@ -1857,7 +1839,7 @@ string HloInstruction::ToString(const HloPrintOptions& options) const {
 }
 
 bool HloInstruction::IsElementwiseImpl(
-    const tensorflow::gtl::optional<int64>& operand_idx) const {
+    const absl::optional<int64>& operand_idx) const {
   switch (opcode_) {
     // Unary elementwise operations.
     case HloOpcode::kAbs:
@@ -1984,7 +1966,7 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap(
       slice.size() > kMaxOperandsToShowIfCompact) {
     slice.remove_suffix(slice.size() - kMaxOperandsToShowIfCompact);
   }
-  operands = Join(slice, ", ", [&](string* out, HloInstruction* operand) {
+  operands = StrJoin(slice, ", ", [&](string* out, HloInstruction* operand) {
     // If operand is already been deleted, put `null` to the string output.
     if (operand == nullptr) {
       StrAppend(out, "null ");
@@ -2004,7 +1986,7 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap(
     } else if (!options.compact_operands()) {
       str.push_back(PrintName(operand->name(), options));
     }
-    StrAppend(out, Join(str, " "));
+    StrAppend(out, StrJoin(str, " "));
   });
   const int64 remaining = operands_.size() - slice.size();
   if (slice.size() != operands_.size()) {
@@ -2021,6 +2003,11 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
     extra.push_back(DotDimensionNumbersToString());
   }
 
+  string precision_config_string = PrecisionConfigToString();
+  if (!precision_config_string.empty()) {
+    extra.push_back(precision_config_string);
+  }
+
   if (options.print_subcomputation_mode() ==
       HloPrintOptions::PrintSubcomputationMode::kNameOnly) {
     if (opcode() == HloOpcode::kWhile) {
@@ -2045,8 +2032,9 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
       extra.push_back(
           StrCat("to_apply=", PrintName(to_apply()->name(), options)));
     } else if (!called_computations().empty()) {
-      extra.push_back(StrCat(
-          "calls=", Join(called_computations(), ", ",
+      extra.push_back(
+          StrCat("calls=",
+                 StrJoin(called_computations(), ", ",
                          [&](string* out, const HloComputation* computation) {
                            StrAppend(out,
                                      PrintName(computation->name(), options));
@@ -2083,12 +2071,12 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
         break;
       default:
         if (!called_computations().empty()) {
-          extra.push_back(
-              StrCat("calls=\n",
-                     Join(called_computations(), ", ",
-                          [&](string* out, const HloComputation* computation) {
-                            StrAppend(out, computation->ToString(new_options));
-                          })));
+          extra.push_back(StrCat(
+              "calls=\n",
+              StrJoin(called_computations(), ", ",
+                      [&](string* out, const HloComputation* computation) {
+                        StrAppend(out, computation->ToString(new_options));
+                      })));
         }
         break;
     }
@@ -2099,11 +2087,11 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
   }
   if (!control_predecessors_.empty()) {
     extra.push_back(StrCat("control-predecessors={",
-                           Join(control_predecessors_, ", ",
-                                [&](string* out, HloInstruction* pre) {
-                                  StrAppend(out,
-                                            PrintName(pre->name(), options));
-                                }),
+                           StrJoin(control_predecessors_, ", ",
+                                   [&](string* out, HloInstruction* pre) {
+                                     StrAppend(out,
+                                               PrintName(pre->name(), options));
+                                   }),
                            "}"));
   }
   if (operand_side_metadata_ != nullptr && user_side_metadata_ != nullptr) {
@@ -2117,10 +2105,10 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
 
 string HloInstruction::ToShortString() const {
   return StrCat("%", name(), " = ", HloOpcodeString(opcode()), "(",
-                Join(operands_, ", ",
-                     [](string* out, HloInstruction* operand) {
-                       StrAppend(out, "%", operand->name());
-                     }),
+                StrJoin(operands_, ", ",
+                        [](string* out, HloInstruction* operand) {
+                          StrAppend(out, "%", operand->name());
+                        }),
                 ")");
 }
 
@@ -2142,6 +2130,7 @@ HloInstructionProto HloInstruction::ToProto() const {
 
   *proto.mutable_metadata() = metadata_;
   proto.set_backend_config(backend_config_);
+  *proto.mutable_precision_config() = precision_config_;
   if (opcode() != HloOpcode::kFusion) {
     for (const HloComputation* computation : called_computations_) {
       proto.add_called_computation_ids(computation->unique_id());
@@ -2354,8 +2343,6 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleInfeed(this);
     case HloOpcode::kOutfeed:
       return visitor->HandleOutfeed(this);
-    case HloOpcode::kHostCompute:
-      return visitor->HandleHostCompute(this);
     case HloOpcode::kRng:
       return visitor->HandleRng(this);
     case HloOpcode::kWhile:
@@ -2401,8 +2388,7 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
 template Status HloInstruction::Visit(DfsHloVisitor* visitor);
 template Status HloInstruction::Visit(ConstDfsHloVisitor* visitor);
 
-using DFSStack =
-    tensorflow::gtl::InlinedVector<std::pair<int, HloInstruction*>, 16>;
+using DFSStack = absl::InlinedVector<std::pair<int, HloInstruction*>, 16>;
 
 // Push "child" onto the dfs_stack if not already visited.  Returns false if a
 // cycle was detected, and true otherwise.
@@ -2647,7 +2633,7 @@ bool HloInstruction::IsElementwiseBinary() const {
 }
 
 bool HloInstruction::IsElementwise() const {
-  return IsElementwiseImpl(tensorflow::gtl::nullopt);
+  return IsElementwiseImpl(absl::nullopt);
 }
 
 bool HloInstruction::ImplicitlyBroadcastsOperand(int64 operand_idx) const {
@@ -2812,7 +2798,7 @@ string PaddingConfigToString(const PaddingConfig& padding) {
                   [](const PaddingConfig::PaddingConfigDimension& dim) {
                     return dim.interior_padding() != 0;
                   });
-  return Join(
+  return StrJoin(
       padding.dimensions(), "x",
       [&](string* out, const PaddingConfig::PaddingConfigDimension& dim) {
         StrAppend(
@@ -2836,11 +2822,15 @@ string OpMetadataToString(const OpMetadata& metadata) {
   if (metadata.source_line() != 0) {
     result.push_back(StrCat("source_line=", metadata.source_line()));
   }
-  return Join(result, " ");
+  return StrJoin(result, " ");
 }
 
 string RandomDistributionToString(const RandomDistribution& distribution) {
-  return tensorflow::str_util::Lowercase(RandomDistribution_Name(distribution));
+  return absl::AsciiStrToLower(RandomDistribution_Name(distribution));
+}
+
+string PrecisionToString(const PrecisionConfigProto::Precision& precision) {
+  return absl::AsciiStrToLower(PrecisionConfigProto::Precision_Name(precision));
 }
 
 string ConvolutionDimensionNumbersToString(
@@ -2868,8 +2858,8 @@ string ConvolutionDimensionNumbersToString(
     output_dims[dnums.output_spatial_dimensions(i)] = StrCat(i);
   }
 
-  return StrCat(Join(lhs_dims, ""), "_", Join(rhs_dims, ""), "->",
-                Join(output_dims, ""));
+  return StrCat(StrJoin(lhs_dims, ""), "_", StrJoin(rhs_dims, ""), "->",
+                StrJoin(output_dims, ""));
 }
 
 string HloInstruction::DotDimensionNumbersToString() const {
@@ -2880,19 +2870,21 @@ string HloInstruction::DotDimensionNumbersToString() const {
   const DotDimensionNumbers& dnums = *dot_dimension_numbers_;
   if (!dnums.lhs_batch_dimensions().empty()) {
     result.push_back(StrCat("lhs_batch_dims={",
-                            Join(dnums.lhs_batch_dimensions(), ","), "}"));
+                            StrJoin(dnums.lhs_batch_dimensions(), ","), "}"));
   }
   result.push_back(StrCat("lhs_contracting_dims={",
-                          Join(dnums.lhs_contracting_dimensions(), ","), "}"));
+                          StrJoin(dnums.lhs_contracting_dimensions(), ","),
+                          "}"));
 
   if (!dnums.rhs_batch_dimensions().empty()) {
     result.push_back(StrCat("rhs_batch_dims={",
-                            Join(dnums.rhs_batch_dimensions(), ","), "}"));
+                            StrJoin(dnums.rhs_batch_dimensions(), ","), "}"));
   }
   result.push_back(StrCat("rhs_contracting_dims={",
-                          Join(dnums.rhs_contracting_dimensions(), ","), "}"));
+                          StrJoin(dnums.rhs_contracting_dimensions(), ","),
+                          "}"));
 
-  return Join(result, ", ");
+  return StrJoin(result, ", ");
 }
 
 StatusOr<RandomDistribution> StringToRandomDistribution(const string& name) {
@@ -2906,7 +2898,44 @@ StatusOr<RandomDistribution> StringToRandomDistribution(const string& name) {
     }
     return map;
   }();
-  auto found = map->find(tensorflow::str_util::Lowercase(name));
+  auto found = map->find(absl::AsciiStrToLower(name));
+  if (found == map->end()) {
+    return InvalidArgument("Unknown distribution");
+  }
+  return found->second;
+}
+
+string HloInstruction::PrecisionConfigToString() const {
+  if (precision_config_.operand_precision().empty()) {
+    return "";
+  }
+  return StrCat(
+      "operand_precision={",
+      StrJoin(precision_config_.operand_precision(), ",",
+              [](string* out, int32 precision) {
+                CHECK(PrecisionConfigProto::Precision_IsValid(precision))
+                    << precision;
+                StrAppend(out, PrecisionToString(
+                                   static_cast<PrecisionConfigProto::Precision>(
+                                       precision)));
+              }),
+      "}");
+}
+
+StatusOr<PrecisionConfigProto::Precision> StringToPrecision(
+    const string& name) {
+  static std::unordered_map<string, PrecisionConfigProto::Precision>* map = [] {
+    static auto* map =
+        new std::unordered_map<string, PrecisionConfigProto::Precision>;
+    for (int i = 0; i < PrecisionConfigProto::Precision_ARRAYSIZE; i++) {
+      if (PrecisionConfigProto::Precision_IsValid(i)) {
+        auto value = static_cast<PrecisionConfigProto::Precision>(i);
+        (*map)[PrecisionToString(value)] = value;
+      }
+    }
+    return map;
+  }();
+  auto found = map->find(absl::AsciiStrToLower(name));
   if (found == map->end()) {
     return InvalidArgument("Unknown distribution");
   }
@@ -3156,31 +3185,20 @@ const string& HloInstruction::outfeed_config() const {
   return Cast<HloOutfeedInstruction>(this)->outfeed_config();
 }
 
-const std::vector<int64>& HloInstruction::replica_group_ids() const {
-  return Cast<HloAllReduceInstruction>(this)->replica_group_ids();
-}
-
 const std::vector<ReplicaGroup>& HloInstruction::replica_groups() const {
-  return Cast<HloAllToAllInstruction>(this)->replica_groups();
+  return Cast<HloCollectiveInstruction>(this)->replica_groups();
 }
 
 string HloInstruction::cross_replica_sum_barrier() const {
-  if (opcode() == HloOpcode::kCrossReplicaSum) {
     return Cast<HloAllReduceInstruction>(this)->cross_replica_sum_barrier();
-  }
-  return Cast<HloAllToAllInstruction>(this)->cross_replica_sum_barrier();
 }
 
 void HloInstruction::set_cross_replica_sum_barrier(const string& barrier) {
-  if (opcode() == HloOpcode::kCrossReplicaSum) {
     return Cast<HloAllReduceInstruction>(this)->set_cross_replica_sum_barrier(
         barrier);
-  }
-  return Cast<HloAllToAllInstruction>(this)->set_cross_replica_sum_barrier(
-      barrier);
 }
 
-tensorflow::gtl::optional<int64> HloInstruction::all_reduce_id() const {
+absl::optional<int64> HloInstruction::all_reduce_id() const {
   return Cast<HloAllReduceInstruction>(this)->all_reduce_id();
 }
 
@@ -3206,6 +3224,10 @@ void HloInstruction::set_convolution_dimension_numbers(
   }
 }
 
+int64 HloInstruction::feature_group_count() const {
+  return Cast<HloConvolutionInstruction>(this)->feature_group_count();
+}
+
 HloComputation* HloInstruction::select() const {
   return Cast<HloSelectAndScatterInstruction>(this)->select();
 }
@@ -3226,10 +3248,6 @@ const string& HloInstruction::custom_call_target() const {
   return Cast<HloCustomCallInstruction>(this)->custom_call_target();
 }
 
-const string& HloInstruction::channel_name() const {
-  return Cast<HloHostComputeInstruction>(this)->channel_name();
-}
-
 const PaddingConfig& HloInstruction::padding_config() const {
   return Cast<HloPadInstruction>(this)->padding_config();
 }
@@ -3246,9 +3264,8 @@ const GatherDimensionNumbers& HloInstruction::gather_dimension_numbers() const {
   return Cast<HloGatherInstruction>(this)->gather_dimension_numbers();
 }
 
-tensorflow::gtl::ArraySlice<int64> HloInstruction::gather_window_bounds()
-    const {
-  return Cast<HloGatherInstruction>(this)->gather_window_bounds();
+tensorflow::gtl::ArraySlice<int64> HloInstruction::gather_slice_sizes() const {
+  return Cast<HloGatherInstruction>(this)->gather_slice_sizes();
 }
 
 const ScatterDimensionNumbers& HloInstruction::scatter_dimension_numbers()
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 3c575ae6ea..948e33a0a3 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -32,6 +32,10 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/iterator_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -45,10 +49,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/iterator_range.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -101,6 +103,7 @@ class HloPrintOptions {
     return HloPrintOptions()
         .set_print_subcomputation_mode(PrintSubcomputationMode::kFullBodies)
         .set_print_metadata(false)
+        .set_print_backend_config(false)
         .set_compact_operands(true)
         .set_print_operand_shape(true)
         .set_print_program_shape(false)
@@ -182,7 +185,7 @@ class HloPrintOptions {
     return print_subcomputation_mode_;
   }
   bool print_metadata() const { return print_metadata_; }
-  bool print_backend_config() const { return print_metadata_; }
+  bool print_backend_config() const { return print_backend_config_; }
   bool compact_operands() const { return compact_operands_; }
   bool print_operand_shape() const { return print_operand_shape_; }
   bool print_program_shape() const { return print_program_shape_; }
@@ -220,7 +223,7 @@ class CanonicalNameMap {
       return iter->second;
     }
 
-    string new_name = tensorflow::strings::StrCat("tmp_", index++);
+    string new_name = absl::StrCat("tmp_", index++);
     canonical_name_map[old_name] = new_name;
     return new_name;
   }
@@ -402,7 +405,8 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateConvolve(
       const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
       const Window& window,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1);
 
   // Creates an FFT op, of the type indicated by fft_type.
   static std::unique_ptr<HloInstruction> CreateFft(
@@ -432,9 +436,10 @@ class HloInstruction {
   //
   // `reduction_computation`: the reduction function.
   //
-  // `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
-  // replicas belong to one group. Allreduce will be applied within subgroups.
-  // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
+  // `replica_groups`: each ReplicaGroup contains a list of replica id. If
+  // empty, all replicas belong to one group in the order of 0 - (n-1).
+  // Allreduce will be applied within subgroups.
+  // For example, we have 4 replicas, then replica_groups={{0,2},{1,3}} means,
   // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
   //
   // `all_reduce_id`: for Allreduce nodes from different modules, if they have
@@ -445,9 +450,8 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateCrossReplicaSum(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       HloComputation* reduce_computation,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids,
-      tensorflow::StringPiece barrier,
-      const tensorflow::gtl::optional<int64>& all_reduce_id);
+      const std::vector<ReplicaGroup>& replica_groups,
+      absl::string_view barrier, const absl::optional<int64>& all_reduce_id);
 
   // This op handles the communication of an Alltoall operation. On each core,
   // the operands are N ops in the same shape, where N is the number of cores
@@ -462,12 +466,9 @@ class HloInstruction {
   // within replica 1, 2, 3, and in the gather phase, the received blocks will
   // be concatenated in the order of 1, 2, 3; another Alltoall will be applied
   // within replica 4, 5, 0, and the concatenation order is 4, 5, 0.
-  //
-  // TODO(b/110096724): This is NOT YET ready to use.
   static std::unique_ptr<HloInstruction> CreateAllToAll(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      const std::vector<ReplicaGroup>& replica_groups,
-      tensorflow::StringPiece barrier);
+      const std::vector<ReplicaGroup>& replica_groups);
 
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
@@ -486,24 +487,13 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateInfeed(
       const Shape& infeed_shape, HloInstruction* token_operand,
       const string& config);
-  // Overload which does not require a token.
-  // TODO(b/80000000): Remove this overload when all uses of infeed are
-  // converted to take tokens.
-  static std::unique_ptr<HloInstruction> CreateInfeed(const Shape& infeed_shape,
-                                                      const string& config);
 
   // Creates an outfeed instruction, which outputs data. outfeed_shape is the
   // shape of the data being outfed *not* the shape of the outfeed instruction
   // which is a TOKEN.
   static std::unique_ptr<HloInstruction> CreateOutfeed(
       const Shape& outfeed_shape, HloInstruction* operand,
-      HloInstruction* token_operand, tensorflow::StringPiece outfeed_config);
-  // Overload which does not require a token.
-  // TODO(b/80000000): Remove this overload when all uses of outfeed are
-  // converted to take tokens.
-  static std::unique_ptr<HloInstruction> CreateOutfeed(
-      const Shape& outfeed_shape, HloInstruction* operand,
-      tensorflow::StringPiece outfeed_config);
+      HloInstruction* token_operand, absl::string_view outfeed_config);
 
   // Creates an asynchronous send instruction with the given channel id, which
   // initiates sending the operand data to a unique receive instruction in
@@ -677,9 +667,9 @@ class HloInstruction {
 
   static std::unique_ptr<HloInstruction> CreateGather(
       const Shape& shape, HloInstruction* operand,
-      HloInstruction* gather_indices,
+      HloInstruction* start_indices,
       const GatherDimensionNumbers& gather_dim_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds);
+      tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
   static std::unique_ptr<HloInstruction> CreateScatter(
       const Shape& shape, HloInstruction* operand,
@@ -716,13 +706,7 @@ class HloInstruction {
   // to the given operands. "shape" is the resultant shape.
   static std::unique_ptr<HloInstruction> CreateCustomCall(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      tensorflow::StringPiece custom_call_target);
-
-  // Creates a HostCompute instruction, which records host-side control and
-  // data dependencies for use in instruction scheduling.
-  static std::unique_ptr<HloInstruction> CreateHostCompute(
-      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      tensorflow::StringPiece channel_name, const int64 cost_estimate_ns);
+      absl::string_view custom_call_target);
 
   // Creates a tuple instruction with the given elements. This is a convenience
   // wrapper around CreateVariadic.
@@ -776,7 +760,7 @@ class HloInstruction {
   int64 operand_count() const { return operands_.size(); }
 
   // Returns the vector of operands of this instruction.
-  using InstructionVector = tensorflow::gtl::InlinedVector<HloInstruction*, 2>;
+  using InstructionVector = absl::InlinedVector<HloInstruction*, 2>;
   const InstructionVector& operands() const { return operands_; }
 
   // Returns the vector of unique operands, in the same order they are found
@@ -873,6 +857,11 @@ class HloInstruction {
       return false;
     }
 
+    if (!ContainersEqual(precision_config_.operand_precision(),
+                         other.precision_config_.operand_precision())) {
+      return false;
+    }
+
     return IdenticalSlowPath(other, eq_computations);
   }
 
@@ -1048,21 +1037,26 @@ class HloInstruction {
     CHECK(has_sharding());
     return *sharding_;
   }
+  std::shared_ptr<const HloSharding> sharding_ptr() const { return sharding_; }
+
   // Returns the sharding applied to this operator, or default_ if none exists.
   const HloSharding& sharding_or_default(const HloSharding& default_) const {
     return sharding_ ? *sharding_ : default_;
   }
   // Returns the sharding unique device, if any.
-  tensorflow::gtl::optional<int64> sharding_unique_device() const {
+  absl::optional<int64> sharding_unique_device() const {
     if (sharding_ == nullptr) {
-      return tensorflow::gtl::optional<int64>();
+      return absl::optional<int64>();
     }
     return sharding_->UniqueDevice();
   }
   // Sets the sharding of this operator. Should only be called by HloModule or
   // HloComputation methods.
   void set_sharding(const HloSharding& sharding) {
-    sharding_ = MakeUnique<HloSharding>(sharding);
+    sharding_ = std::make_shared<const HloSharding>(sharding);
+  }
+  void set_sharding(std::shared_ptr<const HloSharding> sharding) {
+    sharding_ = std::move(sharding);
   }
   void set_single_sharding(const HloSharding& sharding);
   // Sets a sharding that assigns the current instruction to device.
@@ -1098,19 +1092,6 @@ class HloInstruction {
   // instruction.
   void SetupDerivedInstruction(HloInstruction* derived_instruction) const;
 
-  // TODO(b/80249101): Remove these methods once HLO scheduling and copy
-  // insertion are integrated, and we don't need to run a separate pass
-  // of copy elision anymore.
-  bool CopyElisionAllowed() const {
-    CHECK_EQ(HloOpcode::kCopy, opcode_);
-    return copy_elision_allowed_;
-  }
-
-  void SetCopyElisionAllowed(bool value) {
-    CHECK_EQ(HloOpcode::kCopy, opcode_);
-    copy_elision_allowed_ = value;
-  }
-
   // Returns data on the dimension numbers used for a dot operation.
   const DotDimensionNumbers& dot_dimension_numbers() const {
     CHECK(dot_dimension_numbers_ != nullptr);
@@ -1120,6 +1101,9 @@ class HloInstruction {
   // Returns the dump string of the dot dimension numbers.
   string DotDimensionNumbersToString() const;
 
+  // Returns the dump string of the precision configuration.
+  string PrecisionConfigToString() const;
+
   // Clones the HLO instruction. The clone will have the same opcode, shape, and
   // operands. After creation the clone has no uses. "this" (the instruction
   // cloned from) is not changed. Suffix is the string to append to the name of
@@ -1263,6 +1247,20 @@ class HloInstruction {
   static StatusOr<string> BackendConfigToRawString(
       const tensorflow::protobuf::Message& proto);
 
+  // Returns the information used to tell the implementation information about
+  // what sort of precision is requested. The meaning of the field is backend
+  // specific. At the moment, it is only supported for kConvolution and kDot.
+  // Transformations on one kDot or kConvolution to another will preserve this
+  // information. Transformations to other HLOs will not preserve this
+  // information but it is presumed that the alternate lowering is strictly
+  // superior.
+  const PrecisionConfigProto& precision_config() const {
+    return precision_config_;
+  }
+  void set_precision_config(const PrecisionConfigProto& precision_config) {
+    precision_config_ = precision_config;
+  }
+
   // Sets the debug metadata for this instruction.
   void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; }
   const OpMetadata& metadata() const { return metadata_; }
@@ -1431,9 +1429,6 @@ class HloInstruction {
   // Returns the shape for the Outfeed instruction.
   const Shape& outfeed_shape() const;
 
-  // Delegates to HloAllReduceInstruction::replica_group_ids.
-  const std::vector<int64>& replica_group_ids() const;
-
   // Delegates to HloAllToAllInstruction::replica_groups.
   const std::vector<ReplicaGroup>& replica_groups() const;
 
@@ -1442,7 +1437,7 @@ class HloInstruction {
   void set_cross_replica_sum_barrier(const string& barrier);
 
   // Delegates to HloAllReduceInstruction::all_reduce_id.
-  tensorflow::gtl::optional<int64> all_reduce_id() const;
+  absl::optional<int64> all_reduce_id() const;
 
   // Returns data on the window in a windowed operation such as
   // convolution.
@@ -1466,6 +1461,10 @@ class HloInstruction {
   void set_convolution_dimension_numbers(
       const ConvolutionDimensionNumbers& dnums);
 
+  // The number of feature groups. Must be a divisor of the input feature
+  // dimension and output feature dimension.
+  int64 feature_group_count() const;
+
   // Delegates to HloSelectAndScatterInstruction::select.
   HloComputation* select() const;
 
@@ -1481,9 +1480,6 @@ class HloInstruction {
   // Delegates to HloCustomCallInstruction::custom_call_target.
   const string& custom_call_target() const;
 
-  // Delegates to HloHostComputeInstruction::channel_name.
-  const string& channel_name() const;
-
   // Delegates to HloPadInstruction::padding_config.
   const PaddingConfig& padding_config() const;
 
@@ -1495,8 +1491,8 @@ class HloInstruction {
 
   // Delegates to HloGatherInstruction::gather_dimension_numbers.
   const GatherDimensionNumbers& gather_dimension_numbers() const;
-  // Delegates to HloGatherInstruction::gather_window_bounds.
-  tensorflow::gtl::ArraySlice<int64> gather_window_bounds() const;
+  // Delegates to HloGatherInstruction::gather_slice_sizes.
+  tensorflow::gtl::ArraySlice<int64> gather_slice_sizes() const;
 
   // Delegates to HloScatterInstruction::scatter_dimension_numbers().
   const ScatterDimensionNumbers& scatter_dimension_numbers() const;
@@ -1571,7 +1567,7 @@ class HloInstruction {
   // NOTE: For all instructions other than kFusion, being elementwise on one of
   // the operands is equivalent to being elementwise on all the operands.
   virtual bool IsElementwiseImpl(
-      const tensorflow::gtl::optional<int64>& operand_idx) const;
+      const absl::optional<int64>& operand_idx) const;
   // Prints an instruction to a string.
   //
   // The canonical string representation needs to name operands and instruction
@@ -1648,7 +1644,10 @@ class HloInstruction {
   bool copy_elision_allowed_ = true;
 
   // The sharding, if one exists.
-  std::unique_ptr<HloSharding> sharding_;
+  // Uses std::shared_ptr to allow reuse of the same sharding object between
+  // HloInstructions and other components as HloSharding can be very large for
+  // many element tuples.
+  std::shared_ptr<const HloSharding> sharding_;
 
   // Fields used by the kDomain instruction.
   std::unique_ptr<DomainMetadata> operand_side_metadata_;
@@ -1667,6 +1666,10 @@ class HloInstruction {
   // HLO. See the documentation on backend_config().
   string backend_config_;
 
+  // Information used to communicate to the implementation about the algorithm
+  // used to produce results. See the documentation on precision_config().
+  PrecisionConfigProto precision_config_;
+
   // String identifier for instruction.
   string name_;
 
@@ -1689,10 +1692,12 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
 string PaddingConfigToString(const PaddingConfig& padding);
 string OpMetadataToString(const OpMetadata& metadata);
 string RandomDistributionToString(const RandomDistribution& distribution);
+string PrecisionToString(const PrecisionConfigProto::Precision& precision);
 string ConvolutionDimensionNumbersToString(
     const ConvolutionDimensionNumbers& dnums);
 
 StatusOr<RandomDistribution> StringToRandomDistribution(const string& name);
+StatusOr<PrecisionConfigProto::Precision> StringToPrecision(const string& name);
 
 std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind);
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 8a694dde80..504b13043f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -1355,7 +1355,7 @@ TEST_F(HloInstructionTest, Stringification) {
 
 TEST_F(HloInstructionTest, StringifyGather_0) {
   Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46});
-  Shape gather_indices_tensor_shape =
+  Shape start_indices_tensor_shape =
       ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 5});
   Shape gather_result_shape =
       ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26});
@@ -1363,19 +1363,18 @@ TEST_F(HloInstructionTest, StringifyGather_0) {
   HloComputation::Builder builder("Gather");
   HloInstruction* input = builder.AddInstruction(
       HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor"));
-  HloInstruction* gather_indices =
+  HloInstruction* start_indices =
       builder.AddInstruction(HloInstruction::CreateParameter(
-          1, gather_indices_tensor_shape, "gather_indices"));
-
-  HloInstruction* gather_instruction =
-      builder.AddInstruction(HloInstruction::CreateGather(
-          gather_result_shape, input, gather_indices,
-          HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
-              /*index_vector_dim=*/4),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+          1, start_indices_tensor_shape, "start_indices"));
+
+  HloInstruction* gather_instruction = builder.AddInstruction(
+      HloInstruction::CreateGather(gather_result_shape, input, start_indices,
+                                   HloGatherInstruction::MakeGatherDimNumbers(
+                                       /*offset_dims=*/{4, 5, 6, 7, 8},
+                                       /*collapsed_slice_dims=*/{},
+                                       /*start_index_map=*/{0, 1, 2, 3, 4},
+                                       /*index_vector_dim=*/4),
+                                   /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
@@ -1383,15 +1382,15 @@ TEST_F(HloInstructionTest, StringifyGather_0) {
   EXPECT_EQ(gather_instruction->ToString(),
             "%gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} "
             "gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, "
-            "s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), "
-            "output_window_dims={4,5,6,7,8}, elided_window_dims={}, "
-            "gather_dims_to_operand_dims={0,1,2,3,4}, "
-            "index_vector_dim=4, window_bounds={30,29,28,27,26}");
+            "s64[10,9,8,7,5]{4,3,2,1,0} %start_indices), "
+            "offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, "
+            "start_index_map={0,1,2,3,4}, "
+            "index_vector_dim=4, slice_sizes={30,29,28,27,26}");
 }
 
 TEST_F(HloInstructionTest, StringifyGather_1) {
   Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46});
-  Shape gather_indices_tensor_shape =
+  Shape start_indices_tensor_shape =
       ShapeUtil::MakeShape(S64, {10, 9, 5, 7, 6});
   Shape gather_result_shape =
       ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26});
@@ -1399,19 +1398,18 @@ TEST_F(HloInstructionTest, StringifyGather_1) {
   HloComputation::Builder builder("Gather");
   HloInstruction* input = builder.AddInstruction(
       HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor"));
-  HloInstruction* gather_indices =
+  HloInstruction* start_indices =
       builder.AddInstruction(HloInstruction::CreateParameter(
-          1, gather_indices_tensor_shape, "gather_indices"));
-
-  HloInstruction* gather_instruction =
-      builder.AddInstruction(HloInstruction::CreateGather(
-          gather_result_shape, input, gather_indices,
-          HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
-              /*index_vector_dim=*/2),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+          1, start_indices_tensor_shape, "start_indices"));
+
+  HloInstruction* gather_instruction = builder.AddInstruction(
+      HloInstruction::CreateGather(gather_result_shape, input, start_indices,
+                                   HloGatherInstruction::MakeGatherDimNumbers(
+                                       /*offset_dims=*/{4, 5, 6, 7, 8},
+                                       /*collapsed_slice_dims=*/{},
+                                       /*start_index_map=*/{0, 1, 2, 3, 4},
+                                       /*index_vector_dim=*/2),
+                                   /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
@@ -1419,10 +1417,10 @@ TEST_F(HloInstructionTest, StringifyGather_1) {
   EXPECT_EQ(gather_instruction->ToString(),
             "%gather = f32[10,9,7,6,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} "
             "gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, "
-            "s64[10,9,5,7,6]{4,3,2,1,0} %gather_indices), "
-            "output_window_dims={4,5,6,7,8}, elided_window_dims={}, "
-            "gather_dims_to_operand_dims={0,1,2,3,4}, "
-            "index_vector_dim=2, window_bounds={30,29,28,27,26}");
+            "s64[10,9,5,7,6]{4,3,2,1,0} %start_indices), "
+            "offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, "
+            "start_index_map={0,1,2,3,4}, "
+            "index_vector_dim=2, slice_sizes={30,29,28,27,26}");
 }
 
 TEST_F(HloInstructionTest, StringifyScatter) {
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 1de5032670..a0de253eda 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -17,6 +17,12 @@ limitations under the License.
 
 #include <deque>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -27,10 +33,10 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using ::tensorflow::str_util::CEscape;
-using ::tensorflow::str_util::Join;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::CEscape;
+using absl::StrAppend;
+using absl::StrCat;
+using absl::StrJoin;
 
 bool IsInstructionElementwiseOnOperand(const HloInstruction* instruction,
                                        const HloInstruction* operand) {
@@ -89,7 +95,7 @@ HloBatchNormTrainingInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 3);
-  return MakeUnique<HloBatchNormTrainingInstruction>(
+  return absl::make_unique<HloBatchNormTrainingInstruction>(
       shape, new_operands[0], new_operands[1], new_operands[2], epsilon(),
       feature_index());
 }
@@ -111,7 +117,7 @@ HloBatchNormInferenceInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 5);
-  return MakeUnique<HloBatchNormInferenceInstruction>(
+  return absl::make_unique<HloBatchNormInferenceInstruction>(
       shape, new_operands[0], new_operands[1], new_operands[2], new_operands[3],
       new_operands[4], epsilon(), feature_index());
 }
@@ -133,7 +139,7 @@ HloBatchNormGradInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 5);
-  return MakeUnique<HloBatchNormGradInstruction>(
+  return absl::make_unique<HloBatchNormGradInstruction>(
       shape, new_operands[0], new_operands[1], new_operands[2], new_operands[3],
       new_operands[4], epsilon(), feature_index());
 }
@@ -158,7 +164,7 @@ HloInstructionProto HloFftInstruction::ToProto() const {
 std::vector<string> HloFftInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
   return {StrCat("fft_type=", FftType_Name(fft_type())),
-          StrCat("fft_length={", Join(fft_length(), ","), "}")};
+          StrCat("fft_length={", StrJoin(fft_length(), ","), "}")};
 }
 
 bool HloFftInstruction::IdenticalSlowPath(
@@ -175,8 +181,8 @@ std::unique_ptr<HloInstruction> HloFftInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloFftInstruction>(shape, new_operands[0], fft_type_,
-                                       fft_length_);
+  return absl::make_unique<HloFftInstruction>(shape, new_operands[0], fft_type_,
+                                              fft_length_);
 }
 
 HloSendRecvInstruction::HloSendRecvInstruction(HloOpcode opcode,
@@ -230,8 +236,8 @@ std::unique_ptr<HloInstruction> HloSendInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
-  return MakeUnique<HloSendInstruction>(new_operands[0], new_operands[1],
-                                        channel_id(), is_host_transfer());
+  return absl::make_unique<HloSendInstruction>(
+      new_operands[0], new_operands[1], channel_id(), is_host_transfer());
 }
 
 HloSendDoneInstruction::HloSendDoneInstruction(HloSendInstruction* operand,
@@ -248,7 +254,7 @@ HloSendDoneInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloSendDoneInstruction>(
+  return absl::make_unique<HloSendDoneInstruction>(
       Cast<HloSendInstruction>(new_operands[0]), is_host_transfer());
 }
 
@@ -269,7 +275,7 @@ std::unique_ptr<HloInstruction> HloRecvInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloRecvInstruction>(
+  return absl::make_unique<HloRecvInstruction>(
       ShapeUtil::GetTupleElementShape(shape, 0), new_operands[0], channel_id(),
       is_host_transfer());
 }
@@ -291,31 +297,67 @@ HloRecvDoneInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloRecvDoneInstruction>(
+  return absl::make_unique<HloRecvDoneInstruction>(
       Cast<HloRecvInstruction>(new_operands[0]), is_host_transfer());
 }
 
+HloCollectiveInstruction::HloCollectiveInstruction(
+    HloOpcode opcode, const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    const std::vector<ReplicaGroup>& replica_groups)
+    : HloInstruction(opcode, shape), replica_groups_(replica_groups) {
+  for (auto operand : operands) {
+    AppendOperand(operand);
+  }
+}
+
+HloInstructionProto HloCollectiveInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  *proto.mutable_replica_groups() = {replica_groups_.begin(),
+                                     replica_groups_.end()};
+  return proto;
+}
+
+std::vector<string> HloCollectiveInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& /*options*/) const {
+  std::vector<string> result;
+  std::vector<string> replica_group_str;
+  for (const ReplicaGroup& group : replica_groups()) {
+    replica_group_str.push_back(
+        StrCat("{", StrJoin(group.replica_ids(), ","), "}"));
+  }
+  result.push_back(
+      StrCat("replica_groups={", StrJoin(replica_group_str, ","), "}"));
+  return result;
+}
+
+bool HloCollectiveInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+    /*eq_computations*/) const {
+  const auto& casted_other =
+      static_cast<const HloCollectiveInstruction&>(other);
+  return ContainersEqual(replica_groups(), casted_other.replica_groups(),
+                         [](const ReplicaGroup& a, const ReplicaGroup& b) {
+                           return ContainersEqual(a.replica_ids(),
+                                                  b.replica_ids());
+                         });
+}
+
 HloAllReduceInstruction::HloAllReduceInstruction(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     HloComputation* reduce_computation,
-    tensorflow::gtl::ArraySlice<int64> replica_group_ids,
-    tensorflow::StringPiece barrier,
-    const tensorflow::gtl::optional<int64>& all_reduce_id)
-    : HloInstruction(HloOpcode::kCrossReplicaSum, shape),
-      replica_group_ids_(replica_group_ids.begin(), replica_group_ids.end()),
-      cross_replica_sum_barrier_(barrier.begin(), barrier.end()),
+    const std::vector<ReplicaGroup>& replica_groups, absl::string_view barrier,
+    const absl::optional<int64>& all_reduce_id)
+    : HloCollectiveInstruction(HloOpcode::kCrossReplicaSum, shape, operands,
+                               replica_groups),
+      cross_replica_sum_barrier_(barrier),
       all_reduce_id_(all_reduce_id) {
-  for (auto operand : operands) {
-    AppendOperand(operand);
-  }
   AppendComputation(reduce_computation);
 }
 
 HloInstructionProto HloAllReduceInstruction::ToProto() const {
-  HloInstructionProto proto = HloInstruction::ToProto();
-  for (int64 i : replica_group_ids_) {
-    proto.add_replica_group_ids(i);
-  }
+  HloInstructionProto proto = HloCollectiveInstruction::ToProto();
   // Proto3 is so sad.
   if (all_reduce_id_) {
     proto.set_all_reduce_id(*all_reduce_id_);
@@ -325,9 +367,9 @@ HloInstructionProto HloAllReduceInstruction::ToProto() const {
 }
 
 std::vector<string> HloAllReduceInstruction::ExtraAttributesToStringImpl(
-    const HloPrintOptions& /*options*/) const {
-  std::vector<string> result = {
-      StrCat("replica_group_ids={", Join(replica_group_ids(), ","), "}")};
+    const HloPrintOptions& options) const {
+  std::vector<string> result =
+      HloCollectiveInstruction::ExtraAttributesToStringImpl(options);
   if (!cross_replica_sum_barrier().empty()) {
     result.push_back(StrCat("barrier=\"", cross_replica_sum_barrier(), "\""));
   }
@@ -342,7 +384,7 @@ bool HloAllReduceInstruction::IdenticalSlowPath(
     const std::function<bool(const HloComputation*, const HloComputation*)>&
         eq_computations) const {
   const auto& casted_other = static_cast<const HloAllReduceInstruction&>(other);
-  return replica_group_ids() == casted_other.replica_group_ids() &&
+  return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) &&
          eq_computations(to_apply(), casted_other.to_apply()) &&
          cross_replica_sum_barrier() ==
              casted_other.cross_replica_sum_barrier() &&
@@ -354,70 +396,24 @@ HloAllReduceInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* /*context*/) const {
-  return MakeUnique<HloAllReduceInstruction>(
-      shape, new_operands, to_apply(), replica_group_ids(),
+  return absl::make_unique<HloAllReduceInstruction>(
+      shape, new_operands, to_apply(), replica_groups(),
       cross_replica_sum_barrier(), all_reduce_id());
 }
 
 HloAllToAllInstruction::HloAllToAllInstruction(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    const std::vector<ReplicaGroup>& replica_groups,
-    tensorflow::StringPiece barrier)
-    : HloInstruction(HloOpcode::kAllToAll, shape),
-      replica_groups_(replica_groups),
-      cross_replica_sum_barrier_(barrier.begin(), barrier.end()) {
-  for (auto operand : operands) {
-    AppendOperand(operand);
-  }
-}
-
-bool HloAllToAllInstruction::IdenticalSlowPath(
-    const HloInstruction& other,
-    const std::function<bool(const HloComputation*, const HloComputation*)>&
-        eq_computations) const {
-  const auto& casted_other = static_cast<const HloAllToAllInstruction&>(other);
-  return ContainersEqual(replica_groups(), casted_other.replica_groups(),
-                         [](const ReplicaGroup& a, const ReplicaGroup& b) {
-                           return ContainersEqual(a.replica_ids(),
-                                                  b.replica_ids());
-                         }) &&
-         cross_replica_sum_barrier() ==
-             casted_other.cross_replica_sum_barrier();
-}
+    const std::vector<ReplicaGroup>& replica_groups)
+    : HloCollectiveInstruction(HloOpcode::kAllToAll, shape, operands,
+                               replica_groups) {}
 
 std::unique_ptr<HloInstruction>
 HloAllToAllInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* /*context*/) const {
-  return MakeUnique<HloAllToAllInstruction>(
-      shape, new_operands, replica_groups(), cross_replica_sum_barrier());
-}
-
-std::vector<string> HloAllToAllInstruction::ExtraAttributesToStringImpl(
-    const HloPrintOptions& options) const {
-  std::vector<string> result;
-  std::vector<string> replica_group_str;
-  for (const ReplicaGroup& group : replica_groups()) {
-    replica_group_str.push_back(
-        StrCat("{", Join(group.replica_ids(), ","), "}"));
-  }
-  result.push_back(
-      StrCat("replica_groups={", Join(replica_group_str, ","), "}"));
-
-  if (!cross_replica_sum_barrier().empty()) {
-    result.push_back(StrCat("barrier=\"", cross_replica_sum_barrier(), "\""));
-  }
-
-  return result;
-}
-
-HloInstructionProto HloAllToAllInstruction::ToProto() const {
-  HloInstructionProto proto = HloInstruction::ToProto();
-  *proto.mutable_replica_groups() = {replica_groups_.begin(),
-                                     replica_groups_.end()};
-  proto.set_cross_replica_sum_barrier(cross_replica_sum_barrier_);
-  return proto;
+  return absl::make_unique<HloAllToAllInstruction>(shape, new_operands,
+                                                   replica_groups());
 }
 
 HloReverseInstruction::HloReverseInstruction(
@@ -438,7 +434,7 @@ HloInstructionProto HloReverseInstruction::ToProto() const {
 
 std::vector<string> HloReverseInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
 }
 
 bool HloReverseInstruction::IdenticalSlowPath(
@@ -454,8 +450,8 @@ std::unique_ptr<HloInstruction> HloReverseInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloReverseInstruction>(shape, new_operands[0],
-                                           dimensions());
+  return absl::make_unique<HloReverseInstruction>(shape, new_operands[0],
+                                                  dimensions());
 }
 
 HloConcatenateInstruction::HloConcatenateInstruction(
@@ -477,7 +473,7 @@ HloInstructionProto HloConcatenateInstruction::ToProto() const {
 
 std::vector<string> HloConcatenateInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
 }
 
 bool HloConcatenateInstruction::IdenticalSlowPath(
@@ -494,8 +490,8 @@ HloConcatenateInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  return MakeUnique<HloConcatenateInstruction>(shape, new_operands,
-                                               dimensions(0));
+  return absl::make_unique<HloConcatenateInstruction>(shape, new_operands,
+                                                      dimensions(0));
 }
 
 HloReduceInstruction::HloReduceInstruction(
@@ -520,7 +516,7 @@ HloInstructionProto HloReduceInstruction::ToProto() const {
 
 std::vector<string> HloReduceInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
 }
 
 bool HloReduceInstruction::IdenticalSlowPath(
@@ -539,8 +535,8 @@ std::unique_ptr<HloInstruction> HloReduceInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
-  return MakeUnique<HloReduceInstruction>(shape, new_operands, dimensions(),
-                                          to_apply());
+  return absl::make_unique<HloReduceInstruction>(shape, new_operands,
+                                                 dimensions(), to_apply());
 }
 
 HloSortInstruction::HloSortInstruction(const Shape& shape, int64 dimension,
@@ -563,7 +559,7 @@ HloInstructionProto HloSortInstruction::ToProto() const {
 
 std::vector<string> HloSortInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
 }
 
 bool HloSortInstruction::IdenticalSlowPath(
@@ -580,7 +576,8 @@ std::unique_ptr<HloInstruction> HloSortInstruction::CloneWithNewOperandsImpl(
     HloCloneContext* context) const {
   HloInstruction* keys = new_operands[0];
   HloInstruction* values = new_operands.size() == 2 ? new_operands[1] : nullptr;
-  return MakeUnique<HloSortInstruction>(shape, dimensions(0), keys, values);
+  return absl::make_unique<HloSortInstruction>(shape, dimensions(0), keys,
+                                               values);
 }
 
 HloTransposeInstruction::HloTransposeInstruction(
@@ -595,7 +592,7 @@ HloTransposeInstruction::HloTransposeInstruction(
                    Permute(dimensions, shape.dimensions()).begin()))
       << "shape: " << ShapeUtil::HumanString(shape)
       << ", operand->shape(): " << ShapeUtil::HumanString(shape)
-      << ", dimensions: {" << Join(dimensions, ", ") << "}";
+      << ", dimensions: {" << StrJoin(dimensions, ", ") << "}";
   AppendOperand(operand);
 }
 
@@ -616,7 +613,7 @@ HloInstructionProto HloTransposeInstruction::ToProto() const {
 
 std::vector<string> HloTransposeInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
 }
 
 bool HloTransposeInstruction::IdenticalSlowPath(
@@ -633,8 +630,8 @@ HloTransposeInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloTransposeInstruction>(shape, new_operands[0],
-                                             dimensions());
+  return absl::make_unique<HloTransposeInstruction>(shape, new_operands[0],
+                                                    dimensions());
 }
 
 HloBroadcastInstruction::HloBroadcastInstruction(
@@ -655,7 +652,7 @@ HloInstructionProto HloBroadcastInstruction::ToProto() const {
 
 std::vector<string> HloBroadcastInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
 }
 
 bool HloBroadcastInstruction::IdenticalSlowPath(
@@ -672,8 +669,8 @@ HloBroadcastInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloBroadcastInstruction>(shape, new_operands[0],
-                                             dimensions());
+  return absl::make_unique<HloBroadcastInstruction>(shape, new_operands[0],
+                                                    dimensions());
 }
 
 HloMapInstruction::HloMapInstruction(
@@ -699,7 +696,7 @@ HloInstructionProto HloMapInstruction::ToProto() const {
 }
 
 bool HloMapInstruction::IsElementwiseImpl(
-    const tensorflow::gtl::optional<int64>& operand_idx) const {
+    const absl::optional<int64>& operand_idx) const {
   if (!dimensions().empty()) {
     // Check that the map is executed in elementwise compatible dimensions.
     if (dimensions().size() != shape().dimensions_size()) {
@@ -716,7 +713,7 @@ bool HloMapInstruction::IsElementwiseImpl(
 
 std::vector<string> HloMapInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
 }
 
 bool HloMapInstruction::IdenticalSlowPath(
@@ -730,7 +727,7 @@ std::unique_ptr<HloInstruction> HloMapInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  return MakeUnique<HloMapInstruction>(shape, new_operands, to_apply());
+  return absl::make_unique<HloMapInstruction>(shape, new_operands, to_apply());
 }
 
 HloSliceInstruction::HloSliceInstruction(
@@ -774,7 +771,7 @@ std::vector<string> HloSliceInstruction::ExtraAttributesToStringImpl(
     bounds.push_back(
         StrCat("[", slice_starts_[i], ":", slice_limits_[i], stride_str, "]"));
   }
-  return {StrCat("slice={", Join(bounds, ", "), "}")};
+  return {StrCat("slice={", StrJoin(bounds, ", "), "}")};
 }
 
 bool HloSliceInstruction::IdenticalSlowPath(
@@ -792,8 +789,8 @@ std::unique_ptr<HloInstruction> HloSliceInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloSliceInstruction>(shape, new_operands[0], slice_starts_,
-                                         slice_limits_, slice_strides_);
+  return absl::make_unique<HloSliceInstruction>(
+      shape, new_operands[0], slice_starts_, slice_limits_, slice_strides_);
 }
 
 HloConstantInstruction::HloConstantInstruction(std::unique_ptr<Literal> literal)
@@ -812,7 +809,7 @@ HloInstructionProto HloConstantInstruction::ToProto() const {
 }
 
 bool HloConstantInstruction::IsElementwiseImpl(
-    const tensorflow::gtl::optional<int64>& operand_idx) const {
+    const absl::optional<int64>& operand_idx) const {
   return true;
 }
 
@@ -845,7 +842,7 @@ HloConstantInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  return MakeUnique<HloConstantInstruction>(literal_->CloneToUnique());
+  return absl::make_unique<HloConstantInstruction>(literal_->CloneToUnique());
 }
 
 string HloConstantInstruction::OperandsToStringWithCanonicalNameMap(
@@ -860,7 +857,7 @@ string HloConstantInstruction::OperandsToStringWithCanonicalNameMap(
     // lines. Compact this into one line by stripping out white space.
     string tmp = literal().ToString();
     std::replace(tmp.begin(), tmp.end(), '\n', ' ');
-    std::vector<string> v = tensorflow::str_util::Split(tmp, ' ');
+    std::vector<string> v = absl::StrSplit(tmp, ' ');
     bool first = true;
     // Concatenate elements in "v" with spaces separating them, but ignoring
     // empty entries.
@@ -952,7 +949,7 @@ HloInstructionProto HloFusionInstruction::ToProto() const {
 }
 
 bool HloFusionInstruction::IsElementwiseImpl(
-    const tensorflow::gtl::optional<int64>& operand_idx) const {
+    const absl::optional<int64>& operand_idx) const {
   if (!operand_idx.has_value()) {
     for (auto* fused : fused_instructions()) {
       if (fused->opcode() != HloOpcode::kParameter && !fused->IsElementwise()) {
@@ -1339,8 +1336,8 @@ std::unique_ptr<HloInstruction> HloFusionInstruction::CloneWithNewOperandsImpl(
     new_fused_computation = module->AddEmbeddedComputation(
         fused_instructions_computation()->Clone("clone", context));
   }
-  return MakeUnique<HloFusionInstruction>(shape, fusion_kind(), new_operands,
-                                          new_fused_computation);
+  return absl::make_unique<HloFusionInstruction>(
+      shape, fusion_kind(), new_operands, new_fused_computation);
 }
 
 Status HloFusionInstruction::DeduplicateFusionOperands() {
@@ -1384,7 +1381,7 @@ std::vector<string> HloRngInstruction::ExtraAttributesToStringImpl(
 }
 
 bool HloRngInstruction::IsElementwiseImpl(
-    const tensorflow::gtl::optional<int64>& operand_idx) const {
+    const absl::optional<int64>& operand_idx) const {
   return true;
 }
 
@@ -1399,7 +1396,8 @@ std::unique_ptr<HloInstruction> HloRngInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  return MakeUnique<HloRngInstruction>(shape, distribution_, new_operands);
+  return absl::make_unique<HloRngInstruction>(shape, distribution_,
+                                              new_operands);
 }
 
 HloParameterInstruction::HloParameterInstruction(int64 parameter_number,
@@ -1435,7 +1433,8 @@ HloParameterInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  return MakeUnique<HloParameterInstruction>(parameter_number_, shape, name());
+  return absl::make_unique<HloParameterInstruction>(parameter_number_, shape,
+                                                    name());
 }
 
 HloGetTupleElementInstruction::HloGetTupleElementInstruction(
@@ -1471,8 +1470,8 @@ HloGetTupleElementInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloGetTupleElementInstruction>(shape, new_operands[0],
-                                                   tuple_index());
+  return absl::make_unique<HloGetTupleElementInstruction>(
+      shape, new_operands[0], tuple_index());
 }
 
 HloReducePrecisionInstruction::HloReducePrecisionInstruction(
@@ -1514,7 +1513,7 @@ HloReducePrecisionInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 1);
-  return MakeUnique<HloReducePrecisionInstruction>(
+  return absl::make_unique<HloReducePrecisionInstruction>(
       shape, new_operands[0], exponent_bits(), mantissa_bits());
 }
 
@@ -1528,13 +1527,6 @@ HloInfeedInstruction::HloInfeedInstruction(const Shape& infeed_shape,
   AppendOperand(token_operand);
 }
 
-HloInfeedInstruction::HloInfeedInstruction(const Shape& infeed_shape,
-                                           const string& config)
-    : HloInstruction(HloOpcode::kInfeed,
-                     ShapeUtil::MakeTupleShape(
-                         {infeed_shape, ShapeUtil::MakeTokenShape()})),
-      infeed_config_(config) {}
-
 HloInstructionProto HloInfeedInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_infeed_config(infeed_config_);
@@ -1561,21 +1553,18 @@ std::unique_ptr<HloInstruction> HloInfeedInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  if (new_operands.empty()) {
-    return MakeUnique<HloInfeedInstruction>(infeed_shape(), infeed_config());
-  } else {
-    CHECK_EQ(new_operands.size(), 1);
-    return MakeUnique<HloInfeedInstruction>(infeed_shape(), new_operands[0],
-                                            infeed_config());
-  }
+  CHECK_EQ(new_operands.size(), 1);
+  return absl::make_unique<HloInfeedInstruction>(
+      infeed_shape(), new_operands[0], infeed_config());
 }
 
-HloOutfeedInstruction::HloOutfeedInstruction(
-    const Shape& outfeed_shape, HloInstruction* operand,
-    HloInstruction* token_operand, tensorflow::StringPiece outfeed_config)
+HloOutfeedInstruction::HloOutfeedInstruction(const Shape& outfeed_shape,
+                                             HloInstruction* operand,
+                                             HloInstruction* token_operand,
+                                             absl::string_view outfeed_config)
     : HloInstruction(HloOpcode::kOutfeed, ShapeUtil::MakeTokenShape()),
       outfeed_shape_(outfeed_shape),
-      outfeed_config_(outfeed_config.begin(), outfeed_config.end()) {
+      outfeed_config_(outfeed_config) {
   CHECK(ShapeUtil::Compatible(operand->shape(), outfeed_shape))
       << "Outfeed shape " << outfeed_shape
       << " must be compatible with operand shape " << operand->shape();
@@ -1583,18 +1572,6 @@ HloOutfeedInstruction::HloOutfeedInstruction(
   AppendOperand(token_operand);
 }
 
-HloOutfeedInstruction::HloOutfeedInstruction(
-    const Shape& outfeed_shape, HloInstruction* operand,
-    tensorflow::StringPiece outfeed_config)
-    : HloInstruction(HloOpcode::kOutfeed, ShapeUtil::MakeTokenShape()),
-      outfeed_shape_(outfeed_shape),
-      outfeed_config_(outfeed_config.begin(), outfeed_config.end()) {
-  CHECK(ShapeUtil::Compatible(operand->shape(), outfeed_shape))
-      << "Outfeed shape " << outfeed_shape
-      << " must be compatible with operand shape " << operand->shape();
-  AppendOperand(operand);
-}
-
 HloInstructionProto HloOutfeedInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_outfeed_config(outfeed_config());
@@ -1622,22 +1599,19 @@ std::unique_ptr<HloInstruction> HloOutfeedInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  if (new_operands.size() == 1) {
-    return MakeUnique<HloOutfeedInstruction>(outfeed_shape(), new_operands[0],
-                                             outfeed_config());
-  } else {
-    CHECK_EQ(new_operands.size(), 2);
-    return MakeUnique<HloOutfeedInstruction>(outfeed_shape(), new_operands[0],
-                                             new_operands[1], outfeed_config());
-  }
+  CHECK_EQ(new_operands.size(), 2);
+  return absl::make_unique<HloOutfeedInstruction>(
+      outfeed_shape(), new_operands[0], new_operands[1], outfeed_config());
 }
 
 HloConvolutionInstruction::HloConvolutionInstruction(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
-    const Window& window, const ConvolutionDimensionNumbers& dimension_numbers)
+    const Window& window, const ConvolutionDimensionNumbers& dimension_numbers,
+    int64 feature_group_count)
     : HloInstruction(HloOpcode::kConvolution, shape),
       window_(window),
-      convolution_dimension_numbers_(dimension_numbers) {
+      convolution_dimension_numbers_(dimension_numbers),
+      feature_group_count_(feature_group_count) {
   if (window_util::HasBaseDilation(window)) {
     SetAndSanitizeName(StrCat(name(), "-base-dilated"));
   }
@@ -1675,6 +1649,7 @@ std::vector<string> HloConvolutionInstruction::ExtraAttributesToStringImpl(
   }
   extra.push_back(StrCat("dim_labels=", ConvolutionDimensionNumbersToString(
                                             convolution_dimension_numbers_)));
+  extra.push_back(StrCat("feature_group_count=", feature_group_count_));
   return extra;
 }
 
@@ -1696,9 +1671,9 @@ HloConvolutionInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
-  return MakeUnique<HloConvolutionInstruction>(shape, new_operands[0],
-                                               new_operands[1], window(),
-                                               convolution_dimension_numbers_);
+  return absl::make_unique<HloConvolutionInstruction>(
+      shape, new_operands[0], new_operands[1], window(),
+      convolution_dimension_numbers_, feature_group_count_);
 }
 
 HloReduceWindowInstruction::HloReduceWindowInstruction(
@@ -1741,7 +1716,7 @@ HloReduceWindowInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
-  return MakeUnique<HloReduceWindowInstruction>(
+  return absl::make_unique<HloReduceWindowInstruction>(
       shape, new_operands[0], new_operands[1], window(), to_apply());
 }
 
@@ -1790,14 +1765,14 @@ HloSelectAndScatterInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 3);
-  return MakeUnique<HloSelectAndScatterInstruction>(
+  return absl::make_unique<HloSelectAndScatterInstruction>(
       shape, new_operands[0], select(), window(), new_operands[1],
       new_operands[2], scatter());
 }
 
 HloCustomCallInstruction::HloCustomCallInstruction(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    tensorflow::StringPiece custom_call_target)
+    absl::string_view custom_call_target)
     : HloInstruction(HloOpcode::kCustomCall, shape),
       custom_call_target_(custom_call_target.begin(),
                           custom_call_target.end()) {
@@ -1865,8 +1840,8 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
-  auto cloned = MakeUnique<HloCustomCallInstruction>(shape, new_operands,
-                                                     custom_call_target());
+  auto cloned = absl::make_unique<HloCustomCallInstruction>(
+      shape, new_operands, custom_call_target());
   if (window_ != nullptr) {
     cloned->set_window(*window_);
   }
@@ -1876,41 +1851,6 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl(
   return std::move(cloned);
 }
 
-HloHostComputeInstruction::HloHostComputeInstruction(
-    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    tensorflow::StringPiece channel_name, const int64 cost_estimate_ns)
-    : HloInstruction(HloOpcode::kHostCompute, shape),
-      channel_name_(channel_name.begin(), channel_name.end()),
-      cost_estimate_ns_(cost_estimate_ns) {
-  for (auto operand : operands) {
-    AppendOperand(operand);
-  }
-}
-
-HloInstructionProto HloHostComputeInstruction::ToProto() const {
-  HloInstructionProto proto = HloInstruction::ToProto();
-  proto.set_channel_name(channel_name_);
-  proto.set_cost_estimate_ns(cost_estimate_ns_);
-  return proto;
-}
-
-bool HloHostComputeInstruction::IdenticalSlowPath(
-    const HloInstruction& other,
-    const std::function<bool(const HloComputation*, const HloComputation*)>&
-        eq_computations) const {
-  // Not yet supported.
-  return false;
-}
-
-std::unique_ptr<HloInstruction>
-HloHostComputeInstruction::CloneWithNewOperandsImpl(
-    const Shape& shape,
-    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
-    HloCloneContext* context) const {
-  return MakeUnique<HloHostComputeInstruction>(
-      shape, new_operands, channel_name_, cost_estimate_ns_);
-}
-
 HloPadInstruction::HloPadInstruction(const Shape& shape,
                                      HloInstruction* operand,
                                      HloInstruction* padding_value,
@@ -1945,8 +1885,8 @@ std::unique_ptr<HloInstruction> HloPadInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
-  return MakeUnique<HloPadInstruction>(shape, new_operands[0], new_operands[1],
-                                       padding_config_);
+  return absl::make_unique<HloPadInstruction>(shape, new_operands[0],
+                                              new_operands[1], padding_config_);
 }
 
 HloDynamicSliceInstruction::HloDynamicSliceInstruction(
@@ -1968,8 +1908,8 @@ HloInstructionProto HloDynamicSliceInstruction::ToProto() const {
 
 std::vector<string> HloDynamicSliceInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {
-      StrCat("dynamic_slice_sizes={", Join(dynamic_slice_sizes(), ","), "}")};
+  return {StrCat("dynamic_slice_sizes={", StrJoin(dynamic_slice_sizes(), ","),
+                 "}")};
 }
 
 bool HloDynamicSliceInstruction::IdenticalSlowPath(
@@ -1985,56 +1925,55 @@ HloDynamicSliceInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
-  return MakeUnique<HloDynamicSliceInstruction>(
+  return absl::make_unique<HloDynamicSliceInstruction>(
       shape, new_operands[0], new_operands[1], dynamic_slice_sizes_);
 }
 
 HloGatherInstruction::HloGatherInstruction(
-    const Shape& shape, HloInstruction* operand, HloInstruction* gather_indices,
+    const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
     const GatherDimensionNumbers& gather_dim_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds)
+    tensorflow::gtl::ArraySlice<int64> slice_sizes)
     : HloInstruction(HloOpcode::kGather, shape) {
   AppendOperand(operand);
-  AppendOperand(gather_indices);
+  AppendOperand(start_indices);
   gather_dimension_numbers_ =
-      MakeUnique<GatherDimensionNumbers>(gather_dim_numbers);
-  c_copy(window_bounds, std::back_inserter(gather_window_bounds_));
+      absl::make_unique<GatherDimensionNumbers>(gather_dim_numbers);
+  absl::c_copy(slice_sizes, std::back_inserter(gather_slice_sizes_));
 }
 
 string HloGatherInstruction::GatherDimensionNumbersToString() const {
   CHECK(gather_dimension_numbers_ != nullptr);
-  string output_window_dims =
-      StrCat("output_window_dims={",
-             Join(gather_dimension_numbers_->output_window_dims(), ","), "}");
-  string elided_window_dims =
-      StrCat("elided_window_dims={",
-             Join(gather_dimension_numbers_->elided_window_dims(), ","), "}");
-  string gather_dims_to_operand_dims = StrCat(
-      "gather_dims_to_operand_dims={",
-      Join(gather_dimension_numbers_->gather_dims_to_operand_dims(), ","), "}");
+  string offset_dims =
+      StrCat("offset_dims={",
+             StrJoin(gather_dimension_numbers_->offset_dims(), ","), "}");
+  string collapsed_slice_dims = StrCat(
+      "collapsed_slice_dims={",
+      StrJoin(gather_dimension_numbers_->collapsed_slice_dims(), ","), "}");
+  string start_index_map =
+      StrCat("start_index_map={",
+             StrJoin(gather_dimension_numbers_->start_index_map(), ","), "}");
   string index_vector_dim = StrCat(
       "index_vector_dim=", gather_dimension_numbers_->index_vector_dim());
 
-  return Join<std::initializer_list<string>>(
-      {output_window_dims, elided_window_dims, gather_dims_to_operand_dims,
-       index_vector_dim},
+  return StrJoin<std::initializer_list<string>>(
+      {offset_dims, collapsed_slice_dims, start_index_map, index_vector_dim},
       ", ");
 }
 
 /* static */ GatherDimensionNumbers HloGatherInstruction::MakeGatherDimNumbers(
-    tensorflow::gtl::ArraySlice<int64> output_window_dims,
-    tensorflow::gtl::ArraySlice<int64> elided_window_dims,
-    tensorflow::gtl::ArraySlice<int64> gather_dims_to_operand_dims,
+    tensorflow::gtl::ArraySlice<int64> offset_dims,
+    tensorflow::gtl::ArraySlice<int64> collapsed_slice_dims,
+    tensorflow::gtl::ArraySlice<int64> start_index_map,
     int64 index_vector_dim) {
   GatherDimensionNumbers gather_dim_numbers;
-  for (int64 output_window_dim : output_window_dims) {
-    gather_dim_numbers.add_output_window_dims(output_window_dim);
+  for (int64 output_window_dim : offset_dims) {
+    gather_dim_numbers.add_offset_dims(output_window_dim);
   }
-  for (int64 elided_window_dim : elided_window_dims) {
-    gather_dim_numbers.add_elided_window_dims(elided_window_dim);
+  for (int64 elided_window_dim : collapsed_slice_dims) {
+    gather_dim_numbers.add_collapsed_slice_dims(elided_window_dim);
   }
-  for (int64 gather_dim_to_input_dim : gather_dims_to_operand_dims) {
-    gather_dim_numbers.add_gather_dims_to_operand_dims(gather_dim_to_input_dim);
+  for (int64 gather_dim_to_input_dim : start_index_map) {
+    gather_dim_numbers.add_start_index_map(gather_dim_to_input_dim);
   }
 
   gather_dim_numbers.set_index_vector_dim(index_vector_dim);
@@ -2044,8 +1983,8 @@ string HloGatherInstruction::GatherDimensionNumbersToString() const {
 HloInstructionProto HloGatherInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   *proto.mutable_gather_dimension_numbers() = gather_dimension_numbers();
-  for (int64 bound : gather_window_bounds()) {
-    proto.add_gather_window_bounds(bound);
+  for (int64 bound : gather_slice_sizes()) {
+    proto.add_gather_slice_sizes(bound);
   }
   return proto;
 }
@@ -2053,7 +1992,7 @@ HloInstructionProto HloGatherInstruction::ToProto() const {
 std::vector<string> HloGatherInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
   return {GatherDimensionNumbersToString(),
-          StrCat("window_bounds={", Join(gather_window_bounds(), ","), "}")};
+          StrCat("slice_sizes={", StrJoin(gather_slice_sizes(), ","), "}")};
 }
 
 bool HloGatherInstruction::IdenticalSlowPath(
@@ -2064,7 +2003,7 @@ bool HloGatherInstruction::IdenticalSlowPath(
   return protobuf_util::ProtobufEquals(
              gather_dimension_numbers(),
              casted_other.gather_dimension_numbers()) &&
-         gather_window_bounds() == casted_other.gather_window_bounds();
+         gather_slice_sizes() == casted_other.gather_slice_sizes();
 }
 
 std::unique_ptr<HloInstruction> HloGatherInstruction::CloneWithNewOperandsImpl(
@@ -2072,9 +2011,9 @@ std::unique_ptr<HloInstruction> HloGatherInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
-  return MakeUnique<HloGatherInstruction>(
+  return absl::make_unique<HloGatherInstruction>(
       shape, new_operands[0], new_operands[1], gather_dimension_numbers(),
-      gather_window_bounds());
+      gather_slice_sizes());
 }
 
 HloScatterInstruction::HloScatterInstruction(
@@ -2088,24 +2027,24 @@ HloScatterInstruction::HloScatterInstruction(
   AppendOperand(updates);
   AppendComputation(update_computation);
   scatter_dimension_numbers_ =
-      MakeUnique<ScatterDimensionNumbers>(scatter_dim_numbers);
+      absl::make_unique<ScatterDimensionNumbers>(scatter_dim_numbers);
 }
 
 string HloScatterInstruction::ScatterDimensionNumbersToString() const {
-  string update_window_dims =
-      StrCat("update_window_dims={",
-             Join(scatter_dimension_numbers().update_window_dims(), ","), "}");
+  string update_window_dims = StrCat(
+      "update_window_dims={",
+      StrJoin(scatter_dimension_numbers().update_window_dims(), ","), "}");
   string inserted_window_dims = StrCat(
       "inserted_window_dims={",
-      Join(scatter_dimension_numbers().inserted_window_dims(), ","), "}");
+      StrJoin(scatter_dimension_numbers().inserted_window_dims(), ","), "}");
   string scatter_dims_to_operand_dims = StrCat(
       "scatter_dims_to_operand_dims={",
-      Join(scatter_dimension_numbers().scatter_dims_to_operand_dims(), ","),
+      StrJoin(scatter_dimension_numbers().scatter_dims_to_operand_dims(), ","),
       "}");
   string index_vector_dim = StrCat(
       "index_vector_dim=", scatter_dimension_numbers().index_vector_dim());
 
-  return Join<std::initializer_list<string>>(
+  return StrJoin<std::initializer_list<string>>(
       {update_window_dims, inserted_window_dims, scatter_dims_to_operand_dims,
        index_vector_dim},
       ", ");
@@ -2159,7 +2098,7 @@ std::unique_ptr<HloInstruction> HloScatterInstruction::CloneWithNewOperandsImpl(
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 3);
-  return MakeUnique<HloScatterInstruction>(
+  return absl::make_unique<HloScatterInstruction>(
       shape, new_operands[0], new_operands[1], new_operands[2], to_apply(),
       scatter_dimension_numbers());
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index 9586ad6673..efdb9e9781 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -18,6 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
 namespace xla {
@@ -217,19 +218,37 @@ class HloRecvDoneInstruction : public HloSendRecvInstruction {
       HloCloneContext* context) const override;
 };
 
-class HloAllReduceInstruction : public HloInstruction {
+class HloCollectiveInstruction : public HloInstruction {
+ public:
+  const std::vector<ReplicaGroup>& replica_groups() const {
+    return replica_groups_;
+  }
+
+ protected:
+  explicit HloCollectiveInstruction(
+      HloOpcode opcode, const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      const std::vector<ReplicaGroup>& replica_groups);
+
+  HloInstructionProto ToProto() const override;
+
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+
+  std::vector<ReplicaGroup> replica_groups_;
+};
+
+class HloAllReduceInstruction : public HloCollectiveInstruction {
  public:
   explicit HloAllReduceInstruction(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       HloComputation* reduce_computation,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids,
-      tensorflow::StringPiece barrier,
-      const tensorflow::gtl::optional<int64>& all_reduce_id);
-
-  // Returns the group ids of each replica for CrossReplicaSum op.
-  const std::vector<int64>& replica_group_ids() const {
-    return replica_group_ids_;
-  }
+      const std::vector<ReplicaGroup>& replica_groups,
+      absl::string_view barrier, const absl::optional<int64>& all_reduce_id);
 
   // Returns the barrier config used for the CrossReplicaSum implementation of
   // each backend.
@@ -240,9 +259,7 @@ class HloAllReduceInstruction : public HloInstruction {
     cross_replica_sum_barrier_ = barrier;
   }
 
-  tensorflow::gtl::optional<int64> all_reduce_id() const {
-    return all_reduce_id_;
-  }
+  absl::optional<int64> all_reduce_id() const { return all_reduce_id_; }
 
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
@@ -261,57 +278,27 @@ class HloAllReduceInstruction : public HloInstruction {
       tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
       HloCloneContext* context) const override;
 
-  // The group id of each replica for CrossReplicaSum.
-  std::vector<int64> replica_group_ids_;
-
   // The string representation of the barrier config used for CrossReplicaSum.
   string cross_replica_sum_barrier_;
 
   // For Allreduce nodes from different modules, if they have the same
   // all_reduce_id, they will be 'Allreduce'd. If empty, Allreduce will not be
   // applied cross modules.
-  tensorflow::gtl::optional<int64> all_reduce_id_;
+  absl::optional<int64> all_reduce_id_;
 };
 
-class HloAllToAllInstruction : public HloInstruction {
+class HloAllToAllInstruction : public HloCollectiveInstruction {
  public:
   explicit HloAllToAllInstruction(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operand,
-      const std::vector<ReplicaGroup>& replica_groups,
-      tensorflow::StringPiece barrier);
-
-  const std::vector<ReplicaGroup>& replica_groups() const {
-    return replica_groups_;
-  }
-
-  // TODO(b/110096724): rename this.
-  void set_cross_replica_sum_barrier(string barrier) {
-    cross_replica_sum_barrier_ = barrier;
-  }
-  string cross_replica_sum_barrier() const {
-    return cross_replica_sum_barrier_;
-  }
-
-  HloInstructionProto ToProto() const override;
+      const std::vector<ReplicaGroup>& replica_groups);
 
  private:
-  std::vector<string> ExtraAttributesToStringImpl(
-      const HloPrintOptions& options) const override;
-  bool IdenticalSlowPath(
-      const HloInstruction& other,
-      const std::function<bool(const HloComputation*, const HloComputation*)>&
-          eq_computations) const override;
-
   // Implementation for non-common logic of CloneWithNewOperands.
   std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
       const Shape& shape,
       tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
       HloCloneContext* context) const override;
-
-  std::vector<ReplicaGroup> replica_groups_;
-
-  // The string representation of the barrier config.
-  string cross_replica_sum_barrier_;
 };
 
 class HloReverseInstruction : public HloInstruction {
@@ -507,7 +494,7 @@ class HloMapInstruction : public HloInstruction {
 
  private:
   bool IsElementwiseImpl(
-      const tensorflow::gtl::optional<int64>& operand_idx) const override;
+      const absl::optional<int64>& operand_idx) const override;
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
   bool IdenticalSlowPath(
@@ -600,7 +587,7 @@ class HloConstantInstruction : public HloInstruction {
 
  private:
   bool IsElementwiseImpl(
-      const tensorflow::gtl::optional<int64>& operand_idx) const override;
+      const absl::optional<int64>& operand_idx) const override;
   bool IdenticalSlowPath(
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
@@ -751,7 +738,7 @@ class HloFusionInstruction : public HloInstruction {
                                        bool add_output = false);
 
   bool IsElementwiseImpl(
-      const tensorflow::gtl::optional<int64>& operand_idx) const override;
+      const absl::optional<int64>& operand_idx) const override;
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
   bool IdenticalSlowPath(
@@ -780,7 +767,7 @@ class HloRngInstruction : public HloInstruction {
 
  private:
   bool IsElementwiseImpl(
-      const tensorflow::gtl::optional<int64>& operand_idx) const override;
+      const absl::optional<int64>& operand_idx) const override;
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
   bool IdenticalSlowPath(
@@ -883,10 +870,6 @@ class HloInfeedInstruction : public HloInstruction {
   explicit HloInfeedInstruction(const Shape& infeed_shape,
                                 HloInstruction* token_operand,
                                 const string& config);
-  // TODO(b/80000000): Remove this constructor when all uses of infeed are
-  // converted to take tokens.
-  explicit HloInfeedInstruction(const Shape& infeed_shape,
-                                const string& config);
   // Returns the infeed configuration string. The infeed configuration includes
   // any metadata needed for the backend compiler (e.g., infeed buffer address)
   // and is target-dependent.
@@ -924,13 +907,7 @@ class HloOutfeedInstruction : public HloInstruction {
   explicit HloOutfeedInstruction(const Shape& outfeed_shape,
                                  HloInstruction* operand,
                                  HloInstruction* token_operand,
-                                 tensorflow::StringPiece outfeed_config);
-  // TODO(b/80000000): Remove this constructor when all uses of outfeed are
-  // converted to take tokens.
-  explicit HloOutfeedInstruction(const Shape& outfeed_shape,
-                                 HloInstruction* operand,
-                                 tensorflow::StringPiece outfeed_config);
-
+                                 absl::string_view outfeed_config);
   // Returns the shape for the Outfeed instruction.
   const Shape& outfeed_shape() const {
     TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape_));
@@ -965,7 +942,8 @@ class HloConvolutionInstruction : public HloInstruction {
   explicit HloConvolutionInstruction(
       const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
       const Window& window,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count);
   const Window& window() const override { return window_; }
   void set_window(const Window& window) override { window_ = window; }
   const ConvolutionDimensionNumbers& convolution_dimension_numbers() const {
@@ -975,6 +953,9 @@ class HloConvolutionInstruction : public HloInstruction {
       const ConvolutionDimensionNumbers& dnums) {
     convolution_dimension_numbers_ = dnums;
   }
+  // The number of feature groups. Must be a divisor of the input feature
+  // dimension and output feature dimension.
+  int64 feature_group_count() const { return feature_group_count_; }
   string ToCategory() const override;
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
@@ -994,6 +975,9 @@ class HloConvolutionInstruction : public HloInstruction {
   Window window_;
   // Describes the dimension numbers used for a convolution.
   ConvolutionDimensionNumbers convolution_dimension_numbers_;
+  // The number of feature groups. Must be a divisor of the input feature
+  // dimension and output feature dimension.
+  int64 feature_group_count_;
 };
 
 class HloReduceWindowInstruction : public HloInstruction {
@@ -1076,14 +1060,14 @@ class HloCustomCallInstruction : public HloInstruction {
  public:
   explicit HloCustomCallInstruction(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      tensorflow::StringPiece custom_call_target);
+      absl::string_view custom_call_target);
   const Window& window() const override {
     CHECK(window_ != nullptr);
     return *window_;
   }
 
   void set_window(const Window& window) override {
-    window_ = MakeUnique<Window>(window);
+    window_ = absl::make_unique<Window>(window);
   }
 
   const ConvolutionDimensionNumbers& convolution_dimension_numbers() const {
@@ -1094,7 +1078,7 @@ class HloCustomCallInstruction : public HloInstruction {
   void set_convolution_dimension_numbers(
       const ConvolutionDimensionNumbers& dnums) {
     convolution_dimension_numbers_ =
-        MakeUnique<ConvolutionDimensionNumbers>(dnums);
+        absl::make_unique<ConvolutionDimensionNumbers>(dnums);
   }
   const string& custom_call_target() const { return custom_call_target_; }
   // Returns a serialized representation of this instruction.
@@ -1120,33 +1104,6 @@ class HloCustomCallInstruction : public HloInstruction {
   std::unique_ptr<ConvolutionDimensionNumbers> convolution_dimension_numbers_;
 };
 
-class HloHostComputeInstruction : public HloInstruction {
- public:
-  explicit HloHostComputeInstruction(
-      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      tensorflow::StringPiece channel_name, const int64 cost_estimate_ns);
-  // Returns the channel name associated with the instruction. The name is
-  // used to identify host Send/Recv operations.
-  const string& channel_name() const { return channel_name_; }
-  // Returns a serialized representation of this instruction.
-  HloInstructionProto ToProto() const override;
-
- private:
-  bool IdenticalSlowPath(
-      const HloInstruction& other,
-      const std::function<bool(const HloComputation*, const HloComputation*)>&
-          eq_computations) const override;
-  // Implementation for non-common logic of CloneWithNewOperands.
-  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
-      const Shape& shape,
-      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
-      HloCloneContext* context) const override;
-  // Name to use for host send/recv channels.
-  string channel_name_;
-  // Estimate of the duration of a host computation in nanoseconds.
-  int64 cost_estimate_ns_ = 0;
-};
-
 class HloPadInstruction : public HloInstruction {
  public:
   explicit HloPadInstruction(const Shape& shape, HloInstruction* operand,
@@ -1215,15 +1172,15 @@ class HloGatherInstruction : public HloInstruction {
  public:
   explicit HloGatherInstruction(
       const Shape& shape, HloInstruction* operand,
-      HloInstruction* gather_indices,
+      HloInstruction* start_indices,
       const GatherDimensionNumbers& gather_dim_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds);
+      tensorflow::gtl::ArraySlice<int64> slice_sizes);
   const GatherDimensionNumbers& gather_dimension_numbers() const {
     CHECK(gather_dimension_numbers_ != nullptr);
     return *gather_dimension_numbers_;
   }
-  tensorflow::gtl::ArraySlice<int64> gather_window_bounds() const {
-    return gather_window_bounds_;
+  tensorflow::gtl::ArraySlice<int64> gather_slice_sizes() const {
+    return gather_slice_sizes_;
   }
   // Returns the dump string of the gather dimension numbers.
   string GatherDimensionNumbersToString() const;
@@ -1232,9 +1189,9 @@ class HloGatherInstruction : public HloInstruction {
 
   // Creates an instance of GatherDimensionNumbers.
   static GatherDimensionNumbers MakeGatherDimNumbers(
-      tensorflow::gtl::ArraySlice<int64> output_window_dims,
-      tensorflow::gtl::ArraySlice<int64> elided_window_dims,
-      tensorflow::gtl::ArraySlice<int64> gather_dims_to_operand_dims,
+      tensorflow::gtl::ArraySlice<int64> offset_dims,
+      tensorflow::gtl::ArraySlice<int64> collapsed_slice_dims,
+      tensorflow::gtl::ArraySlice<int64> start_index_map,
       int64 index_vector_dim);
 
  private:
@@ -1250,7 +1207,7 @@ class HloGatherInstruction : public HloInstruction {
       HloCloneContext* context) const override;
 
   std::unique_ptr<GatherDimensionNumbers> gather_dimension_numbers_;
-  std::vector<int64> gather_window_bounds_;
+  std::vector<int64> gather_slice_sizes_;
 };
 
 class HloScatterInstruction : public HloInstruction {
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
index 8e0d38b6a6..0e49d343d6 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -17,20 +17,20 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/strings/escaping.h"
+#include "absl/strings/numbers.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
-
-using ::tensorflow::StringPiece;
-
 namespace {
 
+using absl::string_view;
+
 constexpr int kEOF = -1;
 constexpr int kError = -2;
 
@@ -66,12 +66,12 @@ bool HloLexer::CanDereference(const char* ptr) const {
   return ptr < buf_.end() && ptr >= buf_.begin();
 }
 
-tensorflow::StringPiece HloLexer::StringPieceFromPointers(
-    const char* begin, const char* end) const {
+absl::string_view HloLexer::StringPieceFromPointers(const char* begin,
+                                                    const char* end) const {
   CHECK(begin <= end);
   CHECK(begin == buf_.end() || CanDereference(begin));
   CHECK(end == buf_.end() || CanDereference(end));
-  return tensorflow::StringPiece(begin, end - begin);
+  return absl::string_view(begin, end - begin);
 }
 
 tensorflow::RegexpStringPiece HloLexer::RegexpStringPieceFromPointers(
@@ -235,7 +235,7 @@ TokKind HloLexer::LexIdentifier() {
     return TokKind::kAttributeName;
   }
 
-  tensorflow::StringPiece identifier =
+  absl::string_view identifier =
       StringPieceFromPointers(token_start_, current_ptr_);
 
   // See if this is a keyword.
@@ -306,8 +306,8 @@ TokKind HloLexer::LexNumberOrPattern() {
       R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|[-]?(\d+[.]\d*|\d*[.]\d+))"};
   if (RE2::Consume(&consumable, *float_pattern)) {
     current_ptr_ = consumable.begin();
-    tensorflow::strings::safe_strtod(string(token_start_, current_ptr_).c_str(),
-                                     &decimal_val_);
+    CHECK(absl::SimpleAtod(string(token_start_, current_ptr_).c_str(),
+                           &decimal_val_));
     return TokKind::kDecimal;
   }
 
@@ -339,7 +339,7 @@ TokKind HloLexer::LexNumberOrPattern() {
   if (RE2::Consume(&consumable, *int_pattern)) {
     current_ptr_ = consumable.begin();
     auto slice = StringPieceFromPointers(token_start_, current_ptr_);
-    if (tensorflow::strings::safe_strto64(slice, &int64_val_)) {
+    if (absl::SimpleAtoi(slice, &int64_val_)) {
       return TokKind::kInt;
     }
     LOG(ERROR) << "Failed to parse int literal: " << slice;
@@ -365,6 +365,7 @@ std::pair<unsigned, unsigned> HloLexer::GetLineAndColumn(LocTy location) const {
     line_no = line_no_cache_.line_no_of_query;
   }
   for (; ptr != location; ptr++) {
+    CHECK_LT(ptr, buf_.end());
     if (*ptr == '\n') {
       line_no++;
     }
@@ -374,24 +375,24 @@ std::pair<unsigned, unsigned> HloLexer::GetLineAndColumn(LocTy location) const {
   line_no_cache_.last_query = ptr;
   line_no_cache_.line_no_of_query = line_no;
   size_t line_offset = StringPieceFromPointers(start, ptr).rfind('\n');
-  if (line_offset == tensorflow::StringPiece::npos) {
+  if (line_offset == absl::string_view::npos) {
     line_offset = 0;
   }
   return {line_no, ptr - start - line_offset};
 }
 
-tensorflow::StringPiece HloLexer::GetLine(LocTy loc) const {
+absl::string_view HloLexer::GetLine(LocTy loc) const {
   if (!CanDereference(loc)) {
     return "LINE OUT OF RANGE";
   }
   size_t line_start =
       StringPieceFromPointers(buf_.begin(), loc + 1).rfind('\n');
-  const char* start = line_start == tensorflow::StringPiece::npos
+  const char* start = line_start == absl::string_view::npos
                           ? buf_.begin()
                           : buf_.begin() + line_start + 1;
   size_t line_end = StringPieceFromPointers(loc, buf_.end()).find('\n');
   const char* end =
-      line_end == tensorflow::StringPiece::npos ? buf_.end() : loc + line_end;
+      line_end == absl::string_view::npos ? buf_.end() : loc + line_end;
 
   return StringPieceFromPointers(start, end);
 }
@@ -403,10 +404,14 @@ TokKind HloLexer::LexString() {
   static LazyRE2 escaping_pattern = {R"("([^"\\]|\\.)*")"};
   if (RE2::Consume(&consumable, *escaping_pattern)) {
     current_ptr_ = consumable.begin();
-    tensorflow::StringPiece raw =
+    absl::string_view raw =
         StringPieceFromPointers(token_start_ + 1, current_ptr_ - 1);
     string error;
-    if (!tensorflow::str_util::CUnescape(raw, &str_val_, &error)) {
+    // TODO(b/113077997): Change to absl::CUnescape once it works properly with
+    // copy-on-write std::string implementations.
+    if (!tensorflow::str_util::CUnescape(                     // non-absl ok
+            tensorflow::StringPiece(raw.data(), raw.size()),  // non-absl ok
+            &str_val_, &error)) {
       LOG(ERROR) << "Failed unescaping string: " << raw << ". error: " << error;
       return TokKind::kError;
     }
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index 003ac34ace..3e2f8bcd52 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -18,10 +18,10 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_token.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/types.h"
@@ -34,7 +34,7 @@ namespace xla {
 // it directly.
 class HloLexer {
  public:
-  explicit HloLexer(tensorflow::StringPiece buf) : buf_(buf) {
+  explicit HloLexer(absl::string_view buf) : buf_(buf) {
     current_ptr_ = buf_.begin();
   }
 
@@ -77,7 +77,7 @@ class HloLexer {
   std::pair<unsigned, unsigned> GetLineAndColumn(LocTy location) const;
 
   // Returns the whole line given the location.
-  tensorflow::StringPiece GetLine(LocTy loc) const;
+  absl::string_view GetLine(LocTy loc) const;
 
  private:
   // Returns the current character. If it's neither the end of input buffer nor
@@ -89,8 +89,8 @@ class HloLexer {
 
   // Creates StringPiece with the given begin and end. Exits if the begin > end,
   // or it's out of the range of the current buffer.
-  tensorflow::StringPiece StringPieceFromPointers(const char* begin,
-                                                  const char* end) const;
+  absl::string_view StringPieceFromPointers(const char* begin,
+                                            const char* end) const;
   tensorflow::RegexpStringPiece RegexpStringPieceFromPointers(
       const char* begin, const char* end) const;
 
@@ -107,11 +107,11 @@ class HloLexer {
   TokKind LexNumberOrPattern();
   TokKind LexString();
 
-  const tensorflow::StringPiece buf_;
+  const absl::string_view buf_;
   const char* current_ptr_;
 
   // Information about the current token.
-  const char* token_start_;
+  const char* token_start_ = nullptr;
   TokKind current_kind_;
   string str_val_;
   Shape shape_val_;
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
index 43c41ece6e..3a1dd471c6 100644
--- a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
@@ -17,8 +17,9 @@ limitations under the License.
 
 #include <deque>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -29,17 +30,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
+namespace {
 
 using Worklist = std::deque<const HloInstruction*>;
 using Workset = std::unordered_set<const HloInstruction*>;
 
-namespace {
-
 void AddToWorklist(const HloInstruction* instruction, Worklist* worklist,
                    Workset* workset) {
   if (workset->count(instruction) == 0) {
@@ -296,7 +294,7 @@ StatusOr<std::unique_ptr<HloLivenessAnalysis>> HloLivenessAnalysis::Run(
   VLOG(1) << "HloLivenessAnalysis::Run on module " << module.name();
   XLA_VLOG_LINES(2, module.ToString());
 
-  auto liveness_analysis = WrapUnique(new HloLivenessAnalysis(module));
+  auto liveness_analysis = absl::WrapUnique(new HloLivenessAnalysis(module));
 
   liveness_analysis->RunAnalysis();
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 7e4b883435..5269cad94d 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -15,15 +15,13 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace testing {
 
-using ::tensorflow::str_util::Join;
-
 bool HloMatcher::MatchAndExplain(
     const HloInstruction* instruction,
     ::testing::MatchResultListener* listener) const {
@@ -210,8 +208,8 @@ bool HloDotWithContractingDimsMatcher::MatchAndExplain(
       dim_nums.lhs_contracting_dimensions(0) != lhs_contracting_dim_) {
     *listener << instruction->ToString()
               << " has wrong lhs_contracting_dimensions (got {"
-              << Join(dim_nums.lhs_contracting_dimensions(), ",") << "} want {"
-              << lhs_contracting_dim_ << "})";
+              << absl::StrJoin(dim_nums.lhs_contracting_dimensions(), ",")
+              << "} want {" << lhs_contracting_dim_ << "})";
     return false;
   }
 
@@ -219,8 +217,8 @@ bool HloDotWithContractingDimsMatcher::MatchAndExplain(
       dim_nums.rhs_contracting_dimensions(0) != rhs_contracting_dim_) {
     *listener << instruction->ToString()
               << " has wrong rhs_contracting_dimensions (got {"
-              << Join(dim_nums.rhs_contracting_dimensions(), ",") << "} want {"
-              << rhs_contracting_dim_ << "})";
+              << absl::StrJoin(dim_nums.rhs_contracting_dimensions(), ",")
+              << "} want {" << rhs_contracting_dim_ << "})";
     return false;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index c577b4359a..9ace0d76e0 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
 namespace testing {
@@ -120,8 +120,7 @@ class HloShapeAndLayoutMatcher
 class HloShardingMatcher
     : public ::testing::MatcherInterface<const HloInstruction*> {
  public:
-  explicit HloShardingMatcher(
-      const tensorflow::gtl::optional<HloSharding>& sharding)
+  explicit HloShardingMatcher(const absl::optional<HloSharding>& sharding)
       : sharding_(sharding) {}
 
   bool MatchAndExplain(const HloInstruction* instruction,
@@ -129,7 +128,7 @@ class HloShardingMatcher
   void DescribeTo(std::ostream* os) const override;
 
  private:
-  tensorflow::gtl::optional<HloSharding> sharding_;
+  absl::optional<HloSharding> sharding_;
 };
 
 // Matches a Dot HLO instruction with specific LHS and RHS contracting
@@ -307,7 +306,7 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
   return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(shape));
 }
 inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
-    tensorflow::StringPiece shape) {
+    absl::string_view shape) {
   return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(
       ShapeUtil::ParseShapeString(shape).ValueOrDie()));
 }
@@ -317,7 +316,7 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
       new ::xla::testing::HloShapeAndLayoutMatcher(shape));
 }
 inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
-    tensorflow::StringPiece shape) {
+    absl::string_view shape) {
   return ::testing::MakeMatcher(new ::xla::testing::HloShapeAndLayoutMatcher(
       ShapeUtil::ParseShapeString(shape).ValueOrDie()));
 }
@@ -330,14 +329,14 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> Sharding(
 }
 // Matcher for Sharding from sharding string
 inline ::testing::Matcher<const ::xla::HloInstruction*> Sharding(
-    tensorflow::StringPiece sharding) {
+    absl::string_view sharding) {
   return ::testing::MakeMatcher(new ::xla::testing::HloShardingMatcher(
       ParseSharding(sharding).ValueOrDie()));
 }
 // Verifies that no HloSharding is set for an HLO instruction.
 inline ::testing::Matcher<const ::xla::HloInstruction*> NoSharding() {
   return ::testing::MakeMatcher(
-      new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt));
+      new ::xla::testing::HloShardingMatcher(absl::nullopt));
 }
 
 inline ::testing::Matcher<const ::xla::HloInstruction*> Dot(
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 55ff073d3f..78167335c8 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -22,12 +22,13 @@ limitations under the License.
 #include <unordered_set>
 #include <utility>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -274,7 +275,7 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   }
   TF_RET_CHECK(entry != nullptr);
 
-  auto module = MakeUnique<HloModule>(proto.name(), module_config);
+  auto module = absl::make_unique<HloModule>(proto.name(), module_config);
 
   // Sort the computations in the proto id's order.
   std::sort(computations.begin(), computations.end(),
@@ -409,7 +410,7 @@ HloInstruction* HloModule::OutlineExpressionFromComputation(
     string error_message =
         "The subcomputation to outline has multiple outputs:\n";
     for (HloInstruction* output : outputs) {
-      tensorflow::strings::StrAppend(&error_message, output->ToString(), "\n");
+      absl::StrAppend(&error_message, output->ToString(), "\n");
     }
     LOG(FATAL) << error_message;
   }
@@ -507,7 +508,7 @@ std::vector<HloComputation*> HloModule::MakeNonfusionComputations() const {
 
 std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
   VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n";
-  auto module = MakeUnique<HloModule>(name_ + "-" + suffix, config_);
+  auto module = absl::make_unique<HloModule>(name_ + "-" + suffix, config_);
 
   HloCloneContext context(module.get(), suffix);
   auto cloned_computation = entry_computation_->Clone(suffix, &context);
@@ -535,12 +536,11 @@ uint64 HloModule::RandomNew64() const {
   return rng_();
 }
 
-HloComputation* HloModule::GetComputationWithName(
-    tensorflow::StringPiece name) {
+HloComputation* HloModule::GetComputationWithName(absl::string_view name) {
   auto computations_in_module = computations();
-  auto it = c_find_if(computations_in_module, [&](HloComputation* computation) {
-    return computation->name() == name;
-  });
+  auto it = absl::c_find_if(
+      computations_in_module,
+      [&](HloComputation* computation) { return computation->name() == name; });
   return it == computations_in_module.end() ? nullptr : *it;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index d2e726a0db..cf129b835d 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/iterator_util.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_clone_context.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/iterator_range.h"
 #include "tensorflow/core/platform/logging.h"
@@ -142,7 +142,7 @@ class HloModule {
 
   // Returns the computation in this module that has the name `name`.  Returns
   // null if there is no such computation.
-  HloComputation* GetComputationWithName(tensorflow::StringPiece name);
+  HloComputation* GetComputationWithName(absl::string_view name);
 
   // Gets the number of computations in this module.
   int64 computation_count() const { return computations_.size(); }
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc
index 07a8c798db..9bfa3a5f45 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_config.cc
@@ -18,15 +18,15 @@ limitations under the License.
 #include <atomic>
 #include <vector>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 
-using tensorflow::strings::StrAppend;
+using absl::StrAppend;
 
 HloModuleConfig::HloModuleConfig(const ProgramShape& program_shape,
                                  bool ignore_layouts)
@@ -39,15 +39,14 @@ void HloModuleConfig::SetDefaultComputationLayout(
 }
 
 string HloModuleConfig::compilation_cache_key() const {
-  string key =
-      tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled());
+  string key = absl::StrCat("profiling=", hlo_profiling_enabled());
   StrAppend(&key, "::(");
   std::vector<string> params;
   for (const ShapeLayout& param_layout :
        entry_computation_layout_->parameter_layouts()) {
     params.push_back(param_layout.shape().DebugString());
   }
-  StrAppend(&key, tensorflow::str_util::Join(params, ", "), ") => ",
+  StrAppend(&key, absl::StrJoin(params, ", "), ") => ",
             entry_computation_layout_->result_shape().SerializeAsString());
   if (seed() != 0) {
     // TODO(b/32083678): force recompilation to reset global state.
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h
index 074e9c9070..3f1e1cc73e 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.h
+++ b/tensorflow/compiler/xla/service/hlo_module_config.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
 
@@ -72,15 +72,6 @@ class HloModuleConfig {
     return debug_options_.xla_hlo_profile();
   }
 
-  // Sets/returns whether this is a "host module".  Host modules are used to
-  // record the data- and control-flow dependencies of host side computation
-  // that communicates with compiled code.  They are used for analysis and
-  // scheduling purposes, but no code is generated.
-  bool is_host_module() const { return is_host_module_; }
-  void set_is_host_module(bool is_host_module) {
-    is_host_module_ = is_host_module;
-  }
-
   // Sets/returns the module seed set during execution.
   void set_seed(uint64 seed) { seed_ = seed; }
   uint64 seed() const { return seed_; }
@@ -113,7 +104,7 @@ class HloModuleConfig {
  private:
   // If you add new members, be sure to update compilation_cache_key.
 
-  tensorflow::gtl::optional<ComputationLayout> entry_computation_layout_;
+  absl::optional<ComputationLayout> entry_computation_layout_;
 
   // Whether this is a 'host module'.
   bool is_host_module_ = false;
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce.h b/tensorflow/compiler/xla/service/hlo_module_dce.h
index 29024085c1..12ca2340a6 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce.h
+++ b/tensorflow/compiler/xla/service/hlo_module_dce.h
@@ -31,7 +31,7 @@ namespace xla {
 class HloModuleDCE : public HloPassInterface {
  public:
   ~HloModuleDCE() override {}
-  tensorflow::StringPiece name() const override { return "hlo-module-dce"; }
+  absl::string_view name() const override { return "hlo-module-dce"; }
 
   // Run the pass on the given module. Returns whether the module was changed
   // (instructions were removed).
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 10bf9ffd6c..f52a37bc74 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -59,7 +59,7 @@ string HloModuleGroupMetadata::TrackedInstruction::ToString() const {
 
 /* static */ StatusOr<std::unique_ptr<HloModuleGroupMetadata>>
 HloModuleGroupMetadata::Build(const std::vector<HloModule*>& modules) {
-  auto metadata = MakeUnique<HloModuleGroupMetadata>(modules);
+  auto metadata = absl::make_unique<HloModuleGroupMetadata>(modules);
   TF_RETURN_IF_ERROR(metadata->Build());
   return std::move(metadata);
 }
@@ -204,6 +204,10 @@ const HloModuleGroupMetadata::Channel& HloModuleGroupMetadata::GetChannel(
   return channels_[channel_id_map_.at(channel_id)];
 }
 
+bool HloModuleGroupMetadata::HasChannel(int64 channel_id) const {
+  return channel_id_map_.find(channel_id) != channel_id_map_.end();
+}
+
 HloComputation* HloModuleGroupMetadata::PeerComputation(
     const HloInstruction* instruction) const {
   CHECK(IsChannelInstruction(instruction));
@@ -267,15 +271,14 @@ int64 HloModuleGroupMetadata::GetModuleId(const HloModule* module) const {
   LOG(FATAL) << "unknown module";
 }
 
-tensorflow::gtl::optional<int64> HloModuleGroupMetadata::GetInstructionDevice(
+absl::optional<int64> HloModuleGroupMetadata::GetInstructionDevice(
     const HloInstruction& instruction) const {
   // The module group metadata can be created in both "single module, multiple
   // devices" and "multiple modules, no explicit devices" fashions.
   // The API returns an optional even though the current implementation always
   // returns a device, to account for cases where we cannot guess a device.
   // In such cases the VerifyChannelInstructions() will return proper errors.
-  tensorflow::gtl::optional<int64> device =
-      instruction.sharding_unique_device();
+  absl::optional<int64> device = instruction.sharding_unique_device();
   if (!device) {
     device = GetModuleId(instruction.parent()->parent());
   }
@@ -283,10 +286,7 @@ tensorflow::gtl::optional<int64> HloModuleGroupMetadata::GetInstructionDevice(
 }
 
 int64 HloModuleGroupMetadata::GetDeviceModulesCount() const {
-  return std::count_if(modules_.begin(), modules_.end(),
-                       [](const HloModule* module) {
-                         return !module->config().is_host_module();
-                       });
+  return modules_.size();
 }
 
 Status HloModuleGroupMetadata::RecordInstructions() {
@@ -383,7 +383,7 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
   if (!ContainsKey(companion_set_index_, instruction1) &&
       !ContainsKey(companion_set_index_, instruction2)) {
     companion_sets_.push_back(
-        tensorflow::MakeUnique<std::unordered_set<HloInstruction*>>());
+        absl::make_unique<std::unordered_set<HloInstruction*>>());
     auto companion_set = companion_sets_.back().get();
     companion_set->insert(instruction1);
     companion_set->insert(instruction2);
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 84f2d3f5fb..dead6d9c20 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -29,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -125,6 +125,9 @@ class HloModuleGroupMetadata {
   // Returns the Channel instance for the given channel id.
   const Channel& GetChannel(int64 channel_id) const;
 
+  // Returns if the given channel id exists in metadata.
+  bool HasChannel(int64 channel_id) const;
+
   // Returns the all-reduce instructions with the same all_reduce_id.
   const std::vector<HloInstruction*>& GetAllReduceGroup(
       int64 all_reduce_id) const;
@@ -156,7 +159,7 @@ class HloModuleGroupMetadata {
   // Retrieves the device an instruction is assigned to. Either from the
   // sharding information, or from the ordinal of the module the instruction
   // is in.
-  tensorflow::gtl::optional<int64> GetInstructionDevice(
+  absl::optional<int64> GetInstructionDevice(
       const HloInstruction& instruction) const;
 
   // Returns the number of modules for devices (excluding the host module).
@@ -166,7 +169,7 @@ class HloModuleGroupMetadata {
   //
   // Precondition: IsCompanionWhile(instruction) is true.
   const std::unordered_set<HloInstruction*>& Companions(
-      HloInstruction* instruction) const {
+      const HloInstruction* instruction) const {
     CHECK_EQ(companion_set_index_.count(instruction), 1);
     return companion_set(companion_set_index_.at(instruction));
   }
@@ -243,7 +246,7 @@ class HloModuleGroupMetadata {
       companion_sets_;
 
   // Map from each companion while instruction to the index into companion_set_.
-  tensorflow::gtl::FlatMap<HloInstruction*, int64> companion_set_index_;
+  tensorflow::gtl::FlatMap<const HloInstruction*, int64> companion_set_index_;
 
   // Map from computation to the instruction using it (a kWhile, kConditional).
   tensorflow::gtl::FlatMap<const HloComputation*, TrackedInstruction>
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
index 9fd0ade153..b5c7681edd 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
@@ -22,14 +22,17 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -37,24 +40,38 @@ namespace xla {
 
 std::vector<HloInstruction*> HloModuleGroupUtil::GlobalPredecessors(
     HloInstruction* instruction) {
-  std::vector<HloInstruction*> predecessors;
-
-  // Adds to the unique predecessors list and also add companion instructions
-  // if the given predecessor has those.
+  std::vector<HloInstruction*>
+      predecessors;  // Use a vector to avoid non-determinism.
+  tensorflow::gtl::FlatSet<HloInstruction*> unique;
+
+  // Adds to the unique predecessors list; if the predecessors is a companion
+  // instruction, also add companion instructions; if the predecessors is a
+  // cross-module all-reduce, also add the all-reduce instructions in the same
+  // group.
   auto add_unique_predecessor = [&](HloInstruction* predecessor) {
-    if (std::find(predecessors.begin(), predecessors.end(), predecessor) !=
-        predecessors.end()) {
+    if (unique.find(predecessor) != unique.end()) {
       return;
     }
-    if (!metadata_.IsCompanionInstruction(predecessor)) {
-      predecessors.push_back(predecessor);
+    if (metadata_.IsCompanionInstruction(predecessor)) {
+      for (HloInstruction* instr : metadata_.Companions(predecessor)) {
+        if (unique.insert(instr).second) {
+          predecessors.push_back(instr);
+        }
+      }
       return;
     }
-    for (HloInstruction* companion : metadata_.Companions(predecessor)) {
-      predecessors.push_back(companion);
+    if (predecessor->IsCrossModuleAllReduce()) {
+      for (HloInstruction* instr :
+           metadata_.GetAllReduceGroup(*predecessor->all_reduce_id())) {
+        if (unique.insert(instr).second) {
+          predecessors.push_back(instr);
+        }
+      }
+      return;
     }
+    unique.insert(predecessor);
+    predecessors.push_back(predecessor);
   };
-
   // If the given instruction is a companion instruction, we need to find the
   // predecessors of all of its companion instructions. If the instruction is an
   // all-reduce, we need to find the predecessors of all the peer all-reduce
@@ -79,12 +96,14 @@ std::vector<HloInstruction*> HloModuleGroupUtil::GlobalPredecessors(
       add_unique_predecessor(control_predecessor);
     }
   }
-  if (instruction->opcode() == HloOpcode::kRecvDone) {
+  if (instruction->opcode() == HloOpcode::kRecvDone &&
+      !DynCast<HloRecvDoneInstruction>(instruction)->is_host_transfer()) {
     // Send is a remote predecessor of RecvDone.
     HloInstruction* send = metadata_.GetChannel(instruction->channel_id()).send;
     add_unique_predecessor(send);
   }
-  if (instruction->opcode() == HloOpcode::kSend) {
+  if (instruction->opcode() == HloOpcode::kSend &&
+      !DynCast<HloSendInstruction>(instruction)->is_host_transfer()) {
     // Recv is a remote predecessor of Send.
     HloInstruction* recv_done =
         metadata_.GetChannel(instruction->channel_id()).recv_done;
@@ -98,22 +117,37 @@ std::vector<HloInstruction*> HloModuleGroupUtil::GlobalPredecessors(
 
 std::vector<HloInstruction*> HloModuleGroupUtil::GlobalSuccessors(
     HloInstruction* instruction) {
-  std::vector<HloInstruction*> successors;
-
-  // Adds to the unique successors list and also add companion instructions
-  // if the given successor has those.
+  std::vector<HloInstruction*>
+      successors;  // Use a vector to avoid non-determinism.
+  tensorflow::gtl::FlatSet<HloInstruction*> unique;
+
+  // Adds to the unique successors list; if the successor is a companion
+  // instruction, also add companion instructions; if the successor is a
+  // cross-module all-reduce, also add the all-reduce instructions in the same
+  // group.
   auto add_unique_successor = [&](HloInstruction* successor) {
-    if (std::find(successors.begin(), successors.end(), successor) !=
-        successors.end()) {
+    if (unique.find(successor) != unique.end()) {
       return;
     }
-    if (!metadata_.IsCompanionInstruction(successor)) {
-      successors.push_back(successor);
+    if (metadata_.IsCompanionInstruction(successor)) {
+      for (HloInstruction* instr : metadata_.Companions(successor)) {
+        if (unique.insert(instr).second) {
+          successors.push_back(instr);
+        }
+      }
       return;
     }
-    for (HloInstruction* companion : metadata_.Companions(successor)) {
-      successors.push_back(companion);
+    if (successor->IsCrossModuleAllReduce()) {
+      for (HloInstruction* instr :
+           metadata_.GetAllReduceGroup(*successor->all_reduce_id())) {
+        if (unique.insert(instr).second) {
+          successors.push_back(instr);
+        }
+      }
+      return;
     }
+    unique.insert(successor);
+    successors.push_back(successor);
   };
 
   // If the given instruction is a companion instruction, we need to find the
@@ -140,14 +174,16 @@ std::vector<HloInstruction*> HloModuleGroupUtil::GlobalSuccessors(
       add_unique_successor(control_successor);
     }
   }
-  if (instruction->opcode() == HloOpcode::kRecv) {
+  if (instruction->opcode() == HloOpcode::kRecv &&
+      !DynCast<HloRecvInstruction>(instruction)->is_host_transfer()) {
     // Send is a remote successor of Recv.
     const HloInstruction* recv_done = instruction->users().front();
     CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
     HloInstruction* send = metadata_.GetChannel(instruction->channel_id()).send;
     add_unique_successor(send);
   }
-  if (instruction->opcode() == HloOpcode::kSend) {
+  if (instruction->opcode() == HloOpcode::kSend &&
+      !DynCast<HloSendInstruction>(instruction)->is_host_transfer()) {
     // RecvDone is a remote successor of Send.
     HloInstruction* recv_done =
         metadata_.GetChannel(instruction->channel_id()).recv_done;
@@ -234,8 +270,8 @@ Status HloModuleGroupUtil::VisitTopologicalOrder(
           string cyclic_instructions;
           for (const auto& state : *visit_state) {
             if (state.second == VisitState::kVisiting) {
-              tensorflow::strings::StrAppend(&cyclic_instructions,
-                                             state.first->ToString(), "\n");
+              absl::StrAppend(&cyclic_instructions, state.first->ToString(),
+                              "\n");
             }
           }
           // TODO(b/64305524): Improve the error message to print out the
@@ -302,7 +338,7 @@ HloModuleGroupUtil::ComputeReachability(
     TF_RETURN_IF_ERROR(
         VisitTopologicalOrder(&visit_states, visit_function, root));
   }
-  auto reachability = MakeUnique<HloReachabilityMap>(post_order);
+  auto reachability = absl::make_unique<HloReachabilityMap>(post_order);
   for (HloInstruction* hlo : post_order) {
     reachability->FastSetReachabilityToUnion(GlobalPredecessors(hlo), hlo);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 236f450086..209ad5e58c 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index ec279867e5..b8f2a21ff9 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -85,7 +85,6 @@ namespace xla {
   V(kAfterAll, "after-all", kHloOpcodeIsVariadic)            \
   V(kGetTupleElement, "get-tuple-element")                   \
   V(kGt, "greater-than", kHloOpcodeIsComparison)             \
-  V(kHostCompute, "host-compute")                            \
   V(kImag, "imag")                                           \
   V(kInfeed, "infeed")                                       \
   V(kIota, "iota")                                           \
@@ -156,7 +155,7 @@ enum HloOpcodeProperty {
 // Returns a string representation of the opcode.
 string HloOpcodeString(HloOpcode opcode);
 
-// Returns a string representation of the opcode.
+// Retrieves the opcode enum by name if the opcode exists.
 StatusOr<HloOpcode> StringToHloOpcode(const string& opcode_name);
 
 inline std::ostream& operator<<(std::ostream& os, HloOpcode opcode) {
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 6c1e015f77..8fe91c7278 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -254,6 +254,10 @@ bool HloOrdering::LiveRangeStrictlyBefore(
   }
   // All uses of 'a' must be before 'b' is defined.
   for (const HloUse& use : a.uses()) {
+    if (dataflow.DoesNotUseOperandBuffer(a.instruction(), a.index(),
+                                         use.instruction)) {
+      continue;
+    }
     if (!UseIsBeforeValueDefinition(use, b, dataflow)) {
       VLOG(4) << "use of " << a << " (" << use << ") not before " << b
               << " is defined";
@@ -317,7 +321,7 @@ string PredecessorHloOrdering::ToStringHelper(const string& name) const {
       }
     }
   }
-  return tensorflow::str_util::Join(pieces, "\n");
+  return absl::StrJoin(pieces, "\n");
 }
 
 DependencyHloOrdering::DependencyHloOrdering(const HloModule* module)
@@ -388,7 +392,7 @@ string SequentialHloOrdering::ToString() const {
           tensorflow::strings::Printf("  %s", instruction->name().c_str()));
     }
   }
-  return tensorflow::str_util::Join(pieces, "\n");
+  return absl::StrJoin(pieces, "\n");
 }
 
 std::ostream& operator<<(
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 4b3cd99dc0..df789e6222 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -15,6 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
@@ -24,21 +29,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
 
 namespace {
 
-using ::tensorflow::StringPiece;
-using ::tensorflow::gtl::optional;
-using ::tensorflow::str_util::Join;
-using ::tensorflow::str_util::Split;
-using ::tensorflow::str_util::SplitAndParseAsInts;
+using absl::nullopt;
+using absl::optional;
+using absl::StrAppend;
+using absl::StrCat;
+using absl::StrJoin;
 using ::tensorflow::strings::Printf;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
 
 const double kF16max = 65504;
 
@@ -47,7 +49,7 @@ class HloParser {
  public:
   using LocTy = HloLexer::LocTy;
 
-  explicit HloParser(StringPiece str, const HloModuleConfig& config)
+  explicit HloParser(absl::string_view str, const HloModuleConfig& config)
       : lexer_(str), config_(config) {}
 
   // Runs the parser. Returns false if an error occurred.
@@ -57,14 +59,28 @@ class HloParser {
   std::unique_ptr<HloModule> ConsumeHloModule() { return std::move(module_); }
 
   // Returns the error information.
-  string GetError() const { return Join(error_, "\n"); }
+  string GetError() const { return StrJoin(error_, "\n"); }
 
   // Stand alone parsing utils for various aggregate data types.
   StatusOr<HloSharding> ParseShardingOnly();
   StatusOr<Window> ParseWindowOnly();
   StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbersOnly();
 
+  // Stand-alone parsing utility for a single instruction worth of text.
+  Status ParseSingleInstruction(HloComputation::Builder* builder,
+                                string* root_name);
+
  private:
+  // Locates an instruction with the given name in the instruction_pool_ or
+  // returns nullptr.
+  //
+  // If the missing_instruction_hook_ is registered and a "shape" is provided,
+  // the hook will be called and may satisfy the request for the given
+  // instruction. This is useful when we reify parameters as they're resolved;
+  // i.e. for ParseSingleInstruction.
+  std::pair<HloInstruction*, LocTy>* FindInstruction(
+      const string& name, const optional<Shape>& shape = nullopt);
+
   // ParseXXX returns false if an error occurred.
   bool ParseHloModule();
   bool ParseComputations();
@@ -138,6 +154,7 @@ class HloParser {
     kFusionKind,
     kDistribution,
     kDomain,
+    kPrecisionList,
   };
 
   struct AttrConfig {
@@ -203,6 +220,7 @@ class HloParser {
   bool ParseWindowPad(std::vector<std::vector<tensorflow::int64>>* pad);
 
   bool ParseSliceRanges(SliceRanges* result);
+  bool ParsePrecisionList(std::vector<PrecisionConfigProto::Precision>* result);
   bool ParseInt64List(const TokKind start, const TokKind end,
                       const TokKind delim,
                       std::vector<tensorflow::int64>* result);
@@ -221,6 +239,7 @@ class HloParser {
   bool ParseFftType(FftType* result);
   bool ParseFusionKind(HloInstruction::FusionKind* result);
   bool ParseRandomDistribution(RandomDistribution* result);
+  bool ParsePrecision(PrecisionConfigProto::Precision* result);
   bool ParseInt64(tensorflow::int64* result);
   bool ParseDouble(double* result);
   bool ParseBool(bool* result);
@@ -233,8 +252,8 @@ class HloParser {
   bool CanBeParamListToShape();
 
   // Logs the current parsing line and the given message. Always returns false.
-  bool TokenError(StringPiece msg);
-  bool Error(LocTy loc, StringPiece msg);
+  bool TokenError(absl::string_view msg);
+  bool Error(LocTy loc, absl::string_view msg);
 
   // If the current token is 'kind', eats it (i.e. lexes the next token) and
   // returns true.
@@ -265,9 +284,40 @@ class HloParser {
   std::vector<std::unique_ptr<HloComputation>> computations_;
   const HloModuleConfig config_;
   std::vector<string> error_;
+
+  // Function that gets invoked when we try to resolve an instruction
+  // instruction_pool_ but fail to do so.
+  std::function<std::pair<HloInstruction*, LocTy>*(string,
+                                                   const optional<Shape>&)>
+      missing_instruction_hook_;
 };
 
-bool HloParser::Error(LocTy loc, StringPiece msg) {
+bool SplitToInt64s(absl::string_view s, char delim, std::vector<int64>* out) {
+  for (const auto& split : absl::StrSplit(s, delim)) {
+    int64 val;
+    if (!absl::SimpleAtoi(split, &val)) {
+      return false;
+    }
+    out->push_back(val);
+  }
+  return true;
+}
+
+// Creates replica groups from the provided nested array. groups[i] represents
+// the replica ids for group 'i'.
+std::vector<ReplicaGroup> CreateReplicaGroups(
+    tensorflow::gtl::ArraySlice<std::vector<int64>> groups) {
+  std::vector<ReplicaGroup> replica_groups;
+  absl::c_transform(groups, std::back_inserter(replica_groups),
+                    [](const std::vector<int64>& ids) {
+                      ReplicaGroup group;
+                      *group.mutable_replica_ids() = {ids.begin(), ids.end()};
+                      return group;
+                    });
+  return replica_groups;
+}
+
+bool HloParser::Error(LocTy loc, absl::string_view msg) {
   auto line_col = lexer_.GetLineAndColumn(loc);
   const unsigned line = line_col.first;
   const unsigned col = line_col.second;
@@ -277,12 +327,12 @@ bool HloParser::Error(LocTy loc, StringPiece msg) {
   error_lines.push_back(std::string(lexer_.GetLine(loc)));
   error_lines.push_back(col == 0 ? "" : StrCat(string(col - 1, ' '), "^"));
 
-  error_.push_back(Join(error_lines, "\n"));
+  error_.push_back(StrJoin(error_lines, "\n"));
   VLOG(1) << "Error: " << error_.back();
   return false;
 }
 
-bool HloParser::TokenError(StringPiece msg) {
+bool HloParser::TokenError(absl::string_view msg) {
   return Error(lexer_.GetLoc(), msg);
 }
 
@@ -291,6 +341,17 @@ bool HloParser::Run() {
   return ParseHloModule();
 }
 
+std::pair<HloInstruction*, HloParser::LocTy>* HloParser::FindInstruction(
+    const string& name, const optional<Shape>& shape) {
+  std::pair<HloInstruction*, LocTy>* instr =
+      tensorflow::gtl::FindOrNull(instruction_pool_, name);
+  // Potentially call the missing instruction hook.
+  if (instr == nullptr && missing_instruction_hook_ != nullptr) {
+    return missing_instruction_hook_(name, shape);
+  }
+  return instr;
+}
+
 // ::= 'HloModule' name computations
 bool HloParser::ParseHloModule() {
   if (lexer_.GetKind() != TokKind::kw_HloModule) {
@@ -304,7 +365,7 @@ bool HloParser::ParseHloModule() {
     return false;
   }
 
-  module_ = MakeUnique<HloModule>(name, config_);
+  module_ = absl::make_unique<HloModule>(name, config_);
 
   return ParseComputations();
 }
@@ -357,7 +418,7 @@ bool HloParser::ParseComputation(HloComputation** entry_computation) {
   if (!ParseName(&name)) {
     return false;
   }
-  auto builder = MakeUnique<HloComputation::Builder>(name);
+  auto builder = absl::make_unique<HloComputation::Builder>(name);
 
   LocTy shape_loc = nullptr;
   Shape shape;
@@ -370,8 +431,7 @@ bool HloParser::ParseComputation(HloComputation** entry_computation) {
     return false;
   }
 
-  std::pair<HloInstruction*, LocTy>* root_node =
-      tensorflow::gtl::FindOrNull(instruction_pool_, root_name);
+  std::pair<HloInstruction*, LocTy>* root_node = FindInstruction(root_name);
   // This means some instruction was marked as ROOT but we didn't find it in the
   // pool, which should not happen.
   if (!root_name.empty() && root_node == nullptr) {
@@ -469,6 +529,10 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   attrs["backend_config"] = {/*required=*/false, AttrTy::kString,
                              &backend_config};
 
+  optional<std::vector<PrecisionConfigProto::Precision>> operand_precision;
+  attrs["operand_precision"] = {/*required=*/false, AttrTy::kPrecisionList,
+                                &operand_precision};
+
   HloInstruction* instruction;
   switch (opcode) {
     case HloOpcode::kParameter: {
@@ -597,31 +661,29 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kCrossReplicaSum: {
+      optional<std::vector<std::vector<int64>>> tmp_groups;
       optional<HloComputation*> to_apply;
       optional<std::vector<int64>> replica_group_ids;
       optional<string> barrier;
       optional<int64> all_reduce_id;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &to_apply};
-      attrs["replica_group_ids"] = {
-          /*required=*/false, AttrTy::kBracedInt64List, &replica_group_ids};
+      attrs["replica_groups"] = {/*required=*/false,
+                                 AttrTy::kBracedInt64ListList, &tmp_groups};
       attrs["barrier"] = {/*required=*/false, AttrTy::kString, &barrier};
       attrs["all_reduce_id"] = {/*required=*/false, AttrTy::kInt64,
                                 &all_reduce_id};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
-      if (replica_group_ids) {
-        instruction =
-            builder->AddInstruction(HloInstruction::CreateCrossReplicaSum(
-                shape, operands, *to_apply, *replica_group_ids,
-                barrier ? *barrier : "", all_reduce_id));
-      } else {
-        instruction =
-            builder->AddInstruction(HloInstruction::CreateCrossReplicaSum(
-                shape, operands, *to_apply, {}, barrier ? *barrier : "",
-                all_reduce_id));
+      std::vector<ReplicaGroup> replica_groups;
+      if (tmp_groups) {
+        replica_groups = CreateReplicaGroups(*tmp_groups);
       }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateCrossReplicaSum(
+              shape, operands, *to_apply, replica_groups,
+              barrier ? *barrier : "", all_reduce_id));
       break;
     }
     case HloOpcode::kAllToAll: {
@@ -629,21 +691,15 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       optional<string> barrier;
       attrs["replica_groups"] = {/*required=*/false,
                                  AttrTy::kBracedInt64ListList, &tmp_groups};
-      attrs["barrier"] = {/*required=*/false, AttrTy::kString, &barrier};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
       std::vector<ReplicaGroup> replica_groups;
       if (tmp_groups) {
-        c_transform(*tmp_groups, std::back_inserter(replica_groups),
-                    [](const std::vector<int64>& ids) {
-                      ReplicaGroup group;
-                      *group.mutable_replica_ids() = {ids.begin(), ids.end()};
-                      return group;
-                    });
+        replica_groups = CreateReplicaGroups(*tmp_groups);
       }
-      instruction = builder->AddInstruction(HloInstruction::CreateAllToAll(
-          shape, operands, replica_groups, barrier ? *barrier : ""));
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateAllToAll(shape, operands, replica_groups));
       break;
     }
     case HloOpcode::kReshape: {
@@ -825,9 +881,12 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kConvolution: {
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
+      optional<int64> feature_group_count;
       attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
       attrs["dim_labels"] = {/*required=*/true,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
+      attrs["feature_group_count"] = {/*required=*/false, AttrTy::kInt64,
+                                      &feature_group_count};
       if (!ParseOperands(&operands, /*expected_size=*/2) ||
           !ParseAttributes(attrs)) {
         return false;
@@ -835,8 +894,12 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       if (!window) {
         window.emplace();
       }
+      if (!feature_group_count) {
+        feature_group_count = 1;
+      }
       instruction = builder->AddInstruction(HloInstruction::CreateConvolve(
-          shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums));
+          shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums,
+          feature_group_count.value()));
       break;
     }
     case HloOpcode::kFft: {
@@ -1073,7 +1136,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kInfeed: {
       optional<string> config;
       attrs["infeed_config"] = {/*required=*/false, AttrTy::kString, &config};
-      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       // We need to know the infeed data shape to construct the infeed
@@ -1085,41 +1149,21 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
         return Error(lexer_.GetLoc(),
                      "infeed must have a non-empty tuple shape");
       }
-
-      if (operands.empty()) {
-        // TODO(b/80000000): Remove this when all uses of infeed are
-        // converted to take tokens.
-        instruction = builder->AddInstruction(HloInstruction::CreateInfeed(
-            ShapeUtil::GetTupleElementShape(shape, 0), config ? *config : ""));
-      } else if (operands.size() == 1) {
-        instruction = builder->AddInstruction(HloInstruction::CreateInfeed(
-            ShapeUtil::GetTupleElementShape(shape, 0), operands[0],
-            config ? *config : ""));
-      } else {
-        return Error(lexer_.GetLoc(),
-                     "infeed must have exactly zero or one operands");
-      }
+      instruction = builder->AddInstruction(HloInstruction::CreateInfeed(
+          ShapeUtil::GetTupleElementShape(shape, 0), operands[0],
+          config ? *config : ""));
       break;
     }
     case HloOpcode::kOutfeed: {
       optional<string> config;
       attrs["outfeed_config"] = {/*required=*/false, AttrTy::kString, &config};
-      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
-      if (operands.size() == 1) {
-        // TODO(b/80000000): Remove this when all uses of outfeed are
-        // converted to take tokens.
-        instruction = builder->AddInstruction(HloInstruction::CreateOutfeed(
-            operands[0]->shape(), operands[0], config ? *config : ""));
-      } else if (operands.size() == 2) {
-        instruction = builder->AddInstruction(
-            HloInstruction::CreateOutfeed(operands[0]->shape(), operands[0],
-                                          operands[1], config ? *config : ""));
-      } else {
-        return Error(lexer_.GetLoc(),
-                     "outfeed must have exactly one or two operands");
-      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateOutfeed(operands[0]->shape(), operands[0],
+                                        operands[1], config ? *config : ""));
       break;
     }
     case HloOpcode::kRng: {
@@ -1189,20 +1233,6 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       }
       break;
     }
-    case HloOpcode::kHostCompute: {
-      optional<string> channel_name;
-      optional<tensorflow::int64> cost_estimate_ns;
-      attrs["channel_name"] = {/*required=*/true, AttrTy::kString,
-                               &channel_name};
-      attrs["cost_estimate_ns"] = {/*required=*/true, AttrTy::kInt64,
-                                   &cost_estimate_ns};
-      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
-        return false;
-      }
-      instruction = builder->AddInstruction(HloInstruction::CreateHostCompute(
-          shape, operands, *channel_name, *cost_estimate_ns));
-      break;
-    }
     case HloOpcode::kDot: {
       optional<std::vector<tensorflow::int64>> lhs_contracting_dims;
       attrs["lhs_contracting_dims"] = {
@@ -1245,22 +1275,21 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGather: {
-      optional<std::vector<tensorflow::int64>> output_window_dims;
-      attrs["output_window_dims"] = {
-          /*required=*/true, AttrTy::kBracedInt64List, &output_window_dims};
-      optional<std::vector<tensorflow::int64>> elided_window_dims;
-      attrs["elided_window_dims"] = {
-          /*required=*/true, AttrTy::kBracedInt64List, &elided_window_dims};
-      optional<std::vector<tensorflow::int64>> gather_dims_to_operand_dims;
-      attrs["gather_dims_to_operand_dims"] = {/*required=*/true,
-                                              AttrTy::kBracedInt64List,
-                                              &gather_dims_to_operand_dims};
+      optional<std::vector<tensorflow::int64>> offset_dims;
+      attrs["offset_dims"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                              &offset_dims};
+      optional<std::vector<tensorflow::int64>> collapsed_slice_dims;
+      attrs["collapsed_slice_dims"] = {
+          /*required=*/true, AttrTy::kBracedInt64List, &collapsed_slice_dims};
+      optional<std::vector<tensorflow::int64>> start_index_map;
+      attrs["start_index_map"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                                  &start_index_map};
       optional<tensorflow::int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
-      optional<std::vector<tensorflow::int64>> window_bounds;
-      attrs["window_bounds"] = {/*required=*/true, AttrTy::kBracedInt64List,
-                                &window_bounds};
+      optional<std::vector<tensorflow::int64>> slice_sizes;
+      attrs["slice_sizes"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                              &slice_sizes};
 
       if (!ParseOperands(&operands, /*expected_size=*/2) ||
           !ParseAttributes(attrs)) {
@@ -1269,14 +1298,14 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
 
       GatherDimensionNumbers dim_numbers =
           HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/*output_window_dims,
-              /*elided_window_dims=*/*elided_window_dims,
-              /*gather_dims_to_operand_dims=*/*gather_dims_to_operand_dims,
+              /*offset_dims=*/*offset_dims,
+              /*collapsed_slice_dims=*/*collapsed_slice_dims,
+              /*start_index_map=*/*start_index_map,
               /*index_vector_dim=*/*index_vector_dim);
 
       instruction = builder->AddInstruction(HloInstruction::CreateGather(
-          shape, /*operand=*/operands[0], /*gather_indices=*/operands[1],
-          dim_numbers, *window_bounds));
+          shape, /*operand=*/operands[0], /*start_indices=*/operands[1],
+          dim_numbers, *slice_sizes));
       break;
     }
     case HloOpcode::kScatter: {
@@ -1359,6 +1388,12 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   if (backend_config) {
     instruction->set_raw_backend_config_string(std::move(*backend_config));
   }
+  if (operand_precision) {
+    PrecisionConfigProto precision_config;
+    *precision_config.mutable_operand_precision() = {operand_precision->begin(),
+                                                     operand_precision->end()};
+    instruction->set_precision_config(precision_config);
+  }
   return AddInstruction(name, instruction, name_loc);
 }  // NOLINT(readability/fn_size)
 
@@ -1522,14 +1557,14 @@ bool HloParser::ParseDomain(DomainData* domain) {
     return false;
   }
   if (*kind == ShardingMetadata::KindName()) {
-    auto entry_sharding_ptr = MakeUnique<HloSharding>(
+    auto entry_sharding_ptr = absl::make_unique<HloSharding>(
         HloSharding::FromProto(*entry_sharding).ValueOrDie());
-    auto exit_sharding_ptr = MakeUnique<HloSharding>(
+    auto exit_sharding_ptr = absl::make_unique<HloSharding>(
         HloSharding::FromProto(*exit_sharding).ValueOrDie());
     domain->entry_metadata =
-        MakeUnique<ShardingMetadata>(std::move(entry_sharding_ptr));
+        absl::make_unique<ShardingMetadata>(std::move(entry_sharding_ptr));
     domain->exit_metadata =
-        MakeUnique<ShardingMetadata>(std::move(exit_sharding_ptr));
+        absl::make_unique<ShardingMetadata>(std::move(exit_sharding_ptr));
   } else {
     return TokenError(StrCat("unsupported domain kind: ", *kind));
   }
@@ -1549,8 +1584,7 @@ bool HloParser::ParseInstructionNames(
     if (!ParseName(&name)) {
       return Error(loc, "expects a instruction name");
     }
-    std::pair<HloInstruction*, LocTy>* instr =
-        tensorflow::gtl::FindOrNull(instruction_pool_, name);
+    std::pair<HloInstruction*, LocTy>* instr = FindInstruction(name);
     if (!instr) {
       return TokenError(
           Printf("instruction '%s' is not defined", name.c_str()));
@@ -1782,10 +1816,10 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr<Literal>* literal,
     std::vector<tensorflow::int64> elems_seen_until_dim(
         elems_seen_per_dim.begin(), elems_seen_per_dim.begin() + dim);
     return StrCat("[",
-                  Join(elems_seen_until_dim, ",",
-                       [](string* out, const tensorflow::int64& num_elems) {
-                         StrAppend(out, num_elems - 1);
-                       }),
+                  StrJoin(elems_seen_until_dim, ",",
+                          [](string* out, const tensorflow::int64& num_elems) {
+                            StrAppend(out, num_elems - 1);
+                          }),
                   "]");
   };
   do {
@@ -1938,7 +1972,7 @@ bool HloParser::ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
 
   tensorflow::int64 rank = ShapeUtil::Rank(shape);
 
-  *literal = MakeUnique<Literal>(shape);
+  *literal = absl::make_unique<Literal>(shape);
 
   if (!ParseToken(TokKind::kLbrace,
                   "expects '{' at the beginning of a sparse literal")) {
@@ -1972,7 +2006,7 @@ bool HloParser::ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
         return Error(
             index_loc,
             StrCat("invalid multi-dimension index for shape with rank ", rank,
-                   ": [", Join(index, ", "), "]"));
+                   ": [", StrJoin(index, ", "), "]"));
       }
     }
     if (!ParseToken(TokKind::kColon,
@@ -2033,6 +2067,7 @@ bool HloParser::ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
 //   ::= operand (, operand)*
 // operand ::= (shape)? name
 bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
+  CHECK(operands != nullptr);
   if (!ParseToken(TokKind::kLparen,
                   "expects '(' at the beginning of operands")) {
     return false;
@@ -2043,9 +2078,10 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
     do {
       LocTy loc = lexer_.GetLoc();
       string name;
+      optional<Shape> shape;
       if (CanBeShape()) {
-        Shape shape;
-        if (!ParseShape(&shape)) {
+        shape.emplace();
+        if (!ParseShape(&shape.value())) {
           return false;
         }
       }
@@ -2053,8 +2089,8 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
         return false;
       }
       std::pair<HloInstruction*, LocTy>* instruction =
-          tensorflow::gtl::FindOrNull(instruction_pool_, name);
-      if (!instruction) {
+          FindInstruction(name, shape);
+      if (instruction == nullptr) {
         return Error(loc, StrCat("instruction does not exist: ", name));
       }
       operands->push_back(instruction->first);
@@ -2065,6 +2101,7 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
 
 bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands,
                               const int expected_size) {
+  CHECK(operands != nullptr);
   LocTy loc = lexer_.GetLoc();
   if (!ParseOperands(operands)) {
     return false;
@@ -2146,10 +2183,10 @@ bool HloParser::ParseAttributeHelper(
     } else {
       allowed_attrs = StrCat(
           "Allowed attributes: ",
-          Join(attrs, ", ",
-               [&](string* out, const std::pair<string, AttrConfig>& kv) {
-                 StrAppend(out, kv.first);
-               }));
+          StrJoin(attrs, ", ",
+                  [&](string* out, const std::pair<string, AttrConfig>& kv) {
+                    StrAppend(out, kv.first);
+                  }));
     }
     return Error(loc, Printf("unexpected attribute \"%s\".  %s", name.c_str(),
                              allowed_attrs.c_str()));
@@ -2334,6 +2371,16 @@ bool HloParser::ParseAttributeHelper(
       case AttrTy::kDomain: {
         return ParseDomain(static_cast<DomainData*>(attr_out_ptr));
       }
+      case AttrTy::kPrecisionList: {
+        std::vector<PrecisionConfigProto::Precision> result;
+        if (!ParsePrecisionList(&result)) {
+          return false;
+        }
+        static_cast<optional<std::vector<PrecisionConfigProto::Precision>>*>(
+            attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
     }
   }();
   if (!success) {
@@ -2452,20 +2499,24 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   }
   string str = lexer_.GetStrVal();
 
-  // The str is expected to have 3 items, lhs, rhs, out, and it must looks like
+  // The str is expected to have 3 items, lhs, rhs, out, and it must look like
   // lhs_rhs->out, that is, the first separator is "_" and the second is "->".
-  // So we replace the "->" with "_" and then split on "_".
-  str = tensorflow::str_util::StringReplace(str, /*oldsub=*/"->",
-                                            /*newsub=*/"_",
-                                            /*replace_all=*/false);
-  std::vector<string> lhs_rhs_out = Split(str, "_");
-  if (lhs_rhs_out.size() != 3) {
+  std::vector<string> split1 = absl::StrSplit(str, "_");
+  if (split1.size() != 2) {
+    LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees "
+               << str;
+  }
+  std::vector<string> split2 = absl::StrSplit(split1[1], "->");
+  if (split2.size() != 2) {
     LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees "
                << str;
   }
+  absl::string_view lhs = split1[0];
+  absl::string_view rhs = split2[0];
+  absl::string_view out = split2[1];
 
-  const tensorflow::int64 rank = lhs_rhs_out[0].length();
-  if (rank != lhs_rhs_out[1].length() || rank != lhs_rhs_out[2].length()) {
+  const tensorflow::int64 rank = lhs.length();
+  if (rank != rhs.length() || rank != out.length()) {
     return TokenError(
         "convolution lhs, rhs, and output must have the same rank");
   }
@@ -2480,8 +2531,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
 
   // lhs
   {
-    const string& lhs = lhs_rhs_out[0];
-    if (!is_unique(lhs)) {
+    if (!is_unique(string(lhs))) {
       return TokenError(
           StrCat("expects unique lhs dimension numbers, but sees ", lhs));
     }
@@ -2504,8 +2554,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   }
   // rhs
   {
-    const string& rhs = lhs_rhs_out[1];
-    if (!is_unique(rhs)) {
+    if (!is_unique(string(rhs))) {
       return TokenError(
           StrCat("expects unique rhs dimension numbers, but sees ", rhs));
     }
@@ -2528,8 +2577,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   }
   // output
   {
-    const string& out = lhs_rhs_out[2];
-    if (!is_unique(out)) {
+    if (!is_unique(string(out))) {
       return TokenError(
           StrCat("expects unique output dimension numbers, but sees ", out));
     }
@@ -2606,6 +2654,24 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) {
   return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
 }
 
+// precisionlist ::= start precision_elements end
+// precision_elements
+//   ::= /*empty*/
+//   ::= precision_val (delim precision_val)*
+bool HloParser::ParsePrecisionList(
+    std::vector<PrecisionConfigProto::Precision>* result) {
+  auto parse_and_add_item = [&]() {
+    PrecisionConfigProto::Precision item;
+    if (!ParsePrecision(&item)) {
+      return false;
+    }
+    result->push_back(item);
+    return true;
+  };
+  return ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
+                   parse_and_add_item);
+}
+
 // int64list ::= start int64_elements end
 // int64_elements
 //   ::= /*empty*/
@@ -2777,7 +2843,7 @@ bool HloParser::ParseDxD(const string& name,
   // 2D or higher.
   if (lexer_.GetKind() == TokKind::kDxD) {
     string str = lexer_.GetStrVal();
-    if (!SplitAndParseAsInts(str, 'x', result)) {
+    if (!SplitToInt64s(str, 'x', result)) {
       return Error(loc,
                    Printf("expects sub-attribute '%s=ixj...'", name.c_str()));
     }
@@ -2797,10 +2863,9 @@ bool HloParser::ParseWindowPad(
     return TokenError("expects window pad pattern, e.g., '0_0x3_3'");
   }
   string str = lexer_.GetStrVal();
-  std::vector<string> padding_str = Split(str, 'x');
-  for (int i = 0; i < padding_str.size(); i++) {
+  for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) {
     std::vector<tensorflow::int64> low_high;
-    if (!SplitAndParseAsInts(padding_str[i], '_', &low_high) ||
+    if (!SplitToInt64s(padding_dim_str, '_', &low_high) ||
         low_high.size() != 2) {
       return Error(loc,
                    "expects padding_low and padding_high separated by '_'");
@@ -2821,10 +2886,9 @@ bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
   }
   LocTy loc = lexer_.GetLoc();
   string str = lexer_.GetStrVal();
-  std::vector<string> padding_str = Split(str, 'x');
-  for (const auto& padding_dim_str : padding_str) {
+  for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) {
     std::vector<tensorflow::int64> padding_dim;
-    if (!SplitAndParseAsInts(padding_dim_str, '_', &padding_dim) ||
+    if (!SplitToInt64s(padding_dim_str, '_', &padding_dim) ||
         (padding_dim.size() != 2 && padding_dim.size() != 3)) {
       return Error(loc,
                    "expects padding config pattern like 'low_high_interior' or "
@@ -2932,6 +2996,23 @@ bool HloParser::ParseRandomDistribution(RandomDistribution* result) {
   return true;
 }
 
+bool HloParser::ParsePrecision(PrecisionConfigProto::Precision* result) {
+  VLOG(1) << "ParsePrecision";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects random distribution");
+  }
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToPrecision(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects precision but sees: %s, error: %s", val.c_str(),
+               status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
+  lexer_.Lex();
+  return true;
+}
+
 bool HloParser::ParseInt64(tensorflow::int64* result) {
   VLOG(1) << "ParseInt64";
   if (lexer_.GetKind() != TokKind::kInt) {
@@ -3053,10 +3134,44 @@ HloParser::ParseConvolutionDimensionNumbersOnly() {
   return dnums;
 }
 
+Status HloParser::ParseSingleInstruction(HloComputation::Builder* builder,
+                                         string* root_name) {
+  TF_RET_CHECK(missing_instruction_hook_ == nullptr);
+
+  // The missing instruction hook we register creates the shaped instruction on
+  // the fly as a parameter and returns it.
+  int64 parameter_count = 0;
+  missing_instruction_hook_ =
+      [this, builder, &parameter_count](
+          string name,
+          const optional<Shape>& shape) -> std::pair<HloInstruction*, LocTy>* {
+    if (!shape.has_value()) {
+      Error(lexer_.GetLoc(),
+            StrCat("Operand ", name,
+                   " had no shape in HLO text; cannot create parameter for "
+                   "single-instruction module."));
+      return nullptr;
+    }
+    HloInstruction* parameter = builder->AddInstruction(
+        HloInstruction::CreateParameter(parameter_count++, *shape, name));
+    instruction_pool_[name] = {parameter, lexer_.GetLoc()};
+    return tensorflow::gtl::FindOrNull(instruction_pool_, name);
+  };
+
+  // Prime the lexer.
+  lexer_.Lex();
+
+  // Parse the instruction with the registered hook.
+  if (!ParseInstruction(builder, root_name)) {
+    return InvalidArgument("Syntax error:\n%s", GetError().c_str());
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(
-    tensorflow::StringPiece str, const HloModuleConfig& config) {
+    absl::string_view str, const HloModuleConfig& config) {
   HloParser parser(str, config);
   if (!parser.Run()) {
     return InvalidArgument("Syntax error:\n%s", parser.GetError().c_str());
@@ -3064,26 +3179,38 @@ StatusOr<std::unique_ptr<HloModule>> ParseHloString(
   return parser.ConsumeHloModule();
 }
 
-StatusOr<std::unique_ptr<HloModule>> ParseHloString(
-    tensorflow::StringPiece str) {
+StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str) {
   HloModuleConfig config;
   return ParseHloString(str, config);
 }
 
-StatusOr<HloSharding> ParseSharding(tensorflow::StringPiece str) {
+StatusOr<std::unique_ptr<HloModule>> ParseHloOpToModule(
+    absl::string_view str, absl::string_view name) {
+  HloModuleConfig config;
+  HloParser parser(str, config);
+  auto builder = absl::make_unique<HloComputation::Builder>(string(name));
+  string root_name;
+  TF_RETURN_IF_ERROR(parser.ParseSingleInstruction(builder.get(), &root_name));
+  std::unique_ptr<HloComputation> computation = builder->Build();
+  auto module = absl::make_unique<HloModule>(string(name), config);
+  module->AddEntryComputation(std::move(computation));
+  return std::move(module);
+}
+
+StatusOr<HloSharding> ParseSharding(absl::string_view str) {
   HloModuleConfig config;
   HloParser parser(str, config);
   return parser.ParseShardingOnly();
 }
 
-StatusOr<Window> ParseWindow(tensorflow::StringPiece str) {
+StatusOr<Window> ParseWindow(absl::string_view str) {
   HloModuleConfig config;
   HloParser parser(str, config);
   return parser.ParseWindowOnly();
 }
 
 StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
-    tensorflow::StringPiece str) {
+    absl::string_view str) {
   HloModuleConfig config;
   HloParser parser(str, config);
   return parser.ParseConvolutionDimensionNumbersOnly();
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index 3f3a51215e..0c64b50481 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -16,7 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_lexer.h"
@@ -32,27 +33,31 @@ namespace xla {
 // The api of the hlo parser. Given a string in the HloModule::ToString()
 // format, parses the string and creates a HloModule with the given config.
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(
-    tensorflow::StringPiece str, const HloModuleConfig& config);
+    absl::string_view str, const HloModuleConfig& config);
+
+// Parses the text for a single HLO operation into an HLO module with a function
+// that runs that operation (with the same parameters) as its entry computation.
+StatusOr<std::unique_ptr<HloModule>> ParseHloOpToModule(
+    absl::string_view str, absl::string_view name = "single_op");
 
 // The api of the hlo parser. Given a string in the HloModule::ToString()
 // format, parses the string and creates a HloModule with default config.
-StatusOr<std::unique_ptr<HloModule>> ParseHloString(
-    tensorflow::StringPiece str);
+StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str);
 
 // Parses the result of HloSharding::ToString(), e.g. "{replicated}".
-StatusOr<HloSharding> ParseSharding(tensorflow::StringPiece str);
+StatusOr<HloSharding> ParseSharding(absl::string_view str);
 
 // Parses the result of window_util::ToString(const Window&).
-StatusOr<Window> ParseWindow(tensorflow::StringPiece str);
+StatusOr<Window> ParseWindow(absl::string_view str);
 
 // Parses the result of ConvolutionDimensionNumbersToString(), e.g.
 // "b0f_0io->b0f".
 StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
-    tensorflow::StringPiece str);
+    absl::string_view str);
 
 // ParseHloString sharding from str. str is supposed to contain the body of the
 // sharding, i.e. just the rhs of the "sharding={...}" attribute string.
-StatusOr<HloSharding> ParseSharding(tensorflow::StringPiece str);
+StatusOr<HloSharding> ParseSharding(absl::string_view str);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 5990a3d478..b3d3ccda74 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -16,17 +16,19 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 
 #include <string>
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
-
 namespace {
 
-using ::tensorflow::StringPiece;
+namespace op = ::xla::testing::opcode_matchers;
+using absl::string_view;
 
 struct TestData {
   string test_name;
@@ -380,7 +382,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
   %input = f32[1,2,1]{2,1,0} parameter(0)
   %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
   %filter = f32[1,1,1]{2,1,0} parameter(1)
-  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f, feature_group_count=1
 }
 
 )"
@@ -393,7 +395,7 @@ R"(HloModule ConvolveR2_module
 ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] {
   %input = f32[1,2]{1,0} parameter(0)
   %filter = f32[1,1]{1,0} parameter(1)
-  ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf
+  ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf, feature_group_count=1
 }
 
 )"
@@ -406,7 +408,7 @@ R"(HloModule ConvolveBackward_module
 ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f32[128,14,14,512] {
   %input = f32[128,7,7,512]{0,3,2,1} parameter(0)
   %filter = f32[3,3,512,512]{3,2,1,0} parameter(1)
-  ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f
+  ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f, feature_group_count=1
 }
 
 )"
@@ -752,10 +754,10 @@ ENTRY %sparse_f32_r1 () -> f32[9] {
 "gather",
 R"(HloModule StringifyGather
 
-ENTRY %Gather (input_tensor: f32[50,49,48,47,46], gather_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] {
+ENTRY %Gather (input_tensor: f32[50,49,48,47,46], start_indices: s64[10,9,8,7,5]) -> f32[10,9,8,7,30,29,28,27,26] {
   %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0)
-  %gather_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
-  ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26}
+  %start_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
+  ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %start_indices), offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, start_index_map={0,1,2,3,4}, index_vector_dim=4, slice_sizes={30,29,28,27,26}
 }
 
 )"
@@ -1030,8 +1032,8 @@ R"(HloModule gather
 
 ENTRY Gather {
   input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0)
-  gather_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
-  ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26}
+  start_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1)
+  ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, start_indices), offset_dims={4,5,6,7,8}, collapsed_slice_dims={}, start_index_map={0,1,2,3,4}, index_vector_dim=4, slice_sizes={30,29,28,27,26}
 }
 
 )"
@@ -1049,7 +1051,7 @@ add {
 
 ENTRY CRS {
   input = f32[8]{0} parameter(0)
-  ROOT crs = f32[8]{0} cross-replica-sum(input), replica_group_ids={}, to_apply=add
+  ROOT crs = f32[8]{0} cross-replica-sum(input), replica_groups={}, to_apply=add
 }
 
 )"
@@ -1067,7 +1069,7 @@ add {
 
 ENTRY CrossReplicaSumWithSubgroups {
   input = f32[128,32]{0,1} parameter(0)
-  ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_group_ids={0,0,1,1}, barrier="abc", to_apply=add
+  ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_groups={{0,1},{2,3}}, barrier="abc", to_apply=add
 }
 
 )"
@@ -1091,7 +1093,7 @@ R"(HloModule AllToAllWithSubgroups
 
 ENTRY AllToAllWithSubgroups {
   input = f32[128,32]{0,1} parameter(0)
-  ROOT a2a = f32[128,32]{0,1} all-to-all(input), replica_groups={{1,2},{3,0}}, barrier="abc"
+  ROOT a2a = f32[128,32]{0,1} all-to-all(input), replica_groups={{1,2},{3,0}}
 }
 
 )"
@@ -1125,8 +1127,8 @@ ENTRY Computation {
 class HloParserTest : public ::testing::Test,
                       public ::testing::WithParamInterface<TestData> {
  protected:
-  static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-    EXPECT_TRUE(tensorflow::str_util::StrContains(s, expected))
+  static void ExpectHasSubstr(string_view s, string_view expected) {
+    EXPECT_TRUE(absl::StrContains(s, expected))
         << "'" << s << "' does not contain '" << expected << "'";
   }
 
@@ -1370,7 +1372,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
   %input = f32[1,2,1]{2,1,0} parameter(0)
   %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
   %filter = f32[1,1,1]{2,1,0} parameter(1)
-  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, backend_config="foo", dim_labels=b0f_0io->b0f, window={pad=1_1 size=2}
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), feature_group_count=1, sharding={maximal device=1}, backend_config="foo", dim_labels=b0f_0io->b0f, window={pad=1_1 size=2}
 }
 
 )";
@@ -1390,15 +1392,14 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 
 )";
 
-  ExpectHasSubstr(ParseHloString(tensorflow::strings::StrCat(
-                                     prefix, ",dim_labels=00_01_10", suffix))
-                      .status()
-                      .error_message(),
-                  "expects dim labels pattern");
+  ExpectHasSubstr(
+      ParseHloString(absl::StrCat(prefix, ",dim_labels=00_01_10", suffix))
+          .status()
+          .error_message(),
+      "expects dim labels pattern");
 
   ExpectHasSubstr(
-      ParseHloString(tensorflow::strings::StrCat(
-                         prefix, ",dim_labels=010_1100->010", suffix))
+      ParseHloString(absl::StrCat(prefix, ",dim_labels=010_1100->010", suffix))
           .status()
           .error_message(),
       "must have the same rank");
@@ -1722,5 +1723,26 @@ ENTRY nontuple_infeed {
                   "infeed must have a non-empty tuple shape");
 }
 
+TEST(HloParserSingleOpTest, SingleOp) {
+  const string text =
+      "%multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, "
+      "f32[2,4]{1,0} %x)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloOpToModule(text));
+  const HloComputation* computation = module->entry_computation();
+  ASSERT_NE(computation, nullptr);
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Parameter(0), op::Parameter(1)));
+}
+
+TEST(HloParserSingleOpTest, SingleOpNoShapesProducesError) {
+  const string text = "%multiply = f32[2,4]{1,0} multiply(%broadcast, %x)";
+  StatusOr<std::unique_ptr<HloModule>> module = ParseHloOpToModule(text);
+  ASSERT_TRUE(!module.status().ok());
+  LOG(INFO) << "Status: " << module.status();
+  EXPECT_THAT(
+      module.status().ToString(),
+      ::testing::HasSubstr("Operand broadcast had no shape in HLO text"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h
index 28194deb0e..791b1a97b0 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_fix.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h
@@ -45,7 +45,7 @@ class HloPassFix : public Pass {
       ++iteration_count;
       if (iteration_count == limit) {
         LOG(ERROR)
-            << "Unexpectedly number of iterations in HLO passes ("
+            << "Unexpectedly high number of iterations in HLO passes ("
             << iteration_count
             << ")\nIf compilation hangs here, please file a bug with XLA.";
       }
diff --git a/tensorflow/compiler/xla/service/hlo_pass_interface.h b/tensorflow/compiler/xla/service/hlo_pass_interface.h
index 0cddf8fb8f..f1ad0f9b01 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_interface.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_interface.h
@@ -29,7 +29,7 @@ namespace xla {
 class HloPassInterface {
  public:
   virtual ~HloPassInterface() = default;
-  virtual tensorflow::StringPiece name() const = 0;
+  virtual absl::string_view name() const = 0;
 
   // Run the pass on the given HLO module.  Return whether it modified the
   // module.
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index d8f1ab916b..df99e131d8 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -17,22 +17,22 @@ limitations under the License.
 
 #include <functional>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
-
 namespace xla {
-
 namespace {
+
+using absl::StrAppend;
+using absl::StrCat;
+
 void DumpModuleGraph(const HloModule& module, const string& message) {
   hlo_graph_dumper::MaybeDumpHloModule(module, message);
   VLOG(3) << "HLO " << message << ":";
@@ -68,7 +68,7 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
                                                    repeated_field.end());
   if (!disabled_passes.empty()) {
     VLOG(1) << "Passes disabled by --xla_disable_hlo_passes: "
-            << tensorflow::str_util::Join(disabled_passes, ", ");
+            << absl::StrJoin(disabled_passes, ", ");
   }
 
   auto run_invariant_checkers = [this,
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
index a42d7e59fe..1d41a4dac1 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,7 +34,7 @@ namespace xla {
 class HloPassPipeline : public HloPassInterface {
  public:
   explicit HloPassPipeline(const string& name) : name_(name) {}
-  tensorflow::StringPiece name() const override { return name_; }
+  absl::string_view name() const override { return name_; }
 
   // Add a pass to the pipeline. It should be called with the arguments for the
   // pass constructor:
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util_test.cc b/tensorflow/compiler/xla/service/hlo_proto_util_test.cc
index b9cca13870..c3cacd7ce6 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_proto_util_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index cf0be30c7a..6c6e7c6fec 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -20,6 +20,9 @@ limitations under the License.
 #include <set>
 #include <string>
 
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
@@ -37,17 +40,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
-using ::tensorflow::strings::HumanReadableNumBytes;
-
 namespace xla {
-
 namespace {
 
+using ::tensorflow::strings::HumanReadableNumBytes;
+
 // Potential optimizations:
 // . TODO(b/35244891): Avoid N^2 behavior by keeping a priority queue
 //   of candidates.
@@ -88,7 +88,7 @@ bool CanBeRematerialized(
 
 // Type holding a unique identifier for each Buffer object.
 using BufferId = int64;
-using BufferIdList = tensorflow::gtl::InlinedVector<BufferId, 3>;
+using BufferIdList = absl::InlinedVector<BufferId, 3>;
 
 // We wrap HloInstruction* with an Item that holds auxiliary
 // per-instruction state.
@@ -123,7 +123,7 @@ struct Item {
   int64 position;
 };
 
-using ItemList = tensorflow::gtl::InlinedVector<Item*, 3>;
+using ItemList = absl::InlinedVector<Item*, 3>;
 
 // Class which maintains an ordered list of instructions with fast insertion
 // before arbitrary elements.
@@ -206,11 +206,10 @@ class InstructionList {
       Item* to_insert, tensorflow::gtl::ArraySlice<Item*> before_instructions) {
     VLOG(3) << "InsertBeforeInstructions: " << to_insert->instruction->name()
             << " before {"
-            << tensorflow::str_util::Join(before_instructions, ", ",
-                                          [](string* out, Item* item) {
-                                            tensorflow::strings::StrAppend(
-                                                out, item->instruction->name());
-                                          })
+            << absl::StrJoin(before_instructions, ", ",
+                             [](string* out, Item* item) {
+                               absl::StrAppend(out, item->instruction->name());
+                             })
             << "}";
 
     // Find the minimal position number of any instruction in
@@ -393,10 +392,9 @@ class MemoryUsageTracker {
     int64 unfinished_user_count;
 
     string ToString() const {
-      return tensorflow::strings::StrCat(
-          "Buffer ", id, " (defined by ",
-          defining_instruction->instruction->name(), ", size ", size,
-          " bytes)");
+      return absl::StrCat("Buffer ", id, " (defined by ",
+                          defining_instruction->instruction->name(), ", size ",
+                          size, " bytes)");
     }
   };
 
@@ -740,29 +738,27 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item,
 }
 
 string MemoryUsageTracker::ToString() const {
-  string output = tensorflow::strings::StrCat("MemoryUsageTracker for ",
-                                              computation_->name(), "\n");
-  tensorflow::strings::StrAppend(
-      &output, "Memory usage: ", HumanReadableNumBytes(memory_usage()), " (",
-      memory_usage(), " bytes)");
+  string output =
+      absl::StrCat("MemoryUsageTracker for ", computation_->name(), "\n");
+  absl::StrAppend(&output,
+                  "Memory usage: ", HumanReadableNumBytes(memory_usage()), " (",
+                  memory_usage(), " bytes)");
   for (auto* item = instruction_list_.first(); item != nullptr;
        item = instruction_list_.next(item)) {
     const HloInstruction* instruction = item->instruction;
     string inprogress = item == in_progress_item_ ? " in-progress" : "";
     string placed = item->placed ? " placed" : "";
-    tensorflow::strings::StrAppend(&output, "  ", instruction->name(),
-                                   inprogress, placed, "\n    Defines:\n");
+    absl::StrAppend(&output, "  ", instruction->name(), inprogress, placed,
+                    "\n    Defines:\n");
     for (BufferId buffer_id : item->buffers_defined) {
       const Buffer& buffer = buffers_[buffer_id];
       string live = IsCurrentlyLive(buffer_id) ? " live" : "";
-      tensorflow::strings::StrAppend(&output, "      ", buffer.ToString(), live,
-                                     ", ", buffer.unfinished_user_count,
-                                     " unfinished uses\n");
+      absl::StrAppend(&output, "      ", buffer.ToString(), live, ", ",
+                      buffer.unfinished_user_count, " unfinished uses\n");
     }
-    tensorflow::strings::StrAppend(&output, "    Uses:\n");
+    absl::StrAppend(&output, "    Uses:\n");
     for (BufferId buffer_id : item->buffers_used) {
-      tensorflow::strings::StrAppend(&output, "      ",
-                                     buffers_[buffer_id].ToString(), "\n");
+      absl::StrAppend(&output, "      ", buffers_[buffer_id].ToString(), "\n");
     }
   }
   return output;
@@ -780,10 +776,9 @@ bool MemoryUsageTracker::Check() const {
     CHECK(elements_are_unique(defined_buffers))
         << "Instruction " << instruction->name()
         << " does not have unique defined buffers: "
-        << tensorflow::str_util::Join(
+        << absl::StrJoin(
                defined_buffers, ", ", [this](string* out, BufferId buffer_id) {
-                 tensorflow::strings::StrAppend(
-                     out, buffers_.at(buffer_id).ToString());
+                 absl::StrAppend(out, buffers_.at(buffer_id).ToString());
                });
 
     for (const Buffer& buffer : buffers_) {
@@ -803,10 +798,9 @@ bool MemoryUsageTracker::Check() const {
     CHECK(elements_are_unique(used_buffers))
         << "Instruction " << instruction->name()
         << " does not have unique used buffers: "
-        << tensorflow::str_util::Join(
+        << absl::StrJoin(
                used_buffers, ", ", [this](string* out, BufferId buffer_id) {
-                 tensorflow::strings::StrAppend(
-                     out, buffers_.at(buffer_id).ToString());
+                 absl::StrAppend(out, buffers_.at(buffer_id).ToString());
                });
   }
   for (const Buffer& buffer : buffers_) {
@@ -1209,6 +1203,49 @@ StatusOr<bool> HloRematerialization::Run(
 
   VLOG(1) << "HloRematerialization() with memory limit of "
           << HumanReadableNumBytes(memory_limit_bytes);
+  XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
+
+  // Create initial sequence of HLO instructions.
+  TF_ASSIGN_OR_RETURN(*sequence, ScheduleComputationsInModule(
+                                     *module,
+                                     [this](const BufferValue& buffer) {
+                                       return size_function_(buffer.shape());
+                                     },
+                                     scheduler_algorithm_));
+  if (copy_insertion) {
+    // We run a separate pass of copy elision here because the sequential
+    // ordering from the HLO schedule allows for more copies to be eliminated.
+    // TODO(b/80249101): Instead of a separate copy elision pass, use the
+    // ordering from the HLO schedule directly for copy insertion.
+
+    // First create a copy of the schedule which contains HloInstruction unique
+    // ids instead of HloInstruction*. This is necessary for updating the
+    // schedule below.
+    // TODO(b/113175018): Remove this when the HLO schedule is self-contained
+    // and can update itself.
+    tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>
+        id_sequence = ComputeIdSchedule(*sequence);
+
+    SequentialHloOrdering ordering(module, *sequence);
+    TF_RETURN_IF_ERROR(
+        copy_insertion->RemoveUnnecessaryCopies(ordering, module));
+
+    // RemoveUnnecessaryCopies only considers interference when determining
+    // whether it is legal to remove a copy. However, copies in the graph may be
+    // necessary for other reason such as preventing a constant from being live
+    // out of the graph. So run AddSpecialCaseCopies to re-insert these copies.
+    // TODO(b/80249101): Break copy insertion into several passes and run each
+    // one once in the regular HLO pipeline.
+    TF_RETURN_IF_ERROR(copy_insertion->AddSpecialCaseCopies(module));
+
+    // The passes above can add and remove copies, update the schedule to
+    // account for these transformations. Newly added instructions will be
+    // placed ASAP in the schedule.
+    TF_RETURN_IF_ERROR(UpdateSchedule(*module, id_sequence, sequence));
+
+    TF_DCHECK_OK(copy_insertion->VerifyNoLiveRangeInterference(
+        SequentialHloOrdering(module, *sequence), module));
+  }
 
   TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module));
 
@@ -1230,24 +1267,6 @@ StatusOr<bool> HloRematerialization::Run(
           << HumanReadableNumBytes(module_output_size)
           << "): " << HumanReadableNumBytes(adjusted_memory_limit_bytes);
 
-  XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
-  // Create initial sequence of HLO instructions.
-  TF_ASSIGN_OR_RETURN(*sequence, ScheduleComputationsInModule(
-                                     *module,
-                                     [this](const BufferValue& buffer) {
-                                       return size_function_(buffer.shape());
-                                     },
-                                     scheduler_algorithm_));
-  if (copy_insertion) {
-    // We run a separate pass of copy elision here because the sequential
-    // ordering from the HLO schedule allows for more copies to be eliminated.
-    // TODO(b/80249101): Instead of a separate copy elision pass, use the
-    // ordering from the HLO schedule directly for copy insertion.
-    SequentialHloOrdering ordering(module, *sequence);
-    TF_RETURN_IF_ERROR(
-        copy_insertion->RemoveUnnecessaryCopies(ordering, module));
-  }
-
   // Compute peak memory usage of all computations in the module called in a
   // sequential context.
   call_graph_ = CallGraph::Build(module);
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index b2725e2918..7bd8a4a544 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -19,9 +19,9 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -32,7 +32,7 @@ limitations under the License.
 namespace xla {
 
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
-HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string,
+HloRunner::CreateModuleFromString(const absl::string_view hlo_string,
                                   const DebugOptions& debug_options) {
   HloModuleConfig config;
   config.set_debug_options(debug_options);
@@ -233,7 +233,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
     int64 device = device_assignment(i, 0);
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                         backend().stream_executor(device));
-    streams.push_back(MakeUnique<se::Stream>(executor));
+    streams.push_back(absl::make_unique<se::Stream>(executor));
     streams.back()->Init();
     service_run_options.emplace_back(GetServiceRunOptionsForDevice(
         device, streams.back().get(), &device_assignment));
@@ -260,7 +260,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
     num_threads += options.num_replicas;
   }
   if (num_threads > 0) {
-    pool = MakeUnique<tensorflow::thread::ThreadPool>(
+    pool = absl::make_unique<tensorflow::thread::ThreadPool>(
         tensorflow::Env::Default(), "infeed_outfeed",
         /*num_threads=*/num_threads);
   }
@@ -291,7 +291,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
         VLOG(1) << "Starting outfeed on device " << device;
         for (int64 step = 1;
              options.infeed_steps < 0 || step <= options.infeed_steps; ++step) {
-          auto literal = MakeUnique<Literal>();
+          auto literal = absl::make_unique<Literal>();
           TF_CHECK_OK(backend().transfer_manager()->TransferLiteralFromOutfeed(
               executor, options.outfeed_shape, literal.get()));
           if (options.outfeed_values != nullptr) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index 65537f07f5..cfc519063e 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -87,8 +87,7 @@ class HloRunner {
   // Converts an HloModule from the given hlo textual IR string (in
   // HloModule::ToString format).
   static StatusOr<std::unique_ptr<HloModule>> CreateModuleFromString(
-      const tensorflow::StringPiece hlo_string,
-      const DebugOptions& debug_options);
+      const absl::string_view hlo_string, const DebugOptions& debug_options);
 
   // Reads the proto file in xla.HloProto format, creates and returns the
   // HloModule.
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
index 27cc5361cd..56b14f9fef 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 
 #include <map>
+#include <queue>
 #include <utility>
 #include <vector>
 
@@ -28,16 +29,15 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
-using ::tensorflow::strings::HumanReadableNumBytes;
-
 namespace xla {
-
 namespace {
 
+using ::tensorflow::strings::HumanReadableNumBytes;
+
 // Class implementing a list scheduler of HLO instructions which produces a
 // sequence which minimizes memory usage by preferring to schedule the node that
 // frees bigger buffer and defines smaller outputs.
@@ -582,4 +582,187 @@ StatusOr<std::vector<const HloInstruction*>> ScheduleOneComputation(
                                    size_function, nullptr, empty_map);
 }
 
+tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>
+ComputeIdSchedule(const SequentialHloOrdering::HloModuleSequence& sequence) {
+  tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>> id_sequence;
+  for (const auto& computation_sequence : sequence) {
+    for (const HloInstruction* instruction : computation_sequence.second) {
+      id_sequence[computation_sequence.first].push_back(
+          instruction->unique_id());
+    }
+  }
+  return id_sequence;
+}
+
+Status UpdateSchedule(
+    const HloModule& module,
+    const tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>&
+        id_sequence,
+    SequentialHloOrdering::HloModuleSequence* sequence) {
+  // Map from unique ID to HloInstruction pointer for instructions in the
+  // module.
+  tensorflow::gtl::FlatMap<int, const HloInstruction*> id_to_instruction;
+  // Set of all HloInstructions in the schedule.
+  tensorflow::gtl::FlatSet<int> ids_in_schedule;
+  std::vector<HloComputation*> nonfusion_computations =
+      module.MakeNonfusionComputations();
+  for (const HloComputation* computation : nonfusion_computations) {
+    for (const HloInstruction* instruction : computation->instructions()) {
+      TF_RET_CHECK(
+          id_to_instruction.insert({instruction->unique_id(), instruction})
+              .second);
+    }
+    for (int id : id_sequence.at(computation)) {
+      ids_in_schedule.insert(id);
+    }
+  }
+
+  // Map from HloInstruction X to newly added instructions (instruction is in
+  // module, but not in schedule) which use X. If an instruction is not in the
+  // map, then it has no users which are newly added instructions.
+  tensorflow::gtl::FlatMap<const HloInstruction*,
+                           std::vector<const HloInstruction*>>
+      new_instruction_uses;
+
+  // For each newly added instruction, this is the count of the instruction's
+  // operands that have not yet been scheduled. When this value reaches zero,
+  // then the instruction may be placed in the schedule.
+  tensorflow::gtl::FlatMap<const HloInstruction*, int>
+      unscheduled_operand_count;
+  // For each computation, this is the set of newly added instructions which
+  // have no operands. These must be handled specially and are added to the
+  // beginning of the schedule.
+  tensorflow::gtl::FlatMap<const HloComputation*,
+                           std::vector<const HloInstruction*>>
+      new_zero_operand_instructions;
+  for (const HloComputation* computation : nonfusion_computations) {
+    new_zero_operand_instructions[computation] = {};
+    for (const HloInstruction* instruction : computation->instructions()) {
+      if (ids_in_schedule.count(instruction->unique_id()) == 0) {
+        // This is a newly added instruction which is not in the schedule.
+        for (const HloInstruction* operand : instruction->operands()) {
+          new_instruction_uses[operand].push_back(instruction);
+        }
+        if (instruction->operands().empty()) {
+          new_zero_operand_instructions[computation].push_back(instruction);
+        }
+        unscheduled_operand_count[instruction] = instruction->operand_count();
+      }
+    }
+  }
+
+  // Update the schedule with the newly added instructions, and remove any
+  // instructions no longer in the graph.
+  for (const HloComputation* computation : nonfusion_computations) {
+    std::vector<const HloInstruction*> old_computation_sequence =
+        std::move(sequence->at(computation));
+    sequence->at(computation).clear();
+
+    // Create a worklist of newly added instructions which are ready to be added
+    // to the schedule. Initialize worklist with those that have zero operands.
+    std::queue<const HloInstruction*> worklist;
+    for (const HloInstruction* instruction :
+         new_zero_operand_instructions.at(computation)) {
+      worklist.push(instruction);
+    }
+
+    // Lambda which schedules all instructions on the worklist.
+    auto schedule_worklist = [&]() {
+      while (!worklist.empty()) {
+        const HloInstruction* instruction = worklist.front();
+        worklist.pop();
+        sequence->at(computation).push_back(instruction);
+        std::vector<const HloInstruction*>* new_users =
+            tensorflow::gtl::FindOrNull(new_instruction_uses, instruction);
+        if (new_users != nullptr) {
+          // This just-scheduled instruction has users which are newly added to
+          // the module. Update the number of unscheduled operands and push the
+          // newly added instruction to the worklist if it is ready to
+          // schedule.
+          for (const HloInstruction* new_user : *new_users) {
+            unscheduled_operand_count.at(new_user)--;
+            CHECK_GE(unscheduled_operand_count.at(new_user), 0);
+            if (unscheduled_operand_count.at(new_user) == 0) {
+              worklist.push(new_user);
+            }
+          }
+        }
+      }
+    };
+
+    schedule_worklist();
+    for (int id : id_sequence.at(computation)) {
+      auto it = id_to_instruction.find(id);
+      if (it == id_to_instruction.end()) {
+        // This instruction in the schedule is no longer in the module.
+        continue;
+      }
+      const HloInstruction* instruction = it->second;
+      worklist.push(instruction);
+      schedule_worklist();
+    }
+  }
+
+  TF_RETURN_IF_ERROR(VerifySchedule(module, *sequence));
+  return Status::OK();
+}
+
+Status VerifySchedule(
+    const HloModule& module,
+    const SequentialHloOrdering::HloModuleSequence& sequence) {
+  VLOG(2) << "VerifySchedule()";
+  XLA_VLOG_LINES(2, module.ToString());
+  VLOG(2) << sequence;
+
+  // Verify the set of computations in the sequence is exactly the set of
+  // computations in the module.
+  std::vector<HloComputation*> nonfusion_computations =
+      module.MakeNonfusionComputations();
+  TF_RET_CHECK(nonfusion_computations.size() == sequence.size());
+  tensorflow::gtl::FlatSet<const HloComputation*> computations_in_module(
+      module.computations().begin(), module.computations().end());
+  for (const auto& computation_sequence : sequence) {
+    TF_RET_CHECK(computations_in_module.count(computation_sequence.first) == 1);
+  }
+
+  // For each computation verify the set of instructions is the same and that
+  // each dependency and control edge is honored.
+  for (const HloComputation* computation : nonfusion_computations) {
+    tensorflow::gtl::FlatMap<const HloInstruction*, int> instruction_position;
+    int pos = 0;
+    for (const HloInstruction* instruction : sequence.at(computation)) {
+      TF_RET_CHECK(instruction_position.insert({instruction, pos}).second)
+          << "Instruction " << instruction->name()
+          << " appears more than once in the schedule";
+      pos++;
+    }
+
+    TF_RET_CHECK(instruction_position.size() ==
+                 computation->instruction_count());
+    for (const HloInstruction* instruction : computation->instructions()) {
+      TF_RET_CHECK(instruction_position.count(instruction) == 1)
+          << "Instruction " << instruction->name() << " is not in schedule";
+    }
+
+    for (const HloInstruction* instruction : computation->instructions()) {
+      for (const HloInstruction* operand : instruction->operands()) {
+        TF_RET_CHECK(instruction_position.at(operand) <
+                     instruction_position.at(instruction))
+            << "Instruction " << instruction->name()
+            << " is not scheduled after its operand " << operand->name();
+      }
+
+      for (const HloInstruction* pred : instruction->control_predecessors()) {
+        TF_RET_CHECK(instruction_position.at(pred) <
+                     instruction_position.at(instruction))
+            << "Instruction " << instruction->name()
+            << " is not scheduled after its control predecessor "
+            << pred->name();
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h
index 2b33ccc8bf..d06b8d9a5c 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.h
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.h
@@ -85,6 +85,43 @@ StatusOr<std::vector<const HloInstruction*>> ScheduleOneComputation(
     const HloComputation& computation,
     const LogicalBuffer::SizeFunction& size_function);
 
+// Transforms the given schedule such that it is (again) a valid schedule for
+// the module. This is used to update a schedule after the HLO module has been
+// transformed in some way. In general, the only transformations to the module
+// for which a schedule can be updated is the addition or removal of
+// instructions to/from the module. Updating the schedule after new dependencies
+// between existing instructions in the module is not supported and may result
+// in an error status returned.
+//
+// Instructions in the module which also exist in the given schedule will remain
+// in the same order in the updated schedule. Instructions which exist in the
+// module but not in the given schedule will be placed as early as possible in
+// the updated schedule.
+//
+// 'id_sequence' is a mirror of the given schedule 'sequence' but with
+// HloInstruction ids rather than HloInstruction pointers. This should be
+// constructed using ComputeIdSchedule below after the schedule is constructed
+// but before the HLO module is transformed.
+Status UpdateSchedule(
+    const HloModule& module,
+    const tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>&
+        id_sequence,
+    SequentialHloOrdering::HloModuleSequence* sequence);
+
+// Constructs a copy of the given schedule but with HloInstruction unique ids
+// rather than HloInstruction pointers. This is necessary for updating a
+// schedule as HloInstruction points in the schedule may become invalid if
+// instructions are removed from the module. Used by UpdateSchedule above..
+// TODO(b/113175018): Remove this function when HLO schedule is its own class.
+tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>
+ComputeIdSchedule(const SequentialHloOrdering::HloModuleSequence& sequence);
+
+// Verifies that the given schedule is valid for the given module. Specifically,
+// the schedule contains exactly the instructions in the module and every
+// dependency in the module is satisfied in the schedule.
+Status VerifySchedule(const HloModule& module,
+                      const SequentialHloOrdering::HloModuleSequence& sequence);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SCHEDULING_H_
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
index 9ec983c2bc..930801288a 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
@@ -28,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
@@ -244,9 +246,9 @@ TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) {
                     *entry_computation, sequence.at(entry_computation),
                     *points_to_analysis, size_fn)
                     .ValueOrDie());
-  // HeapSimulator accounts for subcomputations. The max mem doesn't change
-  // because the while body isn't live during the peak.
-  EXPECT_EQ(80, HeapSimulator::MinimumMemoryForComputation(
+  // HeapSimulator accounts for subcomputations. The output buffer is aliased,
+  // so we don't double count.
+  EXPECT_EQ(64, HeapSimulator::MinimumMemoryForComputation(
                     *entry_computation, sequence.at(entry_computation),
                     *points_to_analysis, size_fn, &memory_by_computation)
                     .ValueOrDie());
@@ -350,7 +352,6 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) {
 TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
   auto module = CreateNewModule();
   const Shape r1f32 = ShapeUtil::MakeShape(F32, {4});
-  const Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 4});
 
   // param != 0
   // Needs 17 bytes
@@ -408,12 +409,259 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
                     *entry_computation, sequence.at(entry_computation),
                     *points_to_analysis, size_fn)
                     .ValueOrDie());
-  // HeapSimulator accounts for subcomputations
-  EXPECT_EQ(33, HeapSimulator::MinimumMemoryForComputation(
+  // HeapSimulator accounts for subcomputations. Cond is the largest one.
+  // The output buffer of the while is aliased.
+  EXPECT_EQ(17, HeapSimulator::MinimumMemoryForComputation(
                     *entry_computation, sequence.at(entry_computation),
                     *points_to_analysis, size_fn, &memory_by_computation)
                     .ValueOrDie());
 }
 
+TEST_F(HloSchedulingTest, UpdateScheduleUnchangedModule) {
+  // Updating the schedule of an unchanged HLO module should not affect the
+  // schedule at all.
+  const string module_str = R"(
+HloModule UpdateScheduleUnchanged
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  c = f32[] constant(42.0)
+  sum = f32[] add(a, b)
+  neg = f32[] negate(c)
+  ROOT root = f32[] multiply(sum, neg)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      ScheduleComputationsInModule(*module, [](const BufferValue& buffer) {
+        return ShapeUtil::ByteSizeOf(buffer.shape());
+      }));
+  tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>
+      id_sequence = ComputeIdSchedule(sequence);
+  std::vector<const HloInstruction*> entry_schedule = sequence.begin()->second;
+
+  EXPECT_EQ(entry_schedule.size(), 6);
+
+  TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence));
+  TF_ASSERT_OK(VerifySchedule(*module, sequence));
+
+  EXPECT_EQ(entry_schedule, sequence.begin()->second);
+}
+
+TEST_F(HloSchedulingTest, UpdateScheduleWithNewInstructions) {
+  // Add some additional instructions to a module and verify the schedule can be
+  // updated.
+  const string module_str = R"(
+HloModule UpdateScheduleWithNewInstructions
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  c = f32[] constant(42.0)
+  sum = f32[] add(a, b)
+  neg = f32[] negate(c)
+  ROOT root = f32[] multiply(sum, neg)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      ScheduleComputationsInModule(*module, [](const BufferValue& buffer) {
+        return ShapeUtil::ByteSizeOf(buffer.shape());
+      }));
+  tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>
+      id_sequence = ComputeIdSchedule(sequence);
+
+  HloComputation* entry = module->entry_computation();
+  const Shape shape = entry->root_instruction()->shape();
+  HloInstruction* constant = entry->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
+  HloInstruction* sub = entry->AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kSubtract, constant, entry->root_instruction()));
+  entry->set_root_instruction(sub);
+
+  auto in_schedule = [&](const HloInstruction* hlo) {
+    return std::find(sequence.at(entry).begin(), sequence.at(entry).end(),
+                     hlo) != sequence.at(entry).end();
+  };
+
+  EXPECT_EQ(sequence.at(entry).size(), 6);
+  EXPECT_FALSE(in_schedule(constant));
+  EXPECT_FALSE(in_schedule(sub));
+
+  TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence));
+  TF_ASSERT_OK(VerifySchedule(*module, sequence));
+
+  EXPECT_EQ(sequence.at(entry).size(), 8);
+  EXPECT_TRUE(in_schedule(constant));
+  EXPECT_TRUE(in_schedule(sub));
+}
+
+TEST_F(HloSchedulingTest, UpdateScheduleWithAddedAndDeletedInstruction) {
+  // Add and delete some instructions from a module and verify that the schedule
+  // can be updated successfully.
+  const string module_str = R"(
+HloModule UpdateScheduleWithAddedAndDeletedInstruction
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  c = f32[] constant(42.0)
+  sum = f32[] add(a, b)
+  neg = f32[] negate(c)
+  ROOT root = f32[] multiply(sum, neg)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      ScheduleComputationsInModule(*module, [](const BufferValue& buffer) {
+        return ShapeUtil::ByteSizeOf(buffer.shape());
+      }));
+  tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>
+      id_sequence = ComputeIdSchedule(sequence);
+
+  // Set the entry root to some expression containing just a parameter and a
+  // constant.
+  HloComputation* entry = module->entry_computation();
+  HloInstruction* constant = entry->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
+  HloInstruction* new_root = entry->AddInstruction(
+      HloInstruction::CreateBinary(constant->shape(), HloOpcode::kSubtract,
+                                   constant, entry->parameter_instruction(0)));
+  entry->set_root_instruction(new_root);
+
+  // DCE should remove everything but the parameters and the newly added code.
+  HloDCE dce;
+  TF_ASSERT_OK(dce.Run(module.get()).status());
+
+  EXPECT_EQ(sequence.at(entry).size(), 6);
+
+  TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence));
+  TF_ASSERT_OK(VerifySchedule(*module, sequence));
+
+  EXPECT_EQ(sequence.at(entry).size(), 4);
+}
+
+TEST_F(HloSchedulingTest, UpdateScheduleWithCompletelyReplacedModule) {
+  // Completely replace a module with an entirely new set of instructions and
+  // verify that the schedule can be updated successfully.
+  const string module_str = R"(
+HloModule UpdateScheduleWithCompletelyReplacedModule
+
+ENTRY main {
+  a = f32[] constant(42.0)
+  b = f32[] constant(123.0)
+  ROOT sum = f32[] add(a, b)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      ScheduleComputationsInModule(*module, [](const BufferValue& buffer) {
+        return ShapeUtil::ByteSizeOf(buffer.shape());
+      }));
+  tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>
+      id_sequence = ComputeIdSchedule(sequence);
+
+  // Replace the entry computation with the negation of a constant.
+  HloComputation* entry = module->entry_computation();
+  HloInstruction* constant = entry->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  HloInstruction* new_root = entry->AddInstruction(HloInstruction::CreateUnary(
+      constant->shape(), HloOpcode::kNegate, constant));
+  entry->set_root_instruction(new_root);
+
+  // DCE the old instructions.
+  HloDCE dce;
+  TF_ASSERT_OK(dce.Run(module.get()).status());
+
+  EXPECT_EQ(sequence.at(entry).size(), 3);
+
+  TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence));
+  TF_ASSERT_OK(VerifySchedule(*module, sequence));
+
+  EXPECT_EQ(sequence.at(entry).size(), 2);
+}
+
+TEST_F(HloSchedulingTest, UpdateScheduleWithMultipleComputations) {
+  // Create changes to more than one computation in an HLO module and verify
+  // that the schedule can be updated.
+  const string module_str = R"(
+HloModule UpdateScheduleWithMultipleComputations
+
+%Body (param.1: (s32[], token[])) -> (s32[], token[]) {
+  %param.1 = (s32[], token[]) parameter(0)
+  %get-tuple-element.1 = s32[] get-tuple-element((s32[], token[]) %param.1), index=0
+  %constant.1 = s32[] constant(1)
+  %add = s32[] add(s32[] %get-tuple-element.1, s32[] %constant.1)
+  %get-tuple-element.2 = token[] get-tuple-element((s32[], token[]) %param.1), index=1
+  %after-all = token[] after-all(token[] %get-tuple-element.2)
+  ROOT %tuple = (s32[], token[]) tuple(s32[] %add, token[] %after-all)
+}
+
+%Cond (param: (s32[], token[])) -> pred[] {
+  %param = (s32[], token[]) parameter(0)
+  %get-tuple-element = s32[] get-tuple-element((s32[], token[]) %param), index=0
+  %constant = s32[] constant(42)
+  ROOT %less-than = pred[] less-than(s32[] %get-tuple-element, s32[] %constant)
+}
+
+ENTRY %WhileLoop () -> s32[] {
+  %zero = s32[] constant(0)
+  %init_token = token[] after-all()
+  %init_tuple = (s32[], token[]) tuple(s32[] %zero, token[] %init_token)
+  %while = (s32[], token[]) while((s32[], token[]) %init_tuple), condition=%Cond, body=%Body
+  ROOT %root = s32[] get-tuple-element((s32[], token[]) %while), index=0
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+  TF_ASSERT_OK_AND_ASSIGN(
+      SequentialHloOrdering::HloModuleSequence sequence,
+      ScheduleComputationsInModule(*module, [](const BufferValue& buffer) {
+        return ShapeUtil::ByteSizeOf(buffer.shape(),
+                                     /*pointer_size=*/sizeof(void*));
+      }));
+  tensorflow::gtl::FlatMap<const HloComputation*, std::vector<int>>
+      id_sequence = ComputeIdSchedule(sequence);
+
+  const HloInstruction* xla_while =
+      module->entry_computation()->root_instruction()->operand(0);
+  HloComputation* body = xla_while->while_body();
+  HloComputation* cond = xla_while->while_condition();
+
+  // Negate the root of the cond.
+  cond->set_root_instruction(cond->AddInstruction(
+      HloInstruction::CreateUnary(ShapeUtil::MakeShape(PRED, {}),
+                                  HloOpcode::kNot, cond->root_instruction())));
+
+  // Replace the body with a computation which just passes through its
+  // parameter.
+  body->set_root_instruction(body->parameter_instruction(0));
+
+  // DCE the dead code in the body.
+  HloDCE dce;
+  TF_ASSERT_OK(dce.Run(module.get()).status());
+
+  EXPECT_EQ(sequence.at(body).size(), 7);
+  EXPECT_EQ(sequence.at(cond).size(), 4);
+
+  TF_ASSERT_OK(UpdateSchedule(*module, id_sequence, &sequence));
+  TF_ASSERT_OK(VerifySchedule(*module, sequence));
+
+  EXPECT_EQ(sequence.at(body).size(), 1);
+  EXPECT_EQ(sequence.at(cond).size(), 5);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 879fb3bbab..980dae07ce 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -15,13 +15,14 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 
-using ::tensorflow::str_util::Join;
-using ::tensorflow::strings::StrCat;
+using absl::StrCat;
+using absl::StrJoin;
 
 HloSharding HloSharding::AssignDevice(int64 device_id) {
   return HloSharding(device_id);
@@ -71,12 +72,9 @@ HloSharding HloSharding::SingleTuple(const Shape& tuple_shape,
                                      const HloSharding& sharding) {
   CHECK(ShapeUtil::IsTuple(tuple_shape)) << ShapeUtil::HumanString(tuple_shape);
   CHECK(!sharding.IsTuple()) << sharding.ToString();
-  int64 leaf_count = ShapeUtil::GetLeafCount(tuple_shape);
+  int64 leaf_count = RequiredLeaves(tuple_shape);
   std::vector<HloSharding> flattened_list;
-  flattened_list.reserve(leaf_count);
-  for (int64 i = 0; i < leaf_count; ++i) {
-    flattened_list.push_back(sharding);
-  }
+  flattened_list.resize(leaf_count, sharding);
   return HloSharding(flattened_list);
 }
 
@@ -92,7 +90,7 @@ string HloSharding::ToString() const {
     for (const HloSharding& element : tuple_elements_) {
       parts.push_back(element.ToString());
     }
-    return StrCat("{", tensorflow::str_util::Join(parts, ", "), "}");
+    return StrCat("{", absl::StrJoin(parts, ", "), "}");
   }
 
   if (replicated_) {
@@ -101,8 +99,8 @@ string HloSharding::ToString() const {
     return StrCat(
         "{maximal device=", static_cast<int64>(*tile_assignment_.begin()), "}");
   } else {
-    return StrCat("{devices=[", Join(tile_assignment_.dimensions(), ","), "]",
-                  Join(tile_assignment_, ","), "}");
+    return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","),
+                  "]", StrJoin(tile_assignment_, ","), "}");
   }
 }
 
@@ -244,16 +242,16 @@ StatusOr<HloSharding> HloSharding::GetTupleSharding(const Shape& shape) const {
   return Tuple(ShapeTree<HloSharding>(shape, *this));
 }
 
-tensorflow::gtl::optional<int64> HloSharding::UniqueDevice() const {
+absl::optional<int64> HloSharding::UniqueDevice() const {
   if (IsTuple()) {
     if (tuple_elements_.empty()) {
-      return tensorflow::gtl::nullopt;
+      return absl::nullopt;
     }
-    tensorflow::gtl::optional<int64> unique_device;
+    absl::optional<int64> unique_device;
     for (auto& tuple_sharding : tuple_elements_) {
       auto device = tuple_sharding.UniqueDevice();
       if (!device || (unique_device && *device != *unique_device)) {
-        return tensorflow::gtl::nullopt;
+        return absl::nullopt;
       }
       unique_device = device;
     }
@@ -262,7 +260,7 @@ tensorflow::gtl::optional<int64> HloSharding::UniqueDevice() const {
   if (!replicated_ && maximal_) {
     return static_cast<int64>(*tile_assignment_.begin());
   }
-  return tensorflow::gtl::nullopt;
+  return absl::nullopt;
 }
 
 int64 HloSharding::GetUniqueDevice() const {
@@ -439,21 +437,20 @@ HloSharding HloSharding::GetSubSharding(const Shape& shape,
                                        : sub_shape_tree.element(ShapeIndex({}));
 }
 
-tensorflow::gtl::optional<HloSharding> HloSharding::ExtractSingleSharding()
-    const {
+absl::optional<HloSharding> HloSharding::ExtractSingleSharding() const {
   if (!IsTuple()) {
     return *this;
   }
   for (int64 i = 1; i < tuple_elements_.size(); ++i) {
     if (tuple_elements_[0] != tuple_elements_[i]) {
-      return tensorflow::gtl::optional<HloSharding>();
+      return absl::nullopt;
     }
   }
   return tuple_elements_.front();
 }
 
 size_t HloSharding::Hash() const {
-  if (!tuple_) {
+  if (tuple_) {
     size_t h = 0;
     for (const auto& element : tuple_elements_) {
       h = tensorflow::Hash64Combine(h, element.Hash());
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 894783e5d1..be51c3f55b 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -151,7 +151,7 @@ class HloSharding {
   // span a single device, the return value will be empty.
   // In order for a sharding to span a single device, every leaf sharding must
   // be maximal and not replicated, and the used device must match.
-  tensorflow::gtl::optional<int64> UniqueDevice() const;
+  absl::optional<int64> UniqueDevice() const;
 
   // Retrieves the unique device or fails with a CHECK.
   int64 GetUniqueDevice() const;
@@ -182,7 +182,7 @@ class HloSharding {
   // be returned. If it is a tuple, and all the tuple elements are common, the
   // common element will be returned. Otherwise the optional will contain no
   // value.
-  tensorflow::gtl::optional<HloSharding> ExtractSingleSharding() const;
+  absl::optional<HloSharding> ExtractSingleSharding() const;
 
   bool operator==(const HloSharding& other) const {
     return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
@@ -260,9 +260,9 @@ class HloSharding {
   bool maximal_;
   bool tuple_;
   Array<int64> tile_assignment_;
-  // Only non-empty when tuple_ is true, but because empty tuples are allowed
-  // may also be empty even then. This is a flattened list of all the leaf
-  // shardings in a tuple shape, by pre-order walk (ShapeTree iterator order).
+  // Only non-empty when tuple_ is true. If a tuple is empty then one entry is
+  // present for the root. This is a flattened list of all the leaf shardings in
+  // a tuple shape, by pre-order walk (ShapeTree iterator order).
   std::vector<HloSharding> tuple_elements_;
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
index a2c1d39d0d..a9b3b66934 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -117,13 +118,17 @@ Status FixupPassThroughDomainLinks(const DomainMetadata::Domain& domain,
   return Status::OK();
 }
 
-std::unique_ptr<HloSharding> CloneShardingForDomain(
-    const HloSharding& sharding) {
-  auto single_sharding = sharding.ExtractSingleSharding();
+// For tuple shardings if every element have the same sharsing then we want to
+// treat them as single element sharsings to insert less domain separation as a
+// domain can prevent some optimizations and we want to minimize that from
+// happening.
+std::shared_ptr<const HloSharding> CloneShardingForDomain(
+    std::shared_ptr<const HloSharding> sharding) {
+  auto single_sharding = sharding->ExtractSingleSharding();
   if (!single_sharding) {
-    return MakeUnique<HloSharding>(sharding);
+    return sharding;
   }
-  return MakeUnique<HloSharding>(*single_sharding);
+  return std::make_shared<const HloSharding>(*single_sharding);
 }
 
 Status ApplyDomainSingleSharding(const DomainMetadata::Domain& domain,
@@ -279,65 +284,18 @@ Status ApplyDomainSharding(const DomainMetadata::Domain& domain,
   return Status::OK();
 }
 
-// Creates a kDomain instruction to be placed between instruction and operand.
-// The kDomain instruction will be created only if the sharding differ between
-// the instruction and the operand.
-std::unique_ptr<HloInstruction> CreateDomain(HloInstruction* instruction,
-                                             HloInstruction* operand) {
-  const HloSharding* instruction_sharding =
-      instruction->has_sharding() ? &instruction->sharding() : nullptr;
-  const HloSharding* operand_sharding =
-      operand->has_sharding() ? &operand->sharding() : nullptr;
-  // No need for domain if they both have no sharding.
-  if (instruction_sharding == nullptr && operand_sharding == nullptr) {
-    return nullptr;
-  }
-  // No need for domain if they match.
-  if (instruction_sharding != nullptr && operand_sharding != nullptr &&
-      ShardingMatches(*instruction_sharding, *operand_sharding)) {
-    return nullptr;
-  }
-  std::unique_ptr<HloSharding> real_instruction_sharding;
-  std::unique_ptr<HloSharding> real_operand_sharding;
-  if (instruction_sharding != nullptr) {
-    real_instruction_sharding = CloneShardingForDomain(*instruction_sharding);
-  }
-  if (operand_sharding != nullptr) {
-    real_operand_sharding = CloneShardingForDomain(*operand_sharding);
-  }
-  VLOG(3) << "Creating domain:";
-  VLOG(3) << "  Instruction: " << instruction->name();
-  VLOG(3) << "  Operand: " << operand->name();
-  VLOG(3) << "    User side sharding: "
-          << (real_instruction_sharding != nullptr
-                  ? real_instruction_sharding->ToString()
-                  : "None");
-  VLOG(3) << "    Operand side sharding: "
-          << (real_operand_sharding != nullptr
-                  ? real_operand_sharding->ToString()
-                  : "None");
-
-  std::unique_ptr<DomainMetadata> operand_side_metadata =
-      MakeUnique<ShardingMetadata>(std::move(real_operand_sharding));
-  std::unique_ptr<DomainMetadata> user_side_metadata =
-      MakeUnique<ShardingMetadata>(std::move(real_instruction_sharding));
-  return HloInstruction::CreateDomain(operand->shape(), operand,
-                                      std::move(operand_side_metadata),
-                                      std::move(user_side_metadata));
-}
-
-StatusOr<std::unique_ptr<HloSharding>> ExtractOriginalCommonSharding(
+StatusOr<std::shared_ptr<const HloSharding>> ExtractOriginalCommonSharding(
     tensorflow::gtl::ArraySlice<HloInstruction*> instructions) {
   // If we are here, all the instructions being passed had the same sharding
   // (or no sharding), by the means of the ShardingMatches() API.
   // As such, no kDomain was inserted, and here we are asked to extract the
   // original common sharding.
   // All the instructions passed to this API are part of the same computation.
-  const HloSharding* sharding = nullptr;
+  std::shared_ptr<const HloSharding> sharding;
   for (HloInstruction* instruction : instructions) {
     if (instruction->has_sharding()) {
       if (sharding == nullptr) {
-        sharding = &instruction->sharding();
+        sharding = instruction->sharding_ptr();
       } else {
         TF_RET_CHECK(ShardingMatches(*sharding, instruction->sharding()))
             << "Sharding " << *sharding << " does not match the one in "
@@ -346,10 +304,10 @@ StatusOr<std::unique_ptr<HloSharding>> ExtractOriginalCommonSharding(
     }
   }
   if (sharding == nullptr) {
-    return std::unique_ptr<HloSharding>();
+    return std::shared_ptr<const HloSharding>();
   }
   VLOG(4) << "Extracted sharding is " << *sharding;
-  return CloneShardingForDomain(*sharding);
+  return CloneShardingForDomain(sharding);
 }
 
 }  // namespace
@@ -357,9 +315,9 @@ StatusOr<std::unique_ptr<HloSharding>> ExtractOriginalCommonSharding(
 std::unique_ptr<DomainMetadata> ShardingMetadata::Clone() const {
   std::unique_ptr<HloSharding> sharding;
   if (sharding_ != nullptr) {
-    sharding = MakeUnique<HloSharding>(*sharding_);
+    sharding = absl::make_unique<HloSharding>(*sharding_);
   }
-  return MakeUnique<ShardingMetadata>(std::move(sharding));
+  return absl::make_unique<ShardingMetadata>(std::move(sharding));
 }
 
 bool ShardingMetadata::Matches(const DomainMetadata& other) const {
@@ -403,7 +361,7 @@ Status ShardingMetadata::NormalizeShardingDomain(
       TF_RETURN_IF_ERROR(FixupPassThroughDomainLinks(domain, *sharding));
     }
   } else {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloSharding> sharding,
+    TF_ASSIGN_OR_RETURN(std::shared_ptr<const HloSharding> sharding,
                         ExtractOriginalCommonSharding(domain.instructions));
     if (sharding != nullptr) {
       VLOG(4) << "Normalizing sharding-less domain to " << sharding->ToString();
@@ -415,9 +373,75 @@ Status ShardingMetadata::NormalizeShardingDomain(
   return Status::OK();
 }
 
-std::unique_ptr<HloInstruction> CreateShardingDomain(
-    HloInstruction* instruction, HloInstruction* operand) {
-  return CreateDomain(instruction, operand);
+// Creates a kDomain instruction to be placed between instruction and operand.
+// The kDomain instruction will be created only if the sharding differ between
+// the instruction and the operand.
+HloInstruction* ShardingDomainCreator::operator()(HloInstruction* instruction,
+                                                  HloInstruction* root,
+                                                  HloInstruction* operand) {
+  auto instruction_sharding = instruction->sharding_ptr();
+  auto root_sharding = root->sharding_ptr();
+  // No need for domain if they both have no sharding.
+  if (instruction_sharding == nullptr && root_sharding == nullptr) {
+    return nullptr;
+  }
+  // No need for domain if they match.
+  if (instruction_sharding != nullptr && root_sharding != nullptr &&
+      ShardingMatches(*instruction_sharding, *root_sharding)) {
+    return nullptr;
+  }
+
+  if (instruction_sharding != nullptr) {
+    instruction_sharding = CloneShardingForDomain(instruction_sharding);
+  }
+  if (root_sharding != nullptr) {
+    root_sharding = CloneShardingForDomain(root_sharding);
+  }
+
+  auto it = domain_cse_map_.find({operand, instruction_sharding});
+  if (it != domain_cse_map_.end()) {
+    return it->second;
+  }
+
+  VLOG(3) << "Creating domain:";
+  VLOG(3) << "  Instruction: " << instruction->name();
+  VLOG(3) << "  Operand: " << operand->name();
+  VLOG(3) << "    User side sharding: "
+          << (instruction_sharding != nullptr ? instruction_sharding->ToString()
+                                              : "None");
+  VLOG(3) << "    Operand side sharding: "
+          << (root_sharding != nullptr ? root_sharding->ToString() : "None");
+
+  HloInstruction* domain =
+      operand->parent()->AddInstruction(HloInstruction::CreateDomain(
+          operand->shape(), operand,
+          absl::make_unique<ShardingMetadata>(root_sharding),
+          absl::make_unique<ShardingMetadata>(instruction_sharding)));
+  domain_cse_map_.emplace(DomainCseMapKey{operand, instruction_sharding},
+                          domain);
+  return domain;
+}
+
+bool ShardingDomainCreator::DomainCseMapKey::operator==(
+    const ShardingDomainCreator::DomainCseMapKey& other) const {
+  if (instruction != other.instruction) {
+    return false;
+  }
+  if (sharding == nullptr && other.sharding == nullptr) {
+    return true;
+  }
+  if (sharding == nullptr || other.sharding == nullptr) {
+    return false;
+  }
+  return *sharding == *other.sharding;
+}
+
+size_t ShardingDomainCreator::DomainCseMapHasher::operator()(
+    const ShardingDomainCreator::DomainCseMapKey& key) const {
+  return tensorflow::Hash64Combine(
+      std::hash<const HloInstruction*>{}(key.instruction),
+      key.sharding ? key.sharding->Hash()
+                   : static_cast<size_t>(0x297814aaad196e6dULL));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.h b/tensorflow/compiler/xla/service/hlo_sharding_metadata.h
index 5e01fc0e22..7a6b0d9abc 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.h
@@ -27,12 +27,12 @@ namespace xla {
 // A DomainMetadata implementation that internally wraps a sharding attribute.
 class ShardingMetadata : public DomainMetadata {
  public:
-  explicit ShardingMetadata(std::unique_ptr<HloSharding> sharding)
+  explicit ShardingMetadata(std::shared_ptr<const HloSharding> sharding)
       : sharding_(std::move(sharding)) {}
 
   std::unique_ptr<DomainMetadata> Clone() const override;
 
-  tensorflow::StringPiece Kind() const override { return KindName(); }
+  absl::string_view Kind() const override { return KindName(); }
 
   bool Matches(const DomainMetadata& other) const override;
 
@@ -40,7 +40,7 @@ class ShardingMetadata : public DomainMetadata {
 
   const HloSharding* sharding() const { return sharding_.get(); }
 
-  static tensorflow::StringPiece KindName() { return "sharding"; }
+  static absl::string_view KindName() { return "sharding"; }
 
   static StatusOr<const ShardingMetadata*> ToShardingMetadata(
       const DomainMetadata* metadata);
@@ -55,15 +55,33 @@ class ShardingMetadata : public DomainMetadata {
                                         const DomainMetadata* metadata);
 
  private:
-  std::unique_ptr<HloSharding> sharding_;
+  std::shared_ptr<const HloSharding> sharding_;
 };
 
-// Given an HLO graph edge between instruction and one of its operands, creates
-// a ShardingMetadata based kDomain instruction if the sharding between
-// instruction and operand changes. Returns nullptr if there is no need for a
-// domain separation.
-std::unique_ptr<HloInstruction> CreateShardingDomain(
-    HloInstruction* instruction, HloInstruction* operand);
+// If the sharding between root and instruction changes then returns a
+// ShardingMetadata based kDomain instruction what can be used to separate
+// operand and instruction.
+// Returns nullptr if there is no need for a domain separation.
+class ShardingDomainCreator {
+ public:
+  HloInstruction* operator()(HloInstruction* instruction, HloInstruction* root,
+                             HloInstruction* operand);
+
+ private:
+  // Map from instruction and user sharding to domain users to CSE identical
+  // domains.
+  struct DomainCseMapKey {
+    const HloInstruction* instruction;
+    std::shared_ptr<const HloSharding> sharding;
+
+    bool operator==(const DomainCseMapKey& other) const;
+  };
+  struct DomainCseMapHasher {
+    size_t operator()(const DomainCseMapKey& key) const;
+  };
+  std::unordered_map<DomainCseMapKey, HloInstruction*, DomainCseMapHasher>
+      domain_cse_map_;
+};
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 45fc300fca..2341f8ada0 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -115,6 +115,13 @@ TEST_F(HloShardingTest, Tile) {
   }
 }
 
+// Tests that empty tuple is supported.
+TEST_F(HloShardingTest, EmptySingleTuple) {
+  HloSharding sharding = HloSharding::SingleTuple(ShapeUtil::MakeTupleShape({}),
+                                                  HloSharding::AssignDevice(0));
+  EXPECT_TRUE(sharding.ExtractSingleSharding());
+}
+
 TEST_F(HloShardingTest, NestedTuple) {
   // nested_tuple_shape = (f32[], (f32[3]), f32[4, 6])
   Shape nested_tuple_shape = ShapeUtil::MakeTupleShape({
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification.h b/tensorflow/compiler/xla/service/hlo_subcomputation_unification.h
index 2ef38821af..d1cf644f82 100644
--- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification.h
+++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification.h
@@ -24,7 +24,7 @@ namespace xla {
 // one arbitrarily to use and delete the others.
 class HloSubcomputationUnification : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "subcomputation-unification";
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
index b78bfa0cdf..4876533449 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -21,28 +23,25 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-
-using ::tensorflow::GraphDef;
-using ::tensorflow::NodeDef;
-using ::tensorflow::TensorShapeProto;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
-using ::tensorflow::str_util::Join;
 
 namespace xla {
 namespace hlo_graph_dumper {
 namespace {
 
+using absl::StrAppend;
+using absl::StrCat;
+using tensorflow::GraphDef;
+using tensorflow::NodeDef;
+using tensorflow::TensorShapeProto;
+
 string GetOpDefName(const HloInstruction* instruction) {
   string name = StrCat("hlo-", HloOpcodeString(instruction->opcode()));
-  tensorflow::str_util::TitlecaseString(&name, "-");
+  tensorflow::str_util::TitlecaseString(&name, "-");  // non-absl ok
   name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
 
   if (instruction->opcode() == HloOpcode::kFusion) {
     string fusion_name = ToString(instruction->fusion_kind());
-    StrAppend(&name, tensorflow::StringPiece(fusion_name).substr(1));
+    StrAppend(&name, absl::string_view(fusion_name).substr(1));
   }
   return name;
 }
@@ -166,7 +165,9 @@ void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
       layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
     } else {
       layout_string = StrCat(
-          "{", Join(LayoutUtil::MinorToMajor(instruction->shape()), ","), "}");
+          "{",
+          absl::StrJoin(LayoutUtil::MinorToMajor(instruction->shape()), ","),
+          "}");
     }
     attrs["layout"].set_s(layout_string);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc
index 7fd99fc930..e0c1326177 100644
--- a/tensorflow/compiler/xla/service/hlo_value.cc
+++ b/tensorflow/compiler/xla/service/hlo_value.cc
@@ -18,8 +18,10 @@ limitations under the License.
 #include <algorithm>
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -30,16 +32,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-using ::tensorflow::str_util::Join;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 const Shape& HloPosition::shape() const {
   return ShapeUtil::GetSubshape(instruction->shape(), index);
@@ -216,10 +215,11 @@ void HloValueSet::SortAndUniquifyValues() {
 }
 
 string HloValueSet::ToString() const {
-  return StrCat("HloValueSet: ",
-                Join(values_, ", ", [](string* result, const HloValue* value) {
-                  result->append(value->ToShortString());
-                }));
+  return StrCat(
+      "HloValueSet: ",
+      absl::StrJoin(values_, ", ", [](string* result, const HloValue* value) {
+        result->append(value->ToShortString());
+      }));
 }
 
 bool HloValueSet::AssignUnionOf(
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index e7674f3ddd..f60c4eab42 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <set>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -84,7 +85,8 @@ Status ShapeVerifier::HandleConvolution(HloInstruction* convolution) {
       const Shape expected,
       ShapeInference::InferConvolveShape(
           convolution->operand(0)->shape(), convolution->operand(1)->shape(),
-          convolution->window(), convolution->convolution_dimension_numbers()));
+          convolution->window(), convolution->convolution_dimension_numbers(),
+          convolution->feature_group_count()));
   return CheckShape(convolution, expected);
 }
 
@@ -121,29 +123,26 @@ Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) {
                                           reduce_precision->mantissa_bits()));
 }
 
-namespace {
-
-Status CheckIsTokenOperand(const HloInstruction* instruction,
-                           int64 operand_no) {
+Status ShapeVerifier::CheckIsTokenOperand(const HloInstruction* instruction,
+                                          int64 operand_no) {
   const HloInstruction* token = instruction->operand(operand_no);
   if (!ShapeUtil::Equal(token->shape(), ShapeUtil::MakeTokenShape())) {
     return InternalError(
         "Expected operand %lld to be token-shaped, actual shape is "
         "%s:\n%s",
-        operand_no, ShapeUtil::HumanString(token->shape()).c_str(),
+        operand_no, StringifyShape(token->shape()).c_str(),
         instruction->ToString().c_str());
   }
   return Status::OK();
 }
 
-Status CheckOperandAndParameter(const HloInstruction* instruction,
-                                int64 operand_number,
-                                const HloComputation* computation,
-                                int64 parameter_number) {
+Status ShapeVerifier::CheckOperandAndParameter(
+    const HloInstruction* instruction, int64 operand_number,
+    const HloComputation* computation, int64 parameter_number) {
   const HloInstruction* operand = instruction->operand(operand_number);
   const HloInstruction* parameter =
       computation->parameter_instruction(parameter_number);
-  if (!ShapeUtil::Compatible(operand->shape(), parameter->shape())) {
+  if (!ShapesSame(operand->shape(), parameter->shape())) {
     return InternalError("Operand %s shape does not match parameter's %s in %s",
                          operand->ToString().c_str(),
                          parameter->ToString().c_str(),
@@ -152,15 +151,9 @@ Status CheckOperandAndParameter(const HloInstruction* instruction,
   return Status::OK();
 }
 
-}  // namespace
-
 Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
   HloInfeedInstruction* infeed = Cast<HloInfeedInstruction>(instruction);
-  // Infeed has an optional single token operand.
-  // TODO(b/80000000): Update when token is not optional.
-  if (infeed->operand_count() == 1) {
-    TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 0));
-  }
+  TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 0));
 
   // The output of infeed is a tuple containing the data value and a token.
   return CheckShape(infeed,
@@ -170,30 +163,21 @@ Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
 
 Status ShapeVerifier::HandleOutfeed(HloInstruction* instruction) {
   HloOutfeedInstruction* outfeed = Cast<HloOutfeedInstruction>(instruction);
-  // Outfeed has an optional token operand (operand 1).
-  // TODO(b/80000000): Update when token is not optional.
-  if (outfeed->operand_count() == 2) {
-    TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 1));
-  }
+  TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 1));
 
   // Outfeed has a separate shape field for the value which is outfed to the
   // host. The shape of the instruction itself is always a token.
-  if (!ShapeUtil::Compatible(outfeed->outfeed_shape(),
-                             outfeed->operand(0)->shape())) {
+  if (!ShapesSame(outfeed->outfeed_shape(), outfeed->operand(0)->shape())) {
     return InternalError(
-        "Expected outfeed shape to be compatible with operand's shape %s, "
+        "Expected outfeed shape to be equal to operand's shape %s, "
         "actual shape is %s:\n%s",
-        ShapeUtil::HumanString(outfeed->operand(0)->shape()).c_str(),
-        ShapeUtil::HumanString(outfeed->outfeed_shape()).c_str(),
+        StringifyShape(outfeed->operand(0)->shape()).c_str(),
+        StringifyShape(outfeed->outfeed_shape()).c_str(),
         outfeed->ToString().c_str());
   }
   return CheckShape(outfeed, ShapeUtil::MakeTokenShape());
 }
 
-Status ShapeVerifier::HandleHostCompute(HloInstruction*) {
-  return Status::OK();
-}
-
 bool ShapeVerifier::HasCompatibleElementTypes(const Shape& shape_0,
                                               const Shape& shape_1,
                                               const Shape& result_shape) {
@@ -269,8 +253,8 @@ Status ShapeVerifier::HandleSort(HloInstruction* sort) {
     return InternalError(
         "Expected sort to have to have the same dimensions for the keys and "
         "the values. Keys shape is: %s\n, Values shape is: %s",
-        ShapeUtil::HumanString(sort->operand(0)->shape()).c_str(),
-        ShapeUtil::HumanString(sort->operand(1)->shape()).c_str());
+        StringifyShape(sort->operand(0)->shape()).c_str(),
+        StringifyShape(sort->operand(1)->shape()).c_str());
   }
   return CheckVariadicShape(sort);
 }
@@ -344,7 +328,18 @@ Status ShapeVerifier::HandleParameter(HloInstruction* hlo) {
   return Status::OK();
 }
 
-Status ShapeVerifier::HandleFusion(HloInstruction*) { return Status::OK(); }
+Status ShapeVerifier::HandleFusion(HloInstruction* fusion) {
+  for (HloInstruction* fused_param : fusion->fused_parameters()) {
+    int64 param_no = fused_param->parameter_number();
+    if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape())) {
+      return InternalError(
+          "Shape mismatch between parameter number %lld and its operand in "
+          "%s.",
+          param_no, fusion->ToString().c_str());
+    }
+  }
+  return Status::OK();
+}
 
 Status ShapeVerifier::HandleCall(HloInstruction* call) {
   for (int64 i = 0; i < call->to_apply()->num_parameters(); ++i) {
@@ -426,12 +421,11 @@ Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) {
       CheckOperandAndParameter(xla_while, 0, xla_while->while_condition(), 0));
   const Shape& conditional_shape =
       xla_while->while_condition()->root_instruction()->shape();
-  if (!ShapeUtil::Compatible(conditional_shape,
-                             ShapeUtil::MakeShape(PRED, {}))) {
+  if (!ShapesSame(conditional_shape, ShapeUtil::MakeShape(PRED, {}))) {
     return InternalError(
         "Conditional computation shape does not lead to a scalar predicate "
         "shape: %s",
-        ShapeUtil::HumanString(conditional_shape).c_str());
+        StringifyShape(conditional_shape).c_str());
   }
   // The shape of kWhile should match the shape of the body computation it
   // calls.
@@ -579,7 +573,7 @@ Status ShapeVerifier::HandleGather(HloInstruction* gather) {
       gather,
       ShapeInference::InferGatherShape(
           gather->operand(0)->shape(), gather->operand(1)->shape(),
-          gather->gather_dimension_numbers(), gather->gather_window_bounds()));
+          gather->gather_dimension_numbers(), gather->gather_slice_sizes()));
 }
 
 Status ShapeVerifier::HandleScatter(HloInstruction* scatter) {
@@ -609,52 +603,51 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
   }
 
   // Check if the output shape matches the expected shape.
-  bool compatible;
+  //
   // We treat BF16 and F32 as compatible types if mixed precision is allowed,
   // but only when the instruction defines the BF16/F32 buffer.
-  switch (instruction->opcode()) {
-    case HloOpcode::kTupleSelect:
-      // TupleSelect only defines the top-level buffer, which in this case is
-      // the tuple, so we cannot allow mixed precision.
-      compatible = ShapeUtil::Compatible(instruction->shape(), inferred_shape);
-      break;
-    case HloOpcode::kGetTupleElement:
-    case HloOpcode::kTuple:
-      // Tuple and GetTupleElement do not define BF16/F32 buffers, so mixed
-      // precision is disallowed.
-    case HloOpcode::kConstant:
-    case HloOpcode::kBitcast:
-    case HloOpcode::kBitcastConvert:
-    case HloOpcode::kCall:
-    case HloOpcode::kConditional:
-    case HloOpcode::kConvert:
-    case HloOpcode::kCustomCall:
-    case HloOpcode::kInfeed:
-    case HloOpcode::kOutfeed:
-    case HloOpcode::kParameter:
-    case HloOpcode::kRecv:
-    case HloOpcode::kRecvDone:
-    case HloOpcode::kSend:
-    case HloOpcode::kSendDone:
-    case HloOpcode::kWhile:
-      // The above opcodes should match the expected shapes exactly.
-      compatible = ShapeUtil::Compatible(instruction->shape(), inferred_shape);
-      break;
-    default:
-      if (allow_mixed_precision_) {
-        compatible = ShapeUtil::CompatibleIgnoringFpPrecision(
-            instruction->shape(), inferred_shape);
-      } else {
-        compatible =
-            ShapeUtil::Compatible(instruction->shape(), inferred_shape);
-      }
-  }
-  if (!compatible) {
+  bool equal = [&] {
+    switch (instruction->opcode()) {
+      // The opcodes below can't have implicit layout conversions, nor can they
+      // implicitly transform f32 -> bf16.  Fundamentally these are either
+      // reinterpreting existing data (e.g. kBitcast) or shuffling data around
+      // without modifying it (e.g. kGetTupleElement, kTupleSelect).
+      case HloOpcode::kBitcast:
+      case HloOpcode::kCall:
+      case HloOpcode::kConditional:
+      case HloOpcode::kConstant:
+      case HloOpcode::kCustomCall:
+      case HloOpcode::kGetTupleElement:
+      case HloOpcode::kInfeed:
+      case HloOpcode::kOutfeed:
+      case HloOpcode::kParameter:
+      case HloOpcode::kRecv:
+      case HloOpcode::kRecvDone:
+      case HloOpcode::kSend:
+      case HloOpcode::kSendDone:
+      case HloOpcode::kTuple:
+      case HloOpcode::kTupleSelect:
+      case HloOpcode::kWhile:
+        return ShapesSame(instruction->shape(), inferred_shape);
+
+      // We allow arbitrary layout and f32->bf16 transformations on all other
+      // instructions, although this may be made more strict pending discussion
+      // in b/112709536.
+      default:
+        if (allow_mixed_precision_) {
+          return ShapeUtil::CompatibleIgnoringFpPrecision(instruction->shape(),
+                                                          inferred_shape);
+        } else {
+          return ShapeUtil::Compatible(instruction->shape(), inferred_shape);
+        }
+    }
+  }();
+  if (!equal) {
     return InternalError(
-        "Expected instruction to have shape compatible with %s, actual "
+        "Expected instruction to have shape equal to %s, actual "
         "shape is %s:\n%s",
-        ShapeUtil::HumanString(inferred_shape).c_str(),
-        ShapeUtil::HumanString(instruction->shape()).c_str(),
+        StringifyShape(inferred_shape).c_str(),
+        StringifyShape(instruction->shape()).c_str(),
         instruction->ToString().c_str());
   }
   return Status::OK();
@@ -699,10 +692,10 @@ Status ShapeVerifier::CheckVariadicShape(const HloInstruction* instruction) {
 
 string ComputationsToString(
     tensorflow::gtl::ArraySlice<HloComputation*> computations) {
-  return tensorflow::str_util::Join(
-      computations, ",", [](string* s, const HloComputation* computation) {
-        s->append(computation->name());
-      });
+  return absl::StrJoin(computations, ",",
+                       [](string* s, const HloComputation* computation) {
+                         s->append(computation->name());
+                       });
 }
 
 // Verifies various invariants about the structure of the HLO:
@@ -838,7 +831,7 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
   }
 
   // Fused parameter instructions must be numbered contiguously and match up
-  // (shapes compatible) with their respective operand.
+  // (shapes equal) with their respective operand.
   CHECK_EQ(fusion->operands().size(), fused_parameters.size());
   std::vector<bool> parameter_numbers(fused_parameters.size(), false);
   for (auto fused_param : fused_parameters) {
@@ -859,13 +852,6 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
           param_no, fusion->ToString().c_str());
     }
     parameter_numbers[param_no] = true;
-    if (!ShapeUtil::Compatible(fused_param->shape(),
-                               fusion->operand(param_no)->shape())) {
-      return InternalError(
-          "Shape mismatch between parameter number %lld and its operand in "
-          "%s.",
-          param_no, fusion->ToString().c_str());
-    }
   }
   // Make sure all the parameter_numbers entries were seen.
   for (int i = 0; i < parameter_numbers.size(); i++) {
@@ -927,7 +913,7 @@ Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
     if (!ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) {
       return FailedPrecondition(
           "Implicit broadcast is not allowed in HLO."
-          "Found non-compatible shapes for instruction %s.\n"
+          "Found different shapes for instruction %s.\n"
           "output: %s\noperand: %s\n",
           HloOpcodeString(instruction->opcode()).c_str(),
           ShapeUtil::HumanString(out_shape).c_str(),
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index c942fab08e..b6093d667c 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 
 namespace xla {
@@ -27,9 +28,9 @@ namespace xla {
 // TODO(b/26024837): Check output shape for all instruction types.
 class ShapeVerifier : public DfsHloVisitor {
  public:
-  explicit ShapeVerifier() : allow_mixed_precision_(false) {}
-  explicit ShapeVerifier(bool allow_mixed_precision)
-      : allow_mixed_precision_(allow_mixed_precision) {}
+  explicit ShapeVerifier(bool layout_sensitive, bool allow_mixed_precision)
+      : layout_sensitive_(layout_sensitive),
+        allow_mixed_precision_(allow_mixed_precision) {}
 
   Status HandleElementwiseUnary(HloInstruction* hlo) override;
   Status HandleElementwiseBinary(HloInstruction* hlo) override;
@@ -63,7 +64,6 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleFusion(HloInstruction*) override;
   Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction*) override;
-  Status HandleHostCompute(HloInstruction*) override;
   Status HandleSlice(HloInstruction* slice) override;
   Status HandleDynamicSlice(HloInstruction* dynamic_slice) override;
   Status HandleDynamicUpdateSlice(
@@ -106,13 +106,42 @@ class ShapeVerifier : public DfsHloVisitor {
   Status CheckVariadicShape(const HloInstruction* instruction);
 
  private:
-  // Return true if the shapes of the two operands have the same element type,
-  // and the result shape either has the same element type as the operand
-  // shapes or mixed precision is allowed and the result shape and the operand
-  // shapes have floating point element types.
+  // Helpers that switch on layout_sensitive_.
+  bool ShapesSame(const Shape& a, const Shape& b) {
+    return layout_sensitive_ ? ShapeUtil::Equal(a, b)
+                             : ShapeUtil::Compatible(a, b);
+  }
+  bool ShapesSameIgnoringFpPrecision(const Shape& a, const Shape& b) {
+    return layout_sensitive_ ? ShapeUtil::EqualIgnoringFpPrecision(a, b)
+                             : ShapeUtil::CompatibleIgnoringFpPrecision(a, b);
+  }
+  string StringifyShape(const Shape& s) {
+    return layout_sensitive_ ? ShapeUtil::HumanStringWithLayout(s)
+                             : ShapeUtil::HumanString(s);
+  }
+
+  // Checks that the given operand of the given instruction is of type TOKEN.
+  Status CheckIsTokenOperand(const HloInstruction* instruction,
+                             int64 operand_no);
+
+  // Checks that the shape of the given operand of the given instruction matches
+  // the given parameter of the given computation.
+  Status CheckOperandAndParameter(const HloInstruction* instruction,
+                                  int64 operand_number,
+                                  const HloComputation* computation,
+                                  int64 parameter_number);
+
+  // Returns true if the shapes of the two operands have the same element type,
+  // and the result shape either has the same element type as the operand shapes
+  // or mixed precision is allowed and the result shape and the operand shapes
+  // have floating point element types.
   bool HasCompatibleElementTypes(const Shape& shape_0, const Shape& shape_1,
                                  const Shape& result_shape);
 
+  // If the verifier is layout-sensitive, shapes must be equal to what's
+  // expected.  Otherwise, the shapes must simply be compatible.
+  bool layout_sensitive_;
+
   // Whether the inputs and output of an instruction can contain both F32s and
   // BF16s. Tuples that include both F32s and BF16s are allowed regardless of
   // this flag.
@@ -125,14 +154,10 @@ class HloVerifier : public HloPassInterface {
  public:
   using ShapeVerifierFactory = std::function<std::unique_ptr<ShapeVerifier>()>;
 
-  // Uses standard shape inference.
-  explicit HloVerifier()
-      : shape_verifier_factory_(
-            [] { return MakeUnique<ShapeVerifier>(false); }) {}
-
-  explicit HloVerifier(bool allow_mixed_precision)
-      : shape_verifier_factory_([allow_mixed_precision] {
-          return MakeUnique<ShapeVerifier>(allow_mixed_precision);
+  explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision)
+      : shape_verifier_factory_([layout_sensitive, allow_mixed_precision] {
+          return absl::make_unique<ShapeVerifier>(layout_sensitive,
+                                                  allow_mixed_precision);
         }) {}
 
   // Uses custom shape verification.
@@ -140,10 +165,9 @@ class HloVerifier : public HloPassInterface {
       : shape_verifier_factory_(std::move(shape_verifier_factory)) {}
 
   ~HloVerifier() override = default;
-  tensorflow::StringPiece name() const override { return "verifier"; }
+  absl::string_view name() const override { return "verifier"; }
 
-  // Note: always returns false (no instructions are ever modified by this
-  // pass).
+  // Never returns true; no instructions are ever modified by this pass.
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index d764964f3c..70b741353d 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -37,13 +37,15 @@ using ::testing::HasSubstr;
 class HloVerifierTest : public HloTestBase {
  public:
   HloVerifierTest()
-      : HloTestBase(/*allow_mixed_precision_in_hlo_verifier=*/false) {}
+      : HloTestBase(/*verifier_layout_sensitive=*/false,
+                    /*allow_mixed_precision_in_hlo_verifier=*/false) {}
 };
 
 class HloVerifierTestAllowMixedPrecision : public HloTestBase {
  public:
   HloVerifierTestAllowMixedPrecision()
-      : HloTestBase(/*allow_mixed_precision_in_hlo_verifier=*/true) {}
+      : HloTestBase(/*verifier_layout_sensitive=*/false,
+                    /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 };
 
 TEST_F(HloVerifierTest, NullInstructionParent) {
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index bb5b40a8a8..581b3ce1e0 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -14,20 +14,20 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/metric_table_report.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
 
+using absl::StrAppend;
+using absl::StrCat;
 using tensorflow::strings::Appendf;
 using tensorflow::strings::HumanReadableElapsedTime;
 using tensorflow::strings::HumanReadableNumBytes;
 using tensorflow::strings::Printf;
-using tensorflow::strings::StrAppend;
-using tensorflow::strings::StrCat;
 
 string HumanReadableProfileBuilder::ToString() const {
   string s;
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.h b/tensorflow/compiler/xla/service/human_readable_profile_builder.h
index 6f56c3aa82..b99624460e 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.h
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -29,7 +29,7 @@ namespace xla {
 // computation, suitable for consumption by humans.
 class HumanReadableProfileBuilder {
  public:
-  explicit HumanReadableProfileBuilder(tensorflow::StringPiece computation_name,
+  explicit HumanReadableProfileBuilder(absl::string_view computation_name,
                                        int64 total_cycles,
                                        double clock_rate_ghz)
       : computation_name_(std::string(computation_name)),
@@ -43,9 +43,8 @@ class HumanReadableProfileBuilder {
   // Adds an operation to the profile.  If you don't know the number of
   // floating-point ops or bytes touched by the op, or if you don't know how
   // fast it would run optimally, pass -1 for that param.
-  void AddOp(tensorflow::StringPiece op_name,
-             tensorflow::StringPiece short_name,
-             tensorflow::StringPiece category, int64 cycles, int64 flop_count,
+  void AddOp(absl::string_view op_name, absl::string_view short_name,
+             absl::string_view category, int64 cycles, int64 flop_count,
              int64 transcendental_count, int64 bytes_accessed,
              float optimal_seconds) {
     op_infos_.push_back({std::string(op_name), std::string(short_name),
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
index aa325dc8a3..85bb4a8b24 100644
--- a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
@@ -30,7 +30,7 @@ class ImplicitBroadcastRemover : public HloPassInterface {
   ImplicitBroadcastRemover() {}
   ~ImplicitBroadcastRemover() override {}
 
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "implicit-broadcast-remover";
   }
 
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
index f85d31d522..df88587492 100644
--- a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
@@ -26,6 +26,11 @@ namespace xla {
 namespace {
 
 class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase {
+ public:
+  ImplicitBroadcastRemoverTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+
  protected:
   ImplicitBroadcastRemover remover_;
 };
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index 3531b7223f..43ef30d1eb 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -14,13 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
+
+#include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/gtl/optional.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace gtl = ::tensorflow::gtl;
@@ -31,32 +34,30 @@ using UnknownArray = Analysis::UnknownArray;
 using ConstantArray = Analysis::ConstantArray;
 using ReshapedArray = Analysis::ReshapedArray;
 using ScalarIndexedArray = Analysis::ScalarIndexedArray;
+using absl::StrJoin;
 using tensorflow::gtl::ArraySlice;
-using tensorflow::str_util::Join;
 }  // namespace
 
 string IndexedArrayAnalysis::ToString(Array* root, bool print_constants) {
   switch (root->kind()) {
     case Array::kUnknown: {
       auto* unknown_tensor = root->as<UnknownArray>();
-      return tensorflow::strings::StrCat("%",
-                                         unknown_tensor->instruction().name());
+      return absl::StrCat("%", unknown_tensor->instruction().name());
     }
 
     case Array::kConstant: {
       if (print_constants) {
         string contents = root->as<ConstantArray>()->literal()->ToString();
-        return tensorflow::strings::StrCat(
-            "(constant ", ShapeUtil::HumanString(root->shape()), " ", contents,
-            ")");
+        return absl::StrCat("(constant ", ShapeUtil::HumanString(root->shape()),
+                            " ", contents, ")");
       }
-      return tensorflow::strings::StrCat(
-          "(constant ", ShapeUtil::HumanString(root->shape()), ")");
+      return absl::StrCat("(constant ", ShapeUtil::HumanString(root->shape()),
+                          ")");
     }
 
     case Array::kReshaped: {
       ReshapedArray* reshaped_array = root->as<ReshapedArray>();
-      return tensorflow::strings::StrCat(
+      return absl::StrCat(
           "(reshape ", ToString(reshaped_array->operand(), print_constants),
           " to ", ShapeUtil::HumanString(reshaped_array->shape()), ")");
     }
@@ -67,11 +68,11 @@ string IndexedArrayAnalysis::ToString(Array* root, bool print_constants) {
       string name = root->kind() == Array::kScalarIndexedConstant
                         ? "scalar-indexed-const"
                         : "scalar-indexed";
-      return tensorflow::strings::StrCat(
+      return absl::StrCat(
           "(", name, " ", ToString(indexed_array->source(), print_constants),
           " ", ToString(indexed_array->indices(), print_constants), " ",
           indexed_array->source_dim(), "->[",
-          Join(indexed_array->output_dims(), ","), "])");
+          StrJoin(indexed_array->output_dims(), ","), "])");
     }
   }
 }
@@ -92,7 +93,7 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache(
   // Depth first search over the DAG, invoking ComputeArrayFor in post order.
   // The HLO instructions already in the cache are considered leaves.
 
-  gtl::InlinedVector<const HloInstruction*, 4> stack;
+  absl::InlinedVector<const HloInstruction*, 4> stack;
 
   enum DfsState { kDiscovered, kVisited };
   gtl::FlatMap<const HloInstruction*, DfsState> dfs_state_map;
@@ -153,7 +154,7 @@ StatusOr<Analysis::Array*> IndexedArrayAnalysis::ComputeArrayFor(
     TF_ASSIGN_OR_RETURN(
         computed_array,
         ComputeArrayForGather(instr->shape(), instr->gather_dimension_numbers(),
-                              instr->gather_window_bounds(),
+                              instr->gather_slice_sizes(),
                               FindOrDie(cache_, instr->operand(0)),
                               FindOrDie(cache_, instr->operand(1))));
   } else if (instr->opcode() == HloOpcode::kReshape) {
@@ -251,24 +252,23 @@ StatusOr<ScalarIndexedArray*> IndexedArrayAnalysis::FoldGatherOfGather(
 
 StatusOr<Analysis::Array*> IndexedArrayAnalysis::ComputeArrayForGather(
     const Shape& shape, const GatherDimensionNumbers& dim_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds, Array* source,
+    tensorflow::gtl::ArraySlice<int64> slice_sizes, Array* source,
     Array* indices) {
   if (dim_numbers.index_vector_dim() != indices->shape().dimensions_size()) {
     VLOG(3) << "ComputeArrayForGather: indices are not scalar";
     return nullptr;
   }
 
-  CHECK_EQ(dim_numbers.gather_dims_to_operand_dims_size(), 1);
+  CHECK_EQ(dim_numbers.start_index_map_size(), 1);
 
-  // We can also handle dim_numbers.elided_window_dims_size() == 0 here, should
-  // it become relevant.
+  // We can also handle dim_numbers.collapsed_slice_dims_size() == 0 here,
+  // should it become relevant.
 
-  if (dim_numbers.elided_window_dims_size() != 1 ||
-      dim_numbers.elided_window_dims(0) !=
-          dim_numbers.gather_dims_to_operand_dims(0)) {
+  if (dim_numbers.collapsed_slice_dims_size() != 1 ||
+      dim_numbers.collapsed_slice_dims(0) != dim_numbers.start_index_map(0)) {
     VLOG(3) << "ComputeArrayForGather: gather operations must elide "
-               "gather_dims_to_operand_dims[0] and "
-               "gather_dims_to_operand_dims[0] only";
+               "start_index_map[0] and "
+               "start_index_map[0] only";
     return nullptr;
   }
 
@@ -277,27 +277,27 @@ StatusOr<Analysis::Array*> IndexedArrayAnalysis::ComputeArrayForGather(
   // arrays from an array of size [7,4,6].  We check that condition down below:
 
   for (int64 i = 0, e = source->shape().dimensions_size(); i < e; i++) {
-    if (i != dim_numbers.elided_window_dims(0) &&
-        source->shape().dimensions(i) != window_bounds[i]) {
-      VLOG(3) << "ComputeArrayForGather: window_bounds[" << i
+    if (i != dim_numbers.collapsed_slice_dims(0) &&
+        source->shape().dimensions(i) != slice_sizes[i]) {
+      VLOG(3) << "ComputeArrayForGather: slice_sizes[" << i
               << "] != source->shape().dimensions(" << i << ") -- "
-              << source->shape().dimensions(i) << " vs. " << window_bounds[i]
-              << " with dim_numbers.elided_window_dims(0) = "
-              << dim_numbers.elided_window_dims(0);
+              << source->shape().dimensions(i) << " vs. " << slice_sizes[i]
+              << " with dim_numbers.collapsed_slice_dims(0) = "
+              << dim_numbers.collapsed_slice_dims(0);
       return nullptr;
     }
   }
 
-  int64 source_dim = dim_numbers.gather_dims_to_operand_dims(0);
+  int64 source_dim = dim_numbers.start_index_map(0);
   std::vector<int64> output_dims;
   for (int64 i = 0, e = shape.dimensions_size(); i < e; i++) {
-    if (!c_binary_search(dim_numbers.output_window_dims(), i)) {
+    if (!absl::c_binary_search(dim_numbers.offset_dims(), i)) {
       output_dims.push_back(i);
     }
   }
 
   if (auto* indexed = dynamic_cast<ScalarIndexedArray*>(source)) {
-    if (c_linear_search(indexed->output_dims(), source_dim)) {
+    if (absl::c_linear_search(indexed->output_dims(), source_dim)) {
       return FoldGatherOfGather(indexed, indices, source_dim, output_dims,
                                 shape);
     }
@@ -315,7 +315,7 @@ namespace {
 // [values.begin()+index, values.end()) is equal to `product`.  If there is no
 // such index, return -1.  All integers in `values` must be positive.
 int64 FindSuffixWithProduct(ArraySlice<int64> values, int64 product) {
-  DCHECK(c_all_of(values, [](int64 value) { return value > 0; }));
+  DCHECK(absl::c_all_of(values, [](int64 value) { return value > 0; }));
 
   int64 current_product = 1;
   int64 i;
@@ -378,8 +378,8 @@ std::vector<ReshapePassthroughDimPair> ComputeReshapePassthroughDimPairs(
     CHECK_NE(candidate_operand_dim, 0)
         << "result_dim = " << result_dim
         << ", result_subarray_size = " << result_subarray_size
-        << ", result_shape = [" << Join(result_shape, ",") << "]"
-        << ", operand_shape = [" << Join(operand_shape, ",") << "]";
+        << ", result_shape = [" << StrJoin(result_shape, ",") << "]"
+        << ", operand_shape = [" << StrJoin(operand_shape, ",") << "]";
 
     if (candidate_operand_dim != -1 &&
         result_shape[result_dim] == operand_shape[candidate_operand_dim - 1]) {
@@ -389,26 +389,27 @@ std::vector<ReshapePassthroughDimPair> ComputeReshapePassthroughDimPairs(
     result_subarray_size *= result_shape[result_dim];
   }
 
-  c_reverse(result);
+  absl::c_reverse(result);
 
   if (VLOG_IS_ON(3)) {
     std::vector<string> result_strings;
-    c_transform(result, std::back_inserter(result_strings),
-                [](ReshapePassthroughDimPair value) {
-                  return tensorflow::strings::StrCat(value.result_dim, "->",
-                                                     value.operand_dim);
-                });
-    VLOG(3) << "For a reshape from [" << Join(operand_shape, ",") << "] to ["
-            << Join(result_shape, ",") << "] passthrough indices are ["
-            << Join(result_strings, ",") << "] (legend: `result`->`operand`)";
+    absl::c_transform(result, std::back_inserter(result_strings),
+                      [](ReshapePassthroughDimPair value) {
+                        return absl::StrCat(value.result_dim, "->",
+                                            value.operand_dim);
+                      });
+    VLOG(3) << "For a reshape from [" << StrJoin(operand_shape, ",") << "] to ["
+            << StrJoin(result_shape, ",") << "] passthrough indices are ["
+            << StrJoin(result_strings, ",")
+            << "] (legend: `result`->`operand`)";
   }
 
-  DCHECK(c_is_sorted(
+  DCHECK(absl::c_is_sorted(
       result, [](ReshapePassthroughDimPair lhs, ReshapePassthroughDimPair rhs) {
         return lhs.result_dim < rhs.result_dim;
       }));
 
-  DCHECK(c_is_sorted(
+  DCHECK(absl::c_is_sorted(
       result, [](ReshapePassthroughDimPair lhs, ReshapePassthroughDimPair rhs) {
         return lhs.operand_dim < rhs.operand_dim;
       }));
@@ -420,20 +421,20 @@ std::vector<ReshapePassthroughDimPair> ComputeReshapePassthroughDimPairs(
 // `passthrough_dims`.
 bool IsReshapePassthroughOperandDim(
     ArraySlice<ReshapePassthroughDimPair> passthrough_dims, int64 dim) {
-  return c_any_of(passthrough_dims,
-                  [&](ReshapePassthroughDimPair passthrough_dim_pair) {
-                    return passthrough_dim_pair.operand_dim == dim;
-                  });
+  return absl::c_any_of(passthrough_dims,
+                        [&](ReshapePassthroughDimPair passthrough_dim_pair) {
+                          return passthrough_dim_pair.operand_dim == dim;
+                        });
 }
 
 // Maps `operand_dim` which must be an passthrough operand dimension to its
 // corresponding passthrough result dimension based on `passthrough_dims`.
 int64 MapPassthroughOperandDimToResultDim(
     ArraySlice<ReshapePassthroughDimPair> passthrough_dims, int64 operand_dim) {
-  auto it = c_find_if(passthrough_dims,
-                      [&](ReshapePassthroughDimPair passthrough_dim_pair) {
-                        return passthrough_dim_pair.operand_dim == operand_dim;
-                      });
+  auto it = absl::c_find_if(
+      passthrough_dims, [&](ReshapePassthroughDimPair passthrough_dim_pair) {
+        return passthrough_dim_pair.operand_dim == operand_dim;
+      });
   CHECK(it != passthrough_dims.end());
   return it->result_dim;
 }
@@ -442,7 +443,7 @@ int64 FindSourcePositionForPassthroughResultDim(ArraySlice<int64> operand_shape,
                                                 ArraySlice<int64> result_shape,
                                                 int64 source_passthrough_dim) {
   VLOG(3) << "FindSourcePositionForPassthroughResultDim(["
-          << Join(operand_shape, ",") << "], [" << Join(result_shape, ",")
+          << StrJoin(operand_shape, ",") << "], [" << StrJoin(result_shape, ",")
           << "], " << source_passthrough_dim << ")";
 
   int64 indexed_source_subarray_size =
@@ -454,8 +455,8 @@ int64 FindSourcePositionForPassthroughResultDim(ArraySlice<int64> operand_shape,
 
 Shape StripDegenerateDimensions(const Shape& shape) {
   DimensionVector new_dims;
-  c_copy_if(shape.dimensions(), std::back_inserter(new_dims),
-            [](int64 dim) { return dim != 1; });
+  absl::c_copy_if(shape.dimensions(), std::back_inserter(new_dims),
+                  [](int64 dim) { return dim != 1; });
   return ShapeUtil::MakeShape(shape.element_type(), new_dims);
 }
 };  // namespace
@@ -531,7 +532,7 @@ StatusOr<ScalarIndexedArray*> IndexedArrayAnalysis::ReshapeToAddDegenerateDims(
     // element is true iff the i'th component of the result index is an output
     // index.
 
-    gtl::InlinedVector<bool, 6> output_dims_bitvector(
+    absl::InlinedVector<bool, 6> output_dims_bitvector(
         operand->shape().dimensions_size());
     for (int64 output_dim : operand->output_dims()) {
       output_dims_bitvector[output_dim] = true;
@@ -553,8 +554,8 @@ StatusOr<ScalarIndexedArray*> IndexedArrayAnalysis::ReshapeToAddDegenerateDims(
   }();
 
   DimensionVector new_result_shape_dims;
-  c_copy(operand->shape().dimensions(),
-         std::back_inserter(new_result_shape_dims));
+  absl::c_copy(operand->shape().dimensions(),
+               std::back_inserter(new_result_shape_dims));
   for (int64 degenerate_dim : degenerate_dims) {
     InsertAt(&new_result_shape_dims, degenerate_dim, 1);
   }
@@ -695,8 +696,8 @@ IndexedArrayAnalysis::FoldReshapeOfGatherNoDegenerateDims(
                                           operand_dim);
   };
 
-  if (!c_all_of(scalar_indexed->output_dims(),
-                is_reshape_passthrough_operand_dim)) {
+  if (!absl::c_all_of(scalar_indexed->output_dims(),
+                      is_reshape_passthrough_operand_dim)) {
     VLOG(3) << "Not all output dims are passthrough dims "
             << ToString(scalar_indexed);
     return nullptr;
@@ -735,11 +736,11 @@ IndexedArrayAnalysis::FoldReshapeOfGatherNoDegenerateDims(
   //   operand = s32[3,5,2] constant({...})
   //   indices = s32[7] parameter(0)
   //   gather = s32[3,2,7] gather(operand, indices),
-  //       output_window_dims={0,1},
-  //       elided_window_dims={1},
-  //       gather_dims_to_operand_dims={1},
+  //       offset_dims={0,1},
+  //       collapsed_slice_dims={1},
+  //       start_index_map={1},
   //       index_vector_dim=1,
-  //       window_bounds={3,1,2}
+  //       slice_sizes={3,1,2}
   //   reshape = s32[6,7] reshape(gather)
   //
   // In this case the gather maps to:
@@ -754,9 +755,9 @@ IndexedArrayAnalysis::FoldReshapeOfGatherNoDegenerateDims(
   if (source_dim_for_new_scalar_indexed_node == -1) {
     VLOG(3) << "Could not compute the source dim for the new scalar indexed "
                "node: scalar_indexed_source_shape = ["
-            << Join(scalar_indexed_source_shape.dimensions(), ",")
+            << StrJoin(scalar_indexed_source_shape.dimensions(), ",")
             << "] and new_scalar_indexed_source_shape = ["
-            << Join(new_scalar_indexed_source_shape, ",") << "]";
+            << StrJoin(new_scalar_indexed_source_shape, ",") << "]";
     return nullptr;
   }
 
@@ -764,8 +765,8 @@ IndexedArrayAnalysis::FoldReshapeOfGatherNoDegenerateDims(
       &new_scalar_indexed_source_shape, source_dim_for_new_scalar_indexed_node,
       scalar_indexed_source_shape.dimensions(scalar_indexed->source_dim()));
 
-  CHECK_EQ(c_accumulate(new_scalar_indexed_source_shape, 1LL,
-                        std::multiplies<int64>()),
+  CHECK_EQ(absl::c_accumulate(new_scalar_indexed_source_shape, 1LL,
+                              std::multiplies<int64>()),
            ShapeUtil::ElementsIn(scalar_indexed_source_shape));
 
   CHECK(IsReshapePassthroughOperandDim(
@@ -781,9 +782,9 @@ IndexedArrayAnalysis::FoldReshapeOfGatherNoDegenerateDims(
   };
 
   std::vector<int64> output_dims_for_new_scalar_indexed_node;
-  c_transform(scalar_indexed->output_dims(),
-              std::back_inserter(output_dims_for_new_scalar_indexed_node),
-              map_passthrough_operand_dim_to_result_dim);
+  absl::c_transform(scalar_indexed->output_dims(),
+                    std::back_inserter(output_dims_for_new_scalar_indexed_node),
+                    map_passthrough_operand_dim_to_result_dim);
 
   TF_ASSIGN_OR_RETURN(const Literal* new_scalar_indexed_source_literal,
                       TakeOwnership(scalar_indexed->literal().Reshape(
@@ -874,11 +875,12 @@ IndexedArrayAnalysis::ComputeArrayForElementwiseBinaryOp(HloOpcode opcode,
 
   ArraySlice<int64> broadcast_dims = broadcast_instr->dimensions();
   auto is_broadcasted_dim = [&](int64 output_dim) {
-    return c_find(broadcast_dims, output_dim) == broadcast_dims.end();
+    return absl::c_find(broadcast_dims, output_dim) == broadcast_dims.end();
   };
 
   // All of the output dims must be "broadcasted" dims for the other operand.
-  if (!c_all_of(scalar_indexed_const->output_dims(), is_broadcasted_dim)) {
+  if (!absl::c_all_of(scalar_indexed_const->output_dims(),
+                      is_broadcasted_dim)) {
     return nullptr;
   }
 
@@ -970,15 +972,15 @@ namespace {
 
 // Returns the non-contracting non-batch dimension (as per `contracting_dims`
 // and `batch_dims`) if there is exactly one, otherwise returns nullopt.
-gtl::optional<int64> GetOnlyNonContractingNonBatchDim(
+absl::optional<int64> GetOnlyNonContractingNonBatchDim(
     int64 rank, ArraySlice<int64> contracting_dims,
     ArraySlice<int64> batch_dims) {
-  gtl::optional<int64> result;
+  absl::optional<int64> result;
   for (int64 dim = 0; dim < rank; dim++) {
     if (!ArrayContains(contracting_dims, dim) &&
         !ArrayContains(batch_dims, dim)) {
       if (result.has_value()) {
-        return gtl::nullopt;
+        return absl::nullopt;
       }
       result = dim;
     }
@@ -995,10 +997,9 @@ gtl::optional<int64> GetOnlyNonContractingNonBatchDim(
 // `contracting_dims` and `batch_dims` are the contracting and batch dimensions
 // of whatever operand `indexed_array` is to the dot (LHS or RHS).
 bool CanFoldDotIntoIndexedArray(
-    tensorflow::StringPiece tag,
-    Analysis::ScalarIndexedConstantArray* indexed_array,
+    absl::string_view tag, Analysis::ScalarIndexedConstantArray* indexed_array,
     ArraySlice<int64> contracting_dims, ArraySlice<int64> batch_dims) {
-  gtl::optional<int64> non_contracting_non_batch_dim =
+  absl::optional<int64> non_contracting_non_batch_dim =
       GetOnlyNonContractingNonBatchDim(ShapeUtil::Rank(indexed_array->shape()),
                                        contracting_dims, batch_dims);
   if (!non_contracting_non_batch_dim.has_value()) {
@@ -1133,7 +1134,7 @@ StatusOr<Analysis::Array*> IndexedArrayAnalysis::ComputeArrayForDot(
   return nullptr;
 }
 
-tensorflow::StringPiece IndexedArrayAnalysisPrinterPass::name() const {
+absl::string_view IndexedArrayAnalysisPrinterPass::name() const {
   return "indexed-array-analysis-printer-pass";
 }
 
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.h b/tensorflow/compiler/xla/service/indexed_array_analysis.h
index e923dc39f7..3fa7d749e1 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.h
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.h
@@ -265,7 +265,7 @@ class IndexedArrayAnalysis {
 
   StatusOr<Array*> ComputeArrayForGather(
       const Shape& shape, const GatherDimensionNumbers& dim_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds, Array* source,
+      tensorflow::gtl::ArraySlice<int64> slice_sizes, Array* source,
       Array* indices);
 
   StatusOr<Array*> ComputeArrayForDotWithIndexedLhs(
@@ -371,7 +371,7 @@ class IndexedArrayAnalysis {
 // unconditionally add to the regular HLO pass pipeline.
 class IndexedArrayAnalysisPrinterPass : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override;
+  absl::string_view name() const override;
   StatusOr<bool> Run(HloModule* module) override;
 };
 
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 5f4b42799b..c34c32f7d3 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -22,6 +22,11 @@ limitations under the License.
 namespace xla {
 namespace {
 class IndexedArrayAnalysisTest : public HloVerifiedTestBase {
+ public:
+  IndexedArrayAnalysisTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+
  protected:
   void AssertArrayForRootExpressionIs(const string& hlo_text,
                                       const string& root_expression) {
@@ -82,11 +87,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[5] parameter(1)
   ROOT gather = s32[5,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
 }
 )";
 
@@ -102,11 +107,11 @@ ENTRY main {
   operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
   indices = s32[5] parameter(0)
   ROOT gather = s32[5,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
 }
 )";
 
@@ -122,11 +127,11 @@ ENTRY main {
   operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
   indices = s32[5,2] parameter(0)
   ROOT gather = s32[5] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
 
@@ -141,11 +146,11 @@ ENTRY main {
   operand = s32[3,3,1] parameter(0)
   indices = s32[5] parameter(1)
   ROOT gather = s32[5,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,2},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0,2},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3,1}
+      slice_sizes={1,3,1}
 }
 )";
 
@@ -160,11 +165,11 @@ ENTRY main {
   operand = s32[3,3,1] parameter(0)
   indices = s32[5] parameter(1)
   ROOT gather = s32[5,2,3] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={2},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1,2},
+      collapsed_slice_dims={2},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={2,3,1}
+      slice_sizes={2,3,1}
 }
 )";
 
@@ -179,11 +184,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[5] parameter(1)
   ROOT gather = s32[5,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,2}
+      slice_sizes={1,2}
 }
 )";
 
@@ -199,17 +204,17 @@ ENTRY main {
   indices_a = s32[5] parameter(0)
   indices_b = s32[2] parameter(1)
   gather_a = s32[5,3] gather(operand, indices_a),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
   ROOT gather_b = s32[2,3] gather(gather_a, indices_b),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
 }
 )";
 
@@ -228,17 +233,17 @@ ENTRY main {
   indices_a = s32[5,7] parameter(1)
   indices_b = s32[2] parameter(2)
   gather_a = s32[5,3,7] gather(operand, indices_a),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT gather_b = s32[5,3,2] gather(gather_a, indices_b),
-      output_window_dims={0,1},
-      elided_window_dims={2},
-      gather_dims_to_operand_dims={2},
+      offset_dims={0,1},
+      collapsed_slice_dims={2},
+      start_index_map={2},
       index_vector_dim=1,
-      window_bounds={5,3,1}
+      slice_sizes={5,3,1}
 }
 )";
 
@@ -256,17 +261,17 @@ ENTRY main {
   indices_a = s32[2] parameter(1)
   indices_b = s32[5,7] parameter(2)
   gather_a = s32[2,6] gather(operand, indices_a),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,6}
+      slice_sizes={1,6}
   ROOT gather_b = s32[5,6,7] gather(gather_a, indices_b),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,6}
+      slice_sizes={1,6}
 }
 )";
 
@@ -284,17 +289,17 @@ ENTRY main {
   indices_a = s32[5,7] parameter(1)
   indices_b = s32[4,8] parameter(2)
   gather_a = s32[5,3,7] gather(operand, indices_a),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT gather_b = s32[4,5,3,8] gather(gather_a, indices_b),
-      output_window_dims={1,2},
-      elided_window_dims={2},
-      gather_dims_to_operand_dims={2},
+      offset_dims={1,2},
+      collapsed_slice_dims={2},
+      start_index_map={2},
       index_vector_dim=2,
-      window_bounds={5,3,1}
+      slice_sizes={5,3,1}
 }
 )";
 
@@ -312,11 +317,11 @@ ENTRY main {
   operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT reshape = s32[5,2,2] reshape(gather)
 }
 )";
@@ -333,11 +338,11 @@ ENTRY main {
   operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5,7] parameter(0)
   gather = s32[5,4,7] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT reshape = s32[5,2,2,7] reshape(gather)
 }
 )";
@@ -358,11 +363,11 @@ ENTRY main {
       {{1,2,3,4,5,6},{1,2,3,4,5,6}}})
   indices = s32[5,7] parameter(0)
   gather = s32[5,2,6,7] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1,2},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,2,6}
+      slice_sizes={1,2,6}
   ROOT reshape = s32[5,3,4,7] reshape(gather)
 }
 )";
@@ -381,11 +386,11 @@ ENTRY main {
       {1,2,3,4,5,6},{1,2,3,4,5,6}})
   indices = s32[1] parameter(0)
   gather = s32[1,6] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,6}
+      slice_sizes={1,6}
   ROOT reshape = s32[1,1,6] reshape(gather)
 }
 )";
@@ -408,14 +413,14 @@ ENTRY main {
   operand = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 1, 2, 3 } })
 
   i.0 = s64[1,3]{1,0} parameter(0)
-  g.0 = s32[1,3,3]{2,1,0} gather(operand, i.0), output_window_dims={2},
-    elided_window_dims={0}, gather_dims_to_operand_dims={0},
-    index_vector_dim=2, window_bounds={1,3}
+  g.0 = s32[1,3,3]{2,1,0} gather(operand, i.0), offset_dims={2},
+    collapsed_slice_dims={0}, start_index_map={0},
+    index_vector_dim=2, slice_sizes={1,3}
 
   i.1 = s64[1] parameter(1)
-  g.1 = s32[1,1,3]{2,1,0} gather(g.0, i.1), output_window_dims={0,2},
-    elided_window_dims={1}, gather_dims_to_operand_dims={1},
-    index_vector_dim=1, window_bounds={1,1,3}
+  g.1 = s32[1,1,3]{2,1,0} gather(g.0, i.1), offset_dims={0,2},
+    collapsed_slice_dims={1}, start_index_map={1},
+    index_vector_dim=1, slice_sizes={1,1,3}
 
   ROOT reshape = s32[1,3]{1,0} reshape(g.1)
 }
@@ -441,11 +446,11 @@ ENTRY main {
   operand = s32[1,6] constant(s32[1,6]{{1,2,3,4,5,6}})
   indices = s32[1] parameter(0)
   gather = s32[1,6] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,6}
+      slice_sizes={1,6}
   ROOT reshape = s32[1,1,6] reshape(gather)
 }
 )";
@@ -469,11 +474,11 @@ ENTRY main {
       {1,2,3,4,5,6},{1,2,3,4,5,6}}})
   indices = s32[1] parameter(0)
   gather = s32[1,1,6] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1,2},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={1,1,6}
+      slice_sizes={1,1,6}
   ROOT reshape = s32[1,1,1,6] reshape(gather)
 }
 )";
@@ -500,11 +505,11 @@ ENTRY main {
       {1,2,3,4,5,6},{1,2,3,4,5,6}})
   indices = s32[1,5] parameter(0)
   gather = s32[1,5,6] gather(operand, indices),
-      output_window_dims={2},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={2},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,6}
+      slice_sizes={1,6}
   ROOT reshape = s32[1,1,5,6] reshape(gather)
 }
 )";
@@ -530,11 +535,11 @@ ENTRY main {
   operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5,6] parameter(0)
   gather = s32[5,4,6] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT reshape = s32[5,2,2,2,3] reshape(gather)
 }
 )";
@@ -562,11 +567,11 @@ ENTRY main {
       {{1,2},{3,4},{5,6},{7,8},{9,10}}})
   indices = s32[7] parameter(0)
   gather = s32[3,2,7] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0,1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3,1,2}
+      slice_sizes={3,1,2}
   ROOT reshape = s32[6,7] reshape(gather)
 }
 )";
@@ -594,11 +599,11 @@ ENTRY main {
     {{1},{2},{3},{4}}})
   indices = s32[5,6] parameter(0)
   gather = s32[5,4,6,1] gather(operand, indices),
-      output_window_dims={1,3},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1,3},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=2,
-      window_bounds={1,4,1}
+      slice_sizes={1,4,1}
   ROOT reshape = s32[5,2,2,2,3,1] reshape(gather)
 }
 )";
@@ -623,20 +628,20 @@ ENTRY main {
   operand = f32[3,4] constant(f32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   indices = s32[5] parameter(0)
   gather = f32[5,4] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT tanh = f32[5,4] tanh(gather)
 }
 )";
 
   AssertArrayWithConstantsForRootExpressionIs(hlo_text, 1 + R"(
 (scalar-indexed-const (constant f32[3,4] f32[3,4] {
-  { 0.761594176, 0.964027584, 0.995054781, 0.999329329 },
-  { 0.761594176, 0.995054781, 0.964027584, 0.999329329 },
-  { 0.999329329, 0.995054781, 0.964027584, 0.761594176 }
+  { 0.761594, 0.964028, 0.995055, 0.999329 },
+  { 0.761594, 0.995055, 0.964028, 0.999329 },
+  { 0.999329, 0.995055, 0.964028, 0.761594 }
 }) %indices 0->[0]))");
 }
 
@@ -650,11 +655,11 @@ ENTRY main {
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT add = s32[5,4] add(gather, constant_broadcasted)
 }
 )";
@@ -678,11 +683,11 @@ ENTRY main {
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT sub = s32[5,4] subtract(gather, constant_broadcasted)
 }
 )";
@@ -706,11 +711,11 @@ ENTRY main {
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT sub = s32[5,4] subtract(constant_broadcasted, gather)
 }
 )";
@@ -733,11 +738,11 @@ ENTRY main {
   constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={1}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT add = s32[5,4] add(gather, constant_broadcasted)
 }
 )";
@@ -760,11 +765,11 @@ ENTRY main {
   constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={0}
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT add = s32[5,4] add(gather, constant_broadcasted)
 }
 )";
@@ -808,11 +813,11 @@ ENTRY main {
   dot_rhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_lhs = s32[5,4] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,4}
+      slice_sizes={1,4}
   ROOT dot = s32[5,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
 )";
@@ -835,11 +840,11 @@ ENTRY main {
   dot_rhs_constant = s32[3,3] constant(s32[3,3]{{1,2,3},{4,5,6},{7,8,9}})
   indices = s32[5] parameter(0)
   dot_lhs = s32[3,5] gather(gather_operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT dot = s32[5,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={0}, rhs_contracting_dims={0}
 }
 )";
@@ -863,11 +868,11 @@ ENTRY main {
   dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_rhs = s32[3,5] gather(gather_operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT dot = s32[4,5] dot(dot_lhs_constant, dot_rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
 )";
@@ -892,11 +897,11 @@ ENTRY main {
   dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_rhs = s32[5,3] gather(gather_operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1,3}
+      slice_sizes={1,3}
   ROOT dot = s32[4,5] dot(dot_lhs_constant, dot_rhs), lhs_contracting_dims={1}, rhs_contracting_dims={1}
 }
 )";
@@ -921,11 +926,11 @@ ENTRY main {
   dot_lhs_constant = s32[2,2,3] constant(s32[2,2,3]{{{1,2,3},{4,5,6}},{{7,8,9},{10,11,12}}})
   indices = s32[4] parameter(0)
   dot_rhs = s32[2,3,4] gather(gather_operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={2},
-      gather_dims_to_operand_dims={2},
+      offset_dims={0,1},
+      collapsed_slice_dims={2},
+      start_index_map={2},
       index_vector_dim=1,
-      window_bounds={2,3,1}
+      slice_sizes={2,3,1}
   ROOT dot = s32[2,2,4] dot(dot_lhs_constant, dot_rhs),
       lhs_contracting_dims={2}, rhs_contracting_dims={1},
       lhs_batch_dims={0}, rhs_batch_dims={0}
@@ -952,11 +957,11 @@ ENTRY main {
   dot_rhs_constant = s32[2,3] constant(s32[2,3]{{1,2,3},{4,5,6}})
   indices = s32[2] parameter(0)
   dot_lhs = s32[3,2] gather(gather_operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3,1}
+      slice_sizes={3,1}
   ROOT dot = s32[3,3] dot(dot_lhs, dot_rhs_constant), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
 )";
diff --git a/tensorflow/compiler/xla/service/inliner.h b/tensorflow/compiler/xla/service/inliner.h
index a523811f6c..efa8ed3abc 100644
--- a/tensorflow/compiler/xla/service/inliner.h
+++ b/tensorflow/compiler/xla/service/inliner.h
@@ -27,7 +27,7 @@ namespace xla {
 class Inliner : public HloPassInterface {
  public:
   ~Inliner() override = default;
-  tensorflow::StringPiece name() const override { return "inline"; }
+  absl::string_view name() const override { return "inline"; }
 
   // Run inlining on the given computation. Returns whether the computation was
   // changed.
diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/inliner_test.cc
index 32937b33b3..5695bc2420 100644
--- a/tensorflow/compiler/xla/service/inliner_test.cc
+++ b/tensorflow/compiler/xla/service/inliner_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index f33942d679..be59ce8281 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -130,7 +131,6 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kFft:
     case HloOpcode::kFusion:
     case HloOpcode::kGather:
-    case HloOpcode::kHostCompute:
     case HloOpcode::kLog:
     case HloOpcode::kLog1p:
     case HloOpcode::kMap:
@@ -497,7 +497,7 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput(
 
 bool InstructionFusion::MultiOutputFusionCreatesCycle(
     HloInstruction* producer, HloInstruction* consumer) {
-  return c_any_of(
+  return absl::c_any_of(
       consumer->operands(), [&](const HloInstruction* consumer_operand) {
         // The fusion algorithm traverses the HLO graph in reverse post order.
         // Thus `cosumers` is visited before its operands (including
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index f73ca9adf7..8489c3d9ad 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -36,7 +36,7 @@ class InstructionFusion : public HloPassInterface {
       bool may_duplicate = true)
       : is_expensive_(is_expensive), may_duplicate_(may_duplicate) {}
   ~InstructionFusion() override = default;
-  tensorflow::StringPiece name() const override { return "fusion"; }
+  absl::string_view name() const override { return "fusion"; }
 
   // Run instruction fusion on the given computation. Returns whether the
   // computation was changed (instructions were fused).
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 8652599dc6..581f8d2e92 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -12,12 +12,11 @@ cc_library(
     srcs = ["interpreter_transfer_manager.cc"],
     hdrs = ["interpreter_transfer_manager.h"],
     deps = [
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/compiler/xla/service/interpreter:platform_id",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
     alwayslink = True,  # Contains per-platform transfer manager registration
 )
@@ -32,8 +31,6 @@ cc_library(
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
@@ -54,6 +51,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor",
+        "@com_google_absl//absl/memory",
     ],
     alwayslink = True,  # Contains compiler registration
 )
@@ -79,7 +77,6 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo",
@@ -91,6 +88,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 9f8f4bda87..bb69cb9c47 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
@@ -69,8 +69,8 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
 
   // Create executable from only the Hlo module.
   std::unique_ptr<Executable> executable =
-      xla::MakeUnique<InterpreterExecutable>(std::move(hlo_module),
-                                             xla::MakeUnique<HloEvaluator>());
+      absl::make_unique<InterpreterExecutable>(
+          std::move(hlo_module), absl::make_unique<HloEvaluator>());
 
   return std::move(executable);
 }
@@ -103,11 +103,11 @@ HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
 static bool InitModule() {
   xla::Compiler::RegisterCompilerFactory(
       se::interpreter::kXlaInterpreterPlatformId, []() {
-        return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
+        return absl::make_unique<xla::interpreter::InterpreterCompiler>();
       });
   xla::ComputationPlacer::RegisterComputationPlacer(
       se::interpreter::kXlaInterpreterPlatformId,
-      []() { return xla::MakeUnique<xla::ComputationPlacer>(); });
+      []() { return absl::make_unique<xla::ComputationPlacer>(); });
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 8d40c08d55..2259dc1083 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/interpreter/executor.h"
diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
index d27cd7502f..7955ee5cf3 100644
--- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 
@@ -31,7 +31,7 @@ InterpreterTransferManager::InterpreterTransferManager()
 
 static std::unique_ptr<xla::TransferManager>
 CreateInterpreterTransferManager() {
-  return xla::MakeUnique<xla::InterpreterTransferManager>();
+  return absl::make_unique<xla::InterpreterTransferManager>();
 }
 
 static bool InitModule() {
diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.h b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.h
index 2b44f30821..b732230fdd 100644
--- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_INTERPRETER_TRANSFER_MANAGER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_INTERPRETER_TRANSFER_MANAGER_H_
 
 #include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
 #include "tensorflow/core/platform/macros.h"
@@ -33,4 +33,4 @@ class InterpreterTransferManager : public GenericTransferManager {
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_INTERPRETER_TRANSFER_MANAGER_H_
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index 42c2c28997..e57a9b3672 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/interpreter/executor.h"
 #include "tensorflow/stream_executor/device_options.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
@@ -70,8 +71,8 @@ port::StatusOr<StreamExecutor*> XlaInterpreterPlatform::GetExecutor(
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 XlaInterpreterPlatform::GetUncachedExecutor(
     const StreamExecutorConfig& config) {
-  auto executor = MakeUnique<StreamExecutor>(
-      this, MakeUnique<XlaInterpreterExecutor>(config.plugin_config));
+  auto executor = absl::make_unique<StreamExecutor>(
+      this, absl::make_unique<XlaInterpreterExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status{
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 805fdb2d5b..5741864282 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -26,9 +26,11 @@ limitations under the License.
 #include <string>
 #include <tuple>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -49,20 +51,12 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace xla {
 
-// For now moving only one API here, but we should have a single top level
-// anonymous namespace, instead of three or four spread all over this file.
-namespace {
-
-}  // namespace
-
 std::ostream& operator<<(std::ostream& out,
                          const LayoutConstraint& constraint) {
   out << constraint.ToString();
@@ -137,7 +131,7 @@ PointsToSet::BufferSet* LayoutConstraints::GetBufferSet(
   }
   auto& buffer_set =
       buffer_sets_cache_
-          .emplace(instruction, MakeUnique<PointsToSet::BufferSet>())
+          .emplace(instruction, absl::make_unique<PointsToSet::BufferSet>())
           .first->second;
   const auto& points_to_set = points_to_analysis_.GetPointsToSet(instruction);
   points_to_set.ForEachElement(
@@ -368,31 +362,27 @@ const ShapeLayout* LayoutConstraints::ResultLayout() const {
 
 string LayoutConstraints::ToString() const {
   string output;
-  tensorflow::strings::StrAppend(&output, "LayoutConstraints for computation ",
-                                 computation_->name(), ":\n");
+  absl::StrAppend(&output, "LayoutConstraints for computation ",
+                  computation_->name(), ":\n");
   for (auto* instruction : computation_->MakeInstructionPostOrder()) {
-    tensorflow::strings::StrAppend(&output, "  ", instruction->ToShortString(),
-                                   "\n");
+    absl::StrAppend(&output, "  ", instruction->ToShortString(), "\n");
     for (int64 i = 0; i < instruction->operand_count(); ++i) {
       if (OperandLayout(instruction, i) != nullptr) {
-        tensorflow::strings::StrAppend(
-            &output, "    operand (", i,
-            "): ", OperandLayout(instruction, i)->ToString(), "\n");
+        absl::StrAppend(&output, "    operand (", i,
+                        "): ", OperandLayout(instruction, i)->ToString(), "\n");
       }
     }
     for (const LogicalBuffer* buffer :
          points_to_analysis_.GetBuffersDefinedByInstruction(instruction)) {
       if (BufferLayout(*buffer) != nullptr) {
-        tensorflow::strings::StrAppend(
-            &output, "    ", buffer->ToString(), " : ",
-            LayoutUtil::HumanString(*BufferLayout(*buffer)), "\n");
+        absl::StrAppend(&output, "    ", buffer->ToString(), " : ",
+                        LayoutUtil::HumanString(*BufferLayout(*buffer)), "\n");
       }
     }
   }
 
   if (ResultLayout() != nullptr) {
-    tensorflow::strings::StrAppend(&output, "  => ", ResultLayout()->ToString(),
-                                   "\n");
+    absl::StrAppend(&output, "  => ", ResultLayout()->ToString(), "\n");
   }
   return output;
 }
@@ -909,7 +899,7 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
                       "Layout of instruction %s at index {%s} does not match "
                       "source LogicalBuffer %s: %s vs %s",
                       instruction->name().c_str(),
-                      tensorflow::str_util::Join(index, ",").c_str(),
+                      absl::StrJoin(index, ",").c_str(),
                       buffer->ToString().c_str(),
                       ShapeUtil::HumanStringWithLayout(instruction_subshape)
                           .c_str(),
@@ -1008,7 +998,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     //
     // TODO(jingyue): Other operations, such as kSlice and kConcat, can benefit
     // from assigning the same layout to input and output.
-    return MakeUnique<Layout>(output_layout);
+    return absl::make_unique<Layout>(output_layout);
   }
 
   if (instruction->opcode() == HloOpcode::kReshape) {
@@ -1031,13 +1021,13 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     *operand_shape.mutable_layout() =
         LayoutUtil::GetDefaultLayoutForShape(operand_shape);
     if (ShapeUtil::ReshapeIsBitcast(operand_shape, output_shape_with_layout)) {
-      return MakeUnique<Layout>(operand_shape.layout());
+      return absl::make_unique<Layout>(operand_shape.layout());
     }
     if (ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape)) {
       *operand_shape.mutable_layout() = output_layout;
       if (ShapeUtil::ReshapeIsBitcast(operand_shape,
                                       output_shape_with_layout)) {
-        return MakeUnique<Layout>(output_layout);
+        return absl::make_unique<Layout>(output_layout);
       }
     }
     auto aligned_operand_shape =
@@ -1046,7 +1036,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
       auto operand_layout = aligned_operand_shape.value().layout();
       TF_CHECK_OK(
           LayoutUtil::ValidateLayoutForShape(operand_layout, operand_shape));
-      return MakeUnique<Layout>(operand_layout);
+      return absl::make_unique<Layout>(operand_layout);
     }
   }
 
@@ -1062,7 +1052,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     Layout operand_layout = LayoutUtil::MakeLayout(new_minor_to_major);
     TF_CHECK_OK(
         LayoutUtil::ValidateLayoutForShape(operand_layout, operand->shape()));
-    return MakeUnique<Layout>(operand_layout);
+    return absl::make_unique<Layout>(operand_layout);
   }
 
   return nullptr;
@@ -1080,7 +1070,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
       !ShapeUtil::IsScalar(operand->shape()) &&
       ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(user->shape())) {
     // Assign users the same layout as the operand.
-    return MakeUnique<Layout>(operand_layout);
+    return absl::make_unique<Layout>(operand_layout);
   }
 
   if (user->opcode() == HloOpcode::kReshape) {
@@ -1103,13 +1093,13 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     *output_shape.mutable_layout() =
         LayoutUtil::GetDefaultLayoutForShape(output_shape);
     if (ShapeUtil::ReshapeIsBitcast(output_shape, operand_shape_with_layout)) {
-      return MakeUnique<Layout>(output_shape.layout());
+      return absl::make_unique<Layout>(output_shape.layout());
     }
     if (ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape)) {
       *output_shape.mutable_layout() = operand_layout;
       if (ShapeUtil::ReshapeIsBitcast(output_shape,
                                       operand_shape_with_layout)) {
-        return MakeUnique<Layout>(operand_layout);
+        return absl::make_unique<Layout>(operand_layout);
       }
     }
     auto aligned_user_shape =
@@ -1118,7 +1108,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
       auto user_layout = aligned_user_shape.value().layout();
       TF_CHECK_OK(
           LayoutUtil::ValidateLayoutForShape(user_layout, output_shape));
-      return MakeUnique<Layout>(user_layout);
+      return absl::make_unique<Layout>(user_layout);
     }
   }
 
@@ -1134,7 +1124,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     }
     Layout user_layout = LayoutUtil::MakeLayout(new_minor_to_major);
     TF_CHECK_OK(LayoutUtil::ValidateLayoutForShape(user_layout, user->shape()));
-    return MakeUnique<Layout>(user_layout);
+    return absl::make_unique<Layout>(user_layout);
   }
 
   return nullptr;
@@ -1400,8 +1390,8 @@ StatusOr<Layout> InferArrayLayout(
       return FailedPrecondition(
           "Array at index {%s} in instruction %s aliases buffers %s "
           "and %s which have different layouts",
-          tensorflow::str_util::Join(index, ",").c_str(),
-          instruction->name().c_str(), source_buffers[0]->ToString().c_str(),
+          absl::StrJoin(index, ",").c_str(), instruction->name().c_str(),
+          source_buffers[0]->ToString().c_str(),
           source_buffer->ToString().c_str());
     }
   }
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index f9e8dbea2f..3e000ec2df 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -297,7 +297,7 @@ class LayoutAssignment : public HloPassInterface {
       ComputationLayout* entry_computation_layout,
       ChannelLayoutConstraints* channel_constraints = nullptr);
   ~LayoutAssignment() override {}
-  tensorflow::StringPiece name() const override { return "layout-assignment"; }
+  absl::string_view name() const override { return "layout-assignment"; }
 
   // Assign layouts to the given module. Returns whether the module was changed
   // (any layouts were changed).
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index a16fa75e30..6d05fa5fe2 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -59,7 +59,7 @@ class LayoutAssignmentTest : public HloTestBase {
     EXPECT_IS_OK(layout_assignment.Run(module).status());
   }
 
-  std::vector<int64> LayoutOf(HloModule* module, tensorflow::StringPiece name) {
+  std::vector<int64> LayoutOf(HloModule* module, absl::string_view name) {
     auto minor_to_major =
         FindInstruction(module, name)->shape().layout().minor_to_major();
     return std::vector<int64>(minor_to_major.begin(), minor_to_major.end());
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index cdd3daf73b..fc3289f30d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -38,6 +38,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:logical_buffer",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
 )
@@ -69,6 +70,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:name_uniquer",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
         "@llvm//:support",
         "@llvm//:target",
@@ -88,6 +90,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
 )
@@ -103,6 +107,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
 )
@@ -133,9 +138,7 @@ cc_library(
         ":llvm_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
         "@llvm//:core",
@@ -193,6 +196,8 @@ cc_library(
         "//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter",
         "//tensorflow/compiler/xla/service/gpu:partition_assignment",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@llvm//:core",
     ],
 )
@@ -219,7 +224,7 @@ cc_library(
     deps = [
         ":llvm_loop",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
 )
@@ -230,6 +235,7 @@ cc_library(
     hdrs = ["buffer_assignment_util.h"],
     deps = [
         "//tensorflow/compiler/xla/service:buffer_assignment",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
index fe9eab93aa..8d9fa99d82 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_ALIAS_ANALYSIS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_ALIAS_ANALYSIS_H_
 
+#include "absl/strings/str_cat.h"
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -23,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace llvm_ir {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
index 4eb5d9fb47..bdce4a171b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
+#include "absl/strings/str_cat.h"
 
 namespace xla {
 namespace llvm_ir {
@@ -48,7 +49,7 @@ string ConstantBufferAllocationToGlobalName(
       c = '_';
     }
   }
-  return tensorflow::strings::StrCat("buffer_for_", instr_name);
+  return absl::StrCat("buffer_for_", instr_name);
 }
 
 const Literal& LiteralForConstantAllocation(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index 27fbb11e2e..ad350613dd 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -40,7 +40,7 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
     const Shape& update_shape, const ElementGenerator& start_indices_generator,
     bool is_signed, ElementGenerator update_array_generator,
     const IrArray& output_array, const gpu::LaunchDimensions* launch_dimensions,
-    tensorflow::StringPiece name, llvm::IRBuilder<>* b) {
+    absl::string_view name, llvm::IRBuilder<>* b) {
   const Shape& output_shape = output_array.GetShape();
 
   // Read start indices from start_indices_generator.
@@ -101,8 +101,7 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
 
 Status EmitDynamicUpdateSliceInPlace(
     tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
-    const IrArray& output_array, tensorflow::StringPiece name,
-    llvm::IRBuilder<>* b) {
+    const IrArray& output_array, absl::string_view name, llvm::IRBuilder<>* b) {
   VLOG(2) << "EmitDynamicUpdateSliceInPlace for " << name;
 
   // No need to use operand_arrays[0], the input array of the
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
index 3502577d23..e1631a62ae 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
@@ -65,8 +65,7 @@ inline bool CanEmitFusedDynamicUpdateSliceInPlace(
 // modify the input/output buffer without touching any of the other elements.
 Status EmitDynamicUpdateSliceInPlace(
     tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
-    const IrArray& output_array, tensorflow::StringPiece name,
-    llvm::IRBuilder<>* b);
+    const IrArray& output_array, absl::string_view name, llvm::IRBuilder<>* b);
 
 // Given a loop-fusion node whose root is a dynamic-update-slice op whose
 // array-to-be-updated and output share the same buffer slice, emits
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index 2b6caee6aa..6971220022 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -342,9 +342,9 @@ llvm::Value* IrArray::Index::Linearize(
   return logical_linear_index;
 }
 
-llvm::Value* IrArray::EmitArrayElementAddress(
-    const IrArray::Index& index, llvm::IRBuilder<>* b,
-    tensorflow::StringPiece name) const {
+llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index,
+                                              llvm::IRBuilder<>* b,
+                                              absl::string_view name) const {
   if (ShapeUtil::IsScalar(*shape_)) {
     // Special handling of scalars: a scalar pretends to have the same value for
     // every index, thus effectively implementing broadcasting of its value
@@ -402,7 +402,7 @@ void IrArray::AnnotateLoadStoreInstructionWithMetadata(
 
 llvm::Value* IrArray::EmitReadArrayElement(const Index& index,
                                            llvm::IRBuilder<>* b,
-                                           tensorflow::StringPiece name) const {
+                                           absl::string_view name) const {
   llvm::Value* element_address = EmitArrayElementAddress(index, b, name);
   llvm::LoadInst* load = b->CreateLoad(element_address);
   AnnotateLoadStoreInstructionWithMetadata(load);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index 28ca793e3e..e913c109b3 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -19,12 +19,13 @@ limitations under the License.
 #include <map>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/string_view.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -81,7 +82,7 @@ class IrArray {
         }
       }
       CHECK_NE(index_type_, nullptr);
-      CHECK(c_all_of(multidim, [&](llvm::Value* v) {
+      CHECK(absl::c_all_of(multidim, [&](llvm::Value* v) {
         return index_type_ == v->getType();
       }));
     }
@@ -240,7 +241,7 @@ class IrArray {
   // The optional name is useful for debugging when looking at
   // the emitted LLVM IR.
   llvm::Value* EmitArrayElementAddress(const Index& index, llvm::IRBuilder<>* b,
-                                       tensorflow::StringPiece name = "") const;
+                                       absl::string_view name = "") const;
 
   // Attach metadata this IrArray instance knows about to "instruction".
   void AnnotateLoadStoreInstructionWithMetadata(
@@ -254,7 +255,7 @@ class IrArray {
   // The optional name is useful for debugging when looking at
   // the emitted LLVM IR.
   llvm::Value* EmitReadArrayElement(const Index& index, llvm::IRBuilder<>* b,
-                                    tensorflow::StringPiece name = "") const;
+                                    absl::string_view name = "") const;
 
   // Emit IR to write the given value to the array element at the given index.
   void EmitWriteArrayElement(const Index& index, llvm::Value* value,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index b79567369a..bd0139f85b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 namespace xla {
 Status KernelSupportLibrary::For(
-    tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+    absl::string_view name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step,
     const std::function<Status(llvm::Value*, bool)>& for_body_generator) {
   return If(b_->CreateICmpSLT(start, end), [&]() -> Status {
@@ -30,7 +30,7 @@ Status KernelSupportLibrary::For(
 }
 
 Status KernelSupportLibrary::For(
-    tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+    absl::string_view name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step, bool peel_first_iteration,
     const std::function<Status(llvm::Value*, llvm::Value*)>&
         for_body_generator) {
@@ -56,7 +56,7 @@ Status KernelSupportLibrary::For(
 }
 
 Status KernelSupportLibrary::If(
-    tensorflow::StringPiece name, llvm::Value* condition,
+    absl::string_view name, llvm::Value* condition,
     const std::function<Status()>& true_block_generator,
     const std::function<Status()>& false_block_generator) {
   llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(condition, name, b_);
@@ -70,7 +70,7 @@ Status KernelSupportLibrary::If(
 
 void KernelSupportLibrary::EmitAndCallOutlinedKernel(
     bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
-    tensorflow::StringPiece kernel_name,
+    absl::string_view kernel_name,
     KernelSupportLibrary::ArgumentVector arguments,
     const std::function<void(KernelSupportLibrary::ArgumentVector)>&
         kernel_body_generator) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index b00f903d56..b152cf9275 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -13,17 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_SUPPORT_LIBRARY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_SUPPORT_LIBRARY_H_
 
 #include <string>
 
+#include "absl/strings/string_view.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace xla {
 // A thin wrapper around llvm_loop.h to make code generating structured control
@@ -49,13 +49,13 @@ class KernelSupportLibrary {
   //       `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/false)`;
   //   }
   Status For(
-      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<Status(llvm::Value* ind_var,
                                  bool is_first_iteration)>& for_body_generator);
 
   void ForReturnVoid(
-      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
           for_body_generator) {
@@ -67,7 +67,7 @@ class KernelSupportLibrary {
                  }));
   }
 
-  Status For(tensorflow::StringPiece name, int64 start, int64 end, int64 step,
+  Status For(absl::string_view name, int64 start, int64 end, int64 step,
              const std::function<Status(llvm::Value* ind_var,
                                         bool is_first_iteration)>&
                  for_body_generator) {
@@ -77,7 +77,7 @@ class KernelSupportLibrary {
   }
 
   void ForReturnVoid(
-      tensorflow::StringPiece name, int64 start, int64 end, int64 step,
+      absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
           for_body_generator) {
     ForReturnVoid(name, /*start=*/b_->getInt64(start),
@@ -99,13 +99,13 @@ class KernelSupportLibrary {
   //   for (i64 i = `start`; i s< `end`; i += `step`)
   //     `for_body_generator(/*ind_var=*/,i,
   //                         /*is_first_iteration=*/,(i != `start`))`;
-  Status For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+  Status For(absl::string_view name, llvm::Value* start, llvm::Value* end,
              llvm::Value* step, bool peel_first_iteration,
              const std::function<Status(llvm::Value* ind_var,
                                         llvm::Value* is_first_iteration)>&
                  for_body_generator);
 
-  void ForReturnVoid(tensorflow::StringPiece name, llvm::Value* start,
+  void ForReturnVoid(absl::string_view name, llvm::Value* start,
                      llvm::Value* end, llvm::Value* step,
                      bool peel_first_iteration,
                      const std::function<void(llvm::Value* ind_var,
@@ -119,7 +119,7 @@ class KernelSupportLibrary {
         }));
   }
 
-  Status For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+  Status For(absl::string_view name, llvm::Value* start, llvm::Value* end,
              int64 step, bool peel_first_iteration,
              const std::function<Status(llvm::Value* ind_var,
                                         llvm::Value* is_first_iteration)>&
@@ -129,7 +129,7 @@ class KernelSupportLibrary {
                peel_first_iteration, for_body_generator);
   }
 
-  void ForReturnVoid(tensorflow::StringPiece name, llvm::Value* start,
+  void ForReturnVoid(absl::string_view name, llvm::Value* start,
                      llvm::Value* end, int64 step, bool peel_first_iteration,
                      const std::function<void(llvm::Value* ind_var,
                                               llvm::Value* is_first_iteration)>&
@@ -140,7 +140,7 @@ class KernelSupportLibrary {
   }
 
   Status For(
-      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
     return For(name, start, end, step,
@@ -151,7 +151,7 @@ class KernelSupportLibrary {
   }
 
   void ForReturnVoid(
-      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
     ForReturnVoid(name, start, end, step,
@@ -162,8 +162,7 @@ class KernelSupportLibrary {
   }
 
   Status For(
-      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
-      int64 step,
+      absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
     return For(name, start, end, llvm::ConstantInt::get(start->getType(), step),
                /*peel_first_iteration=*/false,
@@ -173,8 +172,7 @@ class KernelSupportLibrary {
   }
 
   void ForReturnVoid(
-      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
-      int64 step,
+      absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
     ForReturnVoid(name, start, end,
                   llvm::ConstantInt::get(start->getType(), step),
@@ -182,7 +180,7 @@ class KernelSupportLibrary {
   }
 
   Status For(
-      tensorflow::StringPiece name, int64 start, int64 end, int64 step,
+      absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
     return For(name, /*start=*/b_->getInt64(start),
                /*end=*/b_->getInt64(end),
@@ -190,7 +188,7 @@ class KernelSupportLibrary {
   }
 
   void ForReturnVoid(
-      tensorflow::StringPiece name, int64 start, int64 end, int64 step,
+      absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
     ForReturnVoid(name, /*start=*/b_->getInt64(start),
                   /*end=*/b_->getInt64(end),
@@ -203,7 +201,7 @@ class KernelSupportLibrary {
   //     `true_block_generator()`;
   //   else
   //      `false_block_generator()`;
-  Status If(tensorflow::StringPiece name, llvm::Value* condition,
+  Status If(absl::string_view name, llvm::Value* condition,
             const std::function<Status()>& true_block_generator,
             const std::function<Status()>& false_block_generator =
                 []() -> Status { return Status::OK(); });
@@ -222,7 +220,7 @@ class KernelSupportLibrary {
     IfReturnVoid("", condition, true_block_generator, false_block_generator);
   }
 
-  void IfReturnVoid(tensorflow::StringPiece name, llvm::Value* condition,
+  void IfReturnVoid(absl::string_view name, llvm::Value* condition,
                     const std::function<void()>& true_block_generator,
                     const std::function<void()>& false_block_generator = []() {
                     }) {
@@ -259,13 +257,13 @@ class KernelSupportLibrary {
   // Currently we only support at most one nullptr value in `arguments`.
   static void EmitAndCallOutlinedKernel(
       bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
-      tensorflow::StringPiece kernel_name, ArgumentVector arguments,
+      absl::string_view kernel_name, ArgumentVector arguments,
       const std::function<void(ArgumentVector)>& kernel_body_generator);
 
   // Thin wrappers around the more general EmitAndCallOutlinedKernel above.
   static void EmitAndCallOutlinedKernel(
       bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
-      tensorflow::StringPiece kernel_name, llvm::Value* arg0, llvm::Value* arg1,
+      absl::string_view kernel_name, llvm::Value* arg0, llvm::Value* arg1,
       llvm::Value* arg2,
       const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*)>&
           kernel_body_generator) {
@@ -278,7 +276,7 @@ class KernelSupportLibrary {
 
   static void EmitAndCallOutlinedKernel(
       bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
-      tensorflow::StringPiece kernel_name, llvm::Value* arg0, llvm::Value* arg1,
+      absl::string_view kernel_name, llvm::Value* arg0, llvm::Value* arg1,
       llvm::Value* arg2, llvm::Value* arg3,
       const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*,
                                llvm::Value*)>& kernel_body_generator) {
@@ -296,4 +294,4 @@ class KernelSupportLibrary {
 };
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_KERNEL_SUPPORT_LIBRARY_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index 35b3941272..cb4d1db997 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -55,10 +55,10 @@ Shape MergeDimensions(tensorflow::gtl::ArraySlice<size_t> segs,
 }
 }  // namespace
 
-tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(
-    const Shape& a, const Shape& b) {
+absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
+                                                     const Shape& b) {
   if (!ShapeUtil::CompatibleIgnoringElementType(a, b)) {
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
 
   std::vector<int64> perm(a.dimensions().size());
@@ -88,7 +88,7 @@ tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(
     return dims_021;
   }
 
-  return tensorflow::gtl::nullopt;
+  return absl::nullopt;
 }
 
 IrArray::Index GetUnreducedOutputIndex(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index ccb9b8ba3e..8bd06c42c3 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -36,8 +36,8 @@ namespace llvm_ir {
 
 // If `b` is a 0-2-1 transpose of `a` in 0-1-2, return the dimensions for the
 // reduced shape of `b` or the 0-2-1 shape.
-tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
-                                                                const Shape& b);
+absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
+                                                     const Shape& b);
 
 // Return the unreduced output index corresponding to the given reduced output
 // index.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index ba7f94834c..978fa5b453 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
@@ -25,14 +26,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 namespace llvm_ir {
 
-ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
+ForLoop::ForLoop(absl::string_view prefix, absl::string_view suffix,
                  llvm::Value* start_index, llvm::Value* end_index,
                  llvm::Value* step, UnrollMode unroll_mode,
                  bool prevent_vectorization)
@@ -46,9 +46,9 @@ ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
       prevent_vectorization_(prevent_vectorization) {}
 
 /* static */ std::unique_ptr<ForLoop> ForLoop::EmitForLoop(
-    tensorflow::StringPiece prefix, llvm::Value* start_index,
-    llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* b,
-    UnrollMode unroll_mode, bool prevent_vectorization) {
+    absl::string_view prefix, llvm::Value* start_index, llvm::Value* end_index,
+    llvm::Value* step, llvm::IRBuilder<>* b, UnrollMode unroll_mode,
+    bool prevent_vectorization) {
   std::unique_ptr<ForLoop> loop(new ForLoop(prefix, /*suffix=*/"", start_index,
                                             end_index, step, unroll_mode,
                                             prevent_vectorization));
@@ -168,16 +168,16 @@ std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(llvm::IRBuilder<>* b) {
   return result;
 }
 
-string ForLoop::GetQualifiedName(tensorflow::StringPiece name) {
+string ForLoop::GetQualifiedName(absl::string_view name) {
   return llvm_ir::IrName(prefix_, llvm_ir::IrName(name, suffix_));
 }
 
-llvm::BasicBlock* ForLoop::CreateLoopBB(tensorflow::StringPiece name,
+llvm::BasicBlock* ForLoop::CreateLoopBB(absl::string_view name,
                                         llvm::IRBuilder<>* b) {
   return CreateBasicBlock(insert_before_bb_, GetQualifiedName(name), b);
 }
 
-std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
+std::unique_ptr<ForLoop> ForLoopNest::AddLoop(absl::string_view suffix,
                                               llvm::Value* start_index,
                                               llvm::Value* end_index,
                                               UnrollMode unroll_mode,
@@ -186,12 +186,9 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
                  unroll_mode, prevent_vectorization);
 }
 
-std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
-                                              llvm::Value* start_index,
-                                              llvm::Value* end_index,
-                                              llvm::Value* stride,
-                                              UnrollMode unroll_mode,
-                                              bool prevent_vectorization) {
+std::unique_ptr<ForLoop> ForLoopNest::AddLoop(
+    absl::string_view suffix, llvm::Value* start_index, llvm::Value* end_index,
+    llvm::Value* stride, UnrollMode unroll_mode, bool prevent_vectorization) {
   if (inner_loop_body_bb_ != nullptr) {
     // Create this loop inside the previous one.
     b_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
@@ -216,7 +213,7 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
 
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
                                               int64 end_index,
-                                              tensorflow::StringPiece suffix,
+                                              absl::string_view suffix,
                                               UnrollMode unroll_mode,
                                               bool prevent_vectorization) {
   CHECK_LE(start_index, end_index);
@@ -227,7 +224,7 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
 
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
                                               int64 end_index, int64 stride,
-                                              tensorflow::StringPiece suffix,
+                                              absl::string_view suffix,
                                               UnrollMode unroll_mode,
                                               bool prevent_vectorization) {
   CHECK_LE(start_index, end_index);
@@ -238,7 +235,7 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
 }
 
 IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape,
-                                             tensorflow::StringPiece suffix) {
+                                             absl::string_view suffix) {
   std::vector<int64> dimensions(ShapeUtil::Rank(shape));
   std::iota(dimensions.begin(), dimensions.end(), 0);
   return AddLoopsForShapeOnDimensions(shape, dimensions, suffix);
@@ -246,14 +243,14 @@ IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape,
 
 IrArray::Index ForLoopNest::AddLoopsForShapeOnDimensions(
     const Shape& shape, tensorflow::gtl::ArraySlice<int64> dimensions,
-    tensorflow::StringPiece suffix) {
+    absl::string_view suffix) {
   llvm_ir::IrArray::Index index(index_type_, shape.dimensions_size());
   for (int64 dimension : dimensions) {
     std::unique_ptr<llvm_ir::ForLoop> loop = AddLoop(
         /*start_index=*/0,
         /*end_index=*/shape.dimensions(dimension),
         /*suffix=*/
-        llvm_ir::IrName(suffix, tensorflow::strings::StrCat(dimension)));
+        llvm_ir::IrName(suffix, absl::StrCat(dimension)));
     index[dimension] = loop->GetIndVarValue();
   }
   return index;
@@ -261,7 +258,7 @@ IrArray::Index ForLoopNest::AddLoopsForShapeOnDimensions(
 
 IrArray::Index ForLoopNest::EmitOperandArrayLoopNest(
     const llvm_ir::IrArray& operand_array, int64 dimension_to_skip,
-    tensorflow::StringPiece name_suffix) {
+    absl::string_view name_suffix) {
   // Prepares the dimension list we will use to emit the loop nest. Outermost
   // loops are added first. Add loops in major-to-minor order, and skip the
   // 'dimension_to_skip' dimension.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index a4fed5c8dc..62aa15fe2d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -19,15 +19,15 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -78,7 +78,7 @@ class ForLoop {
   // `unroll_mode` specifies the desired LLVM unrolling behavior for generated
   //  loop.
   static std::unique_ptr<ForLoop> EmitForLoop(
-      tensorflow::StringPiece prefix, llvm::Value* start_index,
+      absl::string_view prefix, llvm::Value* start_index,
       llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* b,
       UnrollMode unroll_mode = llvm_ir::UnrollMode::kDefaultUnroll,
       bool prevent_vectorization = false);
@@ -133,19 +133,18 @@ class ForLoop {
   // Allow ForLoopNest to call this private constructor.
   friend class ForLoopNest;
 
-  ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
+  ForLoop(absl::string_view prefix, absl::string_view suffix,
           llvm::Value* start_index, llvm::Value* end_index, llvm::Value* step,
           UnrollMode unroll_mode, bool prevent_vectorization);
 
   // Emit the loop at the insert point of the builder.
   void Emit(llvm::IRBuilder<>* b);
 
-  llvm::BasicBlock* CreateLoopBB(tensorflow::StringPiece name,
-                                 llvm::IRBuilder<>* b);
+  llvm::BasicBlock* CreateLoopBB(absl::string_view name, llvm::IRBuilder<>* b);
 
   // Creates a name for an LLVM construct, appending prefix_ and suffix_, if
   // they are set.
-  string GetQualifiedName(tensorflow::StringPiece name);
+  string GetQualifiedName(absl::string_view name);
 
   // Return a list of metadata nodes that should be associated with the
   // llvm::Loop for this `ForLoop`.
@@ -182,7 +181,7 @@ class ForLoopNest {
     SetIndexType(index_ty);
   }
 
-  ForLoopNest(tensorflow::StringPiece name, llvm::IRBuilder<>* b,
+  ForLoopNest(absl::string_view name, llvm::IRBuilder<>* b,
               llvm::Type* index_ty = nullptr)
       : name_(std::string(name)),
         outer_loop_preheader_bb_(nullptr),
@@ -197,14 +196,14 @@ class ForLoopNest {
   // been added then emit loop inside the body of the last added loop.
   // unroll_mode is used to emit metadata that controls LLVM unrolling.
   std::unique_ptr<ForLoop> AddLoop(
-      tensorflow::StringPiece suffix, llvm::Value* start_index,
+      absl::string_view suffix, llvm::Value* start_index,
       llvm::Value* end_index, llvm::Value* stride,
       UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll,
       bool prevent_vectorization = false);
 
   // Like the above, except that it defaults to a stride of one.
   std::unique_ptr<ForLoop> AddLoop(
-      tensorflow::StringPiece suffix, llvm::Value* start_index,
+      absl::string_view suffix, llvm::Value* start_index,
       llvm::Value* end_index,
       UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll,
       bool prevent_vectorization = false);
@@ -213,13 +212,13 @@ class ForLoopNest {
   // end index are constant.
   std::unique_ptr<ForLoop> AddLoop(
       int64 start_index, int64 end_index, int64 stride,
-      tensorflow::StringPiece suffix,
+      absl::string_view suffix,
       UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll,
       bool prevent_vectorization = false);
 
   // Like the above, except that it defaults to a stride of one.
   std::unique_ptr<ForLoop> AddLoop(
-      int64 start_index, int64 end_index, tensorflow::StringPiece suffix,
+      int64 start_index, int64 end_index, absl::string_view suffix,
       UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll,
       bool prevent_vectorization = false);
 
@@ -234,8 +233,7 @@ class ForLoopNest {
   // within the shape. One possible order for that sequence would be:
   //
   //   (0,0), (0,1), (0,2), (1,0), (1,1), (1,2)
-  IrArray::Index AddLoopsForShape(const Shape& shape,
-                                  tensorflow::StringPiece suffix);
+  IrArray::Index AddLoopsForShape(const Shape& shape, absl::string_view suffix);
 
   // Add a loop for each dimension in "dimensions". "suffix" is the
   // name suffix of the indvar and basic blocks in this new loop nest.
@@ -245,7 +243,7 @@ class ForLoopNest {
   // dimension that is not in "dimensions".
   IrArray::Index AddLoopsForShapeOnDimensions(
       const Shape& shape, tensorflow::gtl::ArraySlice<int64> dimensions,
-      tensorflow::StringPiece suffix);
+      absl::string_view suffix);
 
   // Emits a series of nested loops for iterating over an operand array. Loops
   // are constructed in major to minor dimension layout order. No loop is
@@ -256,7 +254,7 @@ class ForLoopNest {
   // basic blocks) constructed by this method.
   IrArray::Index EmitOperandArrayLoopNest(const llvm_ir::IrArray& operand_array,
                                           int64 dimension_to_skip,
-                                          tensorflow::StringPiece name_suffix);
+                                          absl::string_view name_suffix);
 
   // Convenience methods which return particular basic blocks of the outermost
   // or innermost loops. These methods return nullptr if no loops have been
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index e6126881af..f0db2a3761 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/MDBuilder.h"
@@ -34,8 +36,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -61,7 +61,7 @@ string AsString(const std::string& str) {
   return string(str.data(), str.length());
 }
 
-llvm::StringRef AsStringRef(tensorflow::StringPiece str) {
+llvm::StringRef AsStringRef(absl::string_view str) {
   return llvm::StringRef(str.data(), str.size());
 }
 
@@ -262,15 +262,17 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
 }
 
 llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
-                                            tensorflow::StringPiece name,
+                                            absl::string_view name,
                                             llvm::IRBuilder<>* b,
                                             int alignment) {
   return EmitAllocaAtFunctionEntryWithCount(type, nullptr, name, b, alignment);
 }
 
-llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(
-    llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name,
-    llvm::IRBuilder<>* b, int alignment) {
+llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type,
+                                                     llvm::Value* element_count,
+                                                     absl::string_view name,
+                                                     llvm::IRBuilder<>* b,
+                                                     int alignment) {
   llvm::IRBuilder<>::InsertPoint insert_point = b->saveIP();
   llvm::Function* function = b->GetInsertBlock()->getParent();
   b->SetInsertPoint(&function->getEntryBlock(),
@@ -285,7 +287,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(
 }
 
 llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before,
-                                   tensorflow::StringPiece name,
+                                   absl::string_view name,
                                    llvm::IRBuilder<>* b) {
   return llvm::BasicBlock::Create(
       /*Context=*/b->getContext(),
@@ -294,27 +296,25 @@ llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before,
       /*InsertBefore*/ insert_before);
 }
 
-LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name,
+LlvmIfData EmitIfThenElse(llvm::Value* condition, absl::string_view name,
                           llvm::IRBuilder<>* b, bool emit_else) {
   llvm_ir::LlvmIfData if_data;
   if_data.if_block = b->GetInsertBlock();
   if_data.true_block =
-      CreateBasicBlock(nullptr, tensorflow::strings::StrCat(name, "-true"), b);
+      CreateBasicBlock(nullptr, absl::StrCat(name, "-true"), b);
   if_data.false_block =
-      emit_else ? CreateBasicBlock(
-                      nullptr, tensorflow::strings::StrCat(name, "-false"), b)
+      emit_else ? CreateBasicBlock(nullptr, absl::StrCat(name, "-false"), b)
                 : nullptr;
 
   // Add a terminator to the if block, if necessary.
   if (if_data.if_block->getTerminator() == nullptr) {
     b->SetInsertPoint(if_data.if_block);
-    if_data.after_block = CreateBasicBlock(
-        nullptr, tensorflow::strings::StrCat(name, "-after"), b);
+    if_data.after_block =
+        CreateBasicBlock(nullptr, absl::StrCat(name, "-after"), b);
     b->CreateBr(if_data.after_block);
   } else {
     if_data.after_block = if_data.if_block->splitBasicBlock(
-        b->GetInsertPoint(),
-        AsStringRef(tensorflow::strings::StrCat(name, "-after")));
+        b->GetInsertPoint(), AsStringRef(absl::StrCat(name, "-after")));
   }
 
   // Our basic block should now end with an unconditional branch.  Remove it;
@@ -413,14 +413,14 @@ string IrName(string a) {
   return a;
 }
 
-string IrName(tensorflow::StringPiece a, tensorflow::StringPiece b) {
+string IrName(absl::string_view a, absl::string_view b) {
   if (!a.empty() && !b.empty()) {
-    return IrName(tensorflow::strings::StrCat(a, ".", b));
+    return IrName(absl::StrCat(a, ".", b));
   }
-  return IrName(tensorflow::strings::StrCat(a, b));
+  return IrName(absl::StrCat(a, b));
 }
 
-string IrName(const HloInstruction* a, tensorflow::StringPiece b) {
+string IrName(const HloInstruction* a, absl::string_view b) {
   return IrName(a->name(), b);
 }
 
@@ -556,7 +556,7 @@ std::map<int, llvm::MDNode*> MergeMetadata(
   return result;
 }
 
-static string GetProcessUniqueIrFileName(tensorflow::StringPiece prefix) {
+static string GetProcessUniqueIrFileName(absl::string_view prefix) {
   static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
   static NameUniquer* uniquer = new NameUniquer(/*separator=*/"-");
 
@@ -584,18 +584,16 @@ Status DumpIRToDirectory(const string& directory_name,
   // XlaJitCompiledCpuFunction::Compile.  Avoid overwriting IR files previously
   // dumped from the same process in such cases.
   string unique_and_safe_file_name = GetProcessUniqueIrFileName(
-      tensorflow::strings::StrCat("ir-", SanitizeFileName(hlo_module_name), "-",
-                                  optimized ? "with" : "no", "-opt"));
+      absl::StrCat("ir-", SanitizeFileName(hlo_module_name), "-",
+                   optimized ? "with" : "no", "-opt"));
 
   string ir_file_name = tensorflow::io::JoinPath(
-      directory_name,
-      tensorflow::strings::StrCat(unique_and_safe_file_name, ".ll"));
+      directory_name, absl::StrCat(unique_and_safe_file_name, ".ll"));
 
   // For some models the embedded constants can be huge, so also dump the module
   // with the constants stripped to get IR that is easier to manipulate.
   string ir_no_constant_initializers_file_name = tensorflow::io::JoinPath(
-      directory_name,
-      tensorflow::strings::StrCat(unique_and_safe_file_name, "-noconst.ll"));
+      directory_name, absl::StrCat(unique_and_safe_file_name, "-noconst.ll"));
 
   TF_RETURN_IF_ERROR(CreateAndWriteStringToFile(
       directory_name, ir_file_name, DumpModuleToString(llvm_module)));
@@ -607,8 +605,7 @@ Status DumpIRToDirectory(const string& directory_name,
 llvm::Function* CreateFunction(llvm::FunctionType* function_type,
                                llvm::GlobalValue::LinkageTypes linkage,
                                bool enable_fast_math, bool optimize_for_size,
-                               tensorflow::StringPiece name,
-                               llvm::Module* module) {
+                               absl::string_view name, llvm::Module* module) {
   llvm::Function* function =
       llvm::Function::Create(function_type, linkage, AsStringRef(name), module);
   function->setCallingConv(llvm::CallingConv::C);
@@ -638,7 +635,7 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
     fake_argv_storage.push_back("");
     for (const auto& it : options) {
       // Skip options the XLA backend itself consumes.
-      if (!tensorflow::str_util::StartsWith(it.first, "xla_")) {
+      if (!absl::StartsWith(it.first, "xla_")) {
         if (it.second.empty()) {
           fake_argv_storage.push_back(it.first);
         } else {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 0958398534..dde50e19d1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -47,11 +47,11 @@ namespace llvm_ir {
 // Convert a std::string (used by LLVM's interfaces) to string.
 string AsString(const std::string& str);
 
-// Convert a tensorflow::StringPiece to a llvm::StringRef. Note: both
-// tensorflow::StringPiece and llvm::StringRef are non-owning pointers into a
+// Convert a absl::string_view to a llvm::StringRef. Note: both
+// absl::string_view and llvm::StringRef are non-owning pointers into a
 // string in memory. This method is used to feed strings to LLVM
 // & Clang APIs that expect llvm::StringRef.
-llvm::StringRef AsStringRef(tensorflow::StringPiece str);
+llvm::StringRef AsStringRef(absl::string_view str);
 
 template <typename T>
 llvm::ArrayRef<T> AsArrayRef(const std::vector<T>& vec) {
@@ -88,8 +88,8 @@ string DumpModuleToString(const llvm::Module& module);
 //   - removing all '%'s.
 //
 string IrName(string a);
-string IrName(tensorflow::StringPiece a, tensorflow::StringPiece b);
-string IrName(const HloInstruction* a, tensorflow::StringPiece b = "");
+string IrName(absl::string_view a, absl::string_view b);
+string IrName(const HloInstruction* a, absl::string_view b = "");
 
 // Removes special characters from a function name.
 //
@@ -164,21 +164,23 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
 // This can be useful to avoid e.g. executing an alloca every time
 // through a loop.
 llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
-                                            tensorflow::StringPiece name,
+                                            absl::string_view name,
                                             llvm::IRBuilder<>* b,
                                             int alignment = 0);
 
 // As EmitAllocaAtFunctionEntry, but allocates element_count entries
 // instead of a single element.
-llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(
-    llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name,
-    llvm::IRBuilder<>* b, int alignment = 0);
+llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type,
+                                                     llvm::Value* element_count,
+                                                     absl::string_view name,
+                                                     llvm::IRBuilder<>* b,
+                                                     int alignment = 0);
 
 // Creates a basic block with the same context and function as for the
 // builder. Inserts at the end of the function if insert_before is
 // null.
 llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before,
-                                   tensorflow::StringPiece name,
+                                   absl::string_view name,
                                    llvm::IRBuilder<>* b);
 
 // Struct with data on a conditional branch in a diamond shape created
@@ -210,7 +212,7 @@ struct LlvmIfData {
 // Currently the insertion point of the builder must be a well-formed
 // block with a terminator. If you need to use this for a
 // non-terminated block, just make the function able to do that too.
-LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name,
+LlvmIfData EmitIfThenElse(llvm::Value* condition, absl::string_view name,
                           llvm::IRBuilder<>* b, bool emit_else = true);
 
 // Emits a compare operation between "lhs" and "rhs" with the given predicate,
@@ -285,8 +287,7 @@ Status DumpIRToDirectory(const string& directory_name,
 llvm::Function* CreateFunction(llvm::FunctionType* function_type,
                                llvm::GlobalValue::LinkageTypes linkage,
                                bool enable_fast_math, bool optimize_for_size,
-                               tensorflow::StringPiece name,
-                               llvm::Module* module);
+                               absl::string_view name, llvm::Module* module);
 
 // Extracts the xla_backend_extra_options from `config` and passes those that
 // don't start with xla_ to LLVM.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 36f5fa1952..cf7445804c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -86,7 +86,7 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
 }
 
 std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
-    tensorflow::StringPiece loop_name, llvm::Type* index_type) {
+    absl::string_view loop_name, llvm::Type* index_type) {
   CHECK_NE(index_type, nullptr);
   if (ShapeUtil::IsScalar(shape_)) {
     // No loop needed, so set exit_bb_ to nullptr.
@@ -122,7 +122,7 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
   return {array_index};
 }
 
-Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name,
+Status LoopEmitter::EmitLoop(absl::string_view loop_name,
                              llvm::Type* index_type) {
   if (index_type == nullptr) {
     index_type = b_->getInt64Ty();
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index c4f5c82086..57d9d8bbc6 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -69,10 +69,10 @@ class LoopEmitter {
   }
 
   virtual std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock(
-      tensorflow::StringPiece loop_name, llvm::Type* index_type);
+      absl::string_view loop_name, llvm::Type* index_type);
 
   // Emits a complete loop nest for every element in the given shape.
-  Status EmitLoop(tensorflow::StringPiece loop_name = "",
+  Status EmitLoop(absl::string_view loop_name = "",
                   llvm::Type* index_type = nullptr);
 
  protected:
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index e546f5cc4a..00dd3f1638 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
@@ -29,8 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -42,7 +42,7 @@ namespace {
 void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index,
                      const IrArray::Index& compare_keys_index,
                      const IrArray& keys_array,
-                     const tensorflow::gtl::optional<IrArray>& values_array,
+                     const absl::optional<IrArray>& values_array,
                      llvm::IRBuilder<>* b) {
   // if (is_smaller_index &&
   //     compare_keys[dimension_to_sort] < dimension_to_sort_bound)
@@ -87,8 +87,8 @@ void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index,
 }  // namespace
 
 Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const tensorflow::gtl::optional<IrArray>& values_array,
-                       tensorflow::StringPiece name, llvm::Value* xor_mask,
+                       const absl::optional<IrArray>& values_array,
+                       absl::string_view name, llvm::Value* xor_mask,
                        llvm::IRBuilder<>* b,
                        const gpu::LaunchDimensions* launch_dimensions) {
   const Shape& keys_shape = keys_array.GetShape();
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
index 8458744c6b..527ed10374 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
 
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -31,8 +31,8 @@ namespace llvm_ir {
 // implements the inner loop of BitonicSort. If 'launch_dimensions' is nullptr,
 // the inner compare loop will not be parallelized.
 Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const tensorflow::gtl::optional<IrArray>& values_array,
-                       tensorflow::StringPiece name, llvm::Value* xor_mask,
+                       const absl::optional<IrArray>& values_array,
+                       absl::string_view name, llvm::Value* xor_mask,
                        llvm::IRBuilder<>* b,
                        const gpu::LaunchDimensions* launch_dimensions);
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 5e02096ee5..ea59adadea 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -19,10 +19,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/executable.h"
@@ -37,7 +38,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -73,7 +73,7 @@ namespace {
 // If the parameter number is invalid for this computation, nullopt is
 // returned. When the return value has_value(), nullptr will never be
 // the held value.
-tensorflow::gtl::optional<const OpMetadata*> ParameterMetadata(
+absl::optional<const OpMetadata*> ParameterMetadata(
     const XlaComputation& computation, int parameter_number) {
   for (const HloComputationProto& comp : computation.proto().computations()) {
     if (comp.id() == computation.proto().entry_computation_id()) {
@@ -81,14 +81,14 @@ tensorflow::gtl::optional<const OpMetadata*> ParameterMetadata(
         if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) &&
             instr.parameter_number() == parameter_number) {
           if (!instr.has_metadata()) {
-            return tensorflow::gtl::nullopt;
+            return absl::nullopt;
           }
           return &instr.metadata();
         }
       }
     }
   }
-  return tensorflow::gtl::nullopt;
+  return absl::nullopt;
 }
 
 ExecutionOptions CreateExecutionOptions(
@@ -158,7 +158,7 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     TF_RETURN_IF_ERROR(
         ShapeUtil::ValidateShapeWithOptionalLayout(argument_shape));
     if (!ShapeUtil::Compatible(argument_shape, program_shape.parameters(i))) {
-      tensorflow::gtl::optional<const OpMetadata*> metadata =
+      absl::optional<const OpMetadata*> metadata =
           ParameterMetadata(computation, /*parameter_number=*/i);
       auto metadata_string = [&metadata]() -> string {
         if (!metadata.has_value()) {
diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc
index c742d35a7b..e1f56727bd 100644
--- a/tensorflow/compiler/xla/service/logical_buffer.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer.cc
@@ -15,11 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 
@@ -34,11 +34,10 @@ LogicalBuffer::~LogicalBuffer() {}
 string LogicalBuffer::ToString() const {
   string color_string;
   if (has_color()) {
-    color_string = tensorflow::strings::StrCat(" @", color().value());
+    color_string = absl::StrCat(" @", color().value());
   }
-  return tensorflow::strings::StrCat(instruction_->name(), "[",
-                                     tensorflow::str_util::Join(index_, ","),
-                                     "](#", id(), color_string, ")");
+  return absl::StrCat(instruction_->name(), "[", absl::StrJoin(index_, ","),
+                      "](#", id(), color_string, ")");
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
index d631fb5ee4..eaa09591b7 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
@@ -89,7 +90,7 @@ void LogicalBufferAnalysis::NewLogicalBuffer(HloInstruction* instruction,
                                              const ShapeIndex& index) {
   CHECK_EQ(logical_buffers_.size(), next_buffer_id_);
   logical_buffers_.emplace_back(
-      MakeUnique<LogicalBuffer>(instruction, index, next_buffer_id_));
+      absl::make_unique<LogicalBuffer>(instruction, index, next_buffer_id_));
   output_buffers_[std::make_pair(instruction, index)] =
       logical_buffers_.back().get();
 
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h
index 0019cd7254..4c8cb7d379 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.h
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.h
@@ -19,10 +19,10 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace xla {
 
@@ -48,9 +48,7 @@ class MultiOutputFusion : public HloPassInterface {
  public:
   MultiOutputFusion(int64 fuel) : fuel_(fuel) {}
 
-  tensorflow::StringPiece name() const override {
-    return "multi_output_fusion";
-  }
+  absl::string_view name() const override { return "multi_output_fusion"; }
 
   // Run multi-output fusion on the given module. Returns whether the module
   // was changed.
@@ -104,17 +102,17 @@ class MultiOutputFusion : public HloPassInterface {
   // InstructionFusion instead.
   virtual bool DoProducerConsumerMultiOutputFusion();
 
- private:
-  // Update the internal data structures after instr1 and instr2 are fused into
-  // one fusion instruction.
-  void Update(HloInstruction* instr1, HloInstruction* instr2);
-
   // Optimization fuel is a compiler debugging technique that makes an
   // optimization pass stop what it is doing after having made N changes to the
   // program, where N is the fuel. By varying N, this can be used to find the
   // first single change that makes a test fail.
   int64 fuel_;
 
+ private:
+  // Update the internal data structures after instr1 and instr2 are fused into
+  // one fusion instruction.
+  void Update(HloInstruction* instr1, HloInstruction* instr2);
+
   // Computation for the pass.
   HloComputation* computation_;
 
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index f6e7578a89..70cd0a339a 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -15,8 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -52,7 +53,7 @@ NameUniquer::NameUniquer(const string& separator) {
   return result;
 }
 
-string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
+string NameUniquer::GetUniqueName(absl::string_view prefix) {
   string root = GetSanitizedName(prefix.empty() ? "name" : std::string(prefix));
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
@@ -63,20 +64,22 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
   if (separator_index != string::npos && (separator_index > 0) &&
       (separator_index < root.size() - 1)) {
     string after_suffix = root.substr(separator_index + 1);
-    if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
+    if (absl::SimpleAtoi(after_suffix, &numeric_suffix)) {
       has_numeric_suffix = true;
       // Remove numeric suffix from root.
       root = root.substr(0, separator_index);
+    } else {
+      // absl::SimpleAtoi may modify numeric_suffix even if it returns false.
+      numeric_suffix = 0;
     }
   }
 
   SequentialIdGenerator& id_generator = generated_names_[root];
   numeric_suffix = id_generator.RegisterId(numeric_suffix);
   if (numeric_suffix == 0) {
-    return has_numeric_suffix ? tensorflow::strings::StrCat(root, separator_, 0)
-                              : root;
+    return has_numeric_suffix ? absl::StrCat(root, separator_, 0) : root;
   }
-  tensorflow::strings::StrAppend(&root, separator_, numeric_suffix);
+  absl::StrAppend(&root, separator_, numeric_suffix);
   return root;
 }
 
diff --git a/tensorflow/compiler/xla/service/name_uniquer.h b/tensorflow/compiler/xla/service/name_uniquer.h
index 4423d61069..6dd89c240f 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.h
+++ b/tensorflow/compiler/xla/service/name_uniquer.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/macros.h"
@@ -38,7 +38,7 @@ class NameUniquer {
 
   // Get a sanitized unique name in a string, with an optional prefix for
   // convenience.
-  string GetUniqueName(tensorflow::StringPiece prefix = "");
+  string GetUniqueName(absl::string_view prefix = "");
 
   // Sanitizes and returns the name. Unallowed characters will be replaced with
   // '_'. The result will match the regexp "[a-zA-Z_][a-zA-Z0-9_.-]*".
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index ac6ea4c72f..ccc06ce613 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -16,11 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace xla {
 
@@ -622,7 +622,7 @@ template <typename Previous>
 class HloInstructionPatternNameImpl {
  public:
   explicit HloInstructionPatternNameImpl(const Previous& previous,
-                                         tensorflow::StringPiece name)
+                                         absl::string_view name)
       : previous_(previous), name_(name) {}
 
   bool Match(const ::xla::HloInstruction* inst) const {
@@ -631,7 +631,7 @@ class HloInstructionPatternNameImpl {
 
  private:
   Previous previous_;
-  tensorflow::StringPiece name_;
+  absl::string_view name_;
 };
 
 // An HloInstructionPattern implementation that matches only if the instruction
@@ -784,7 +784,7 @@ class HloInstructionPattern {
 
   // Modifies the pattern to match only if the instruction has the given name.
   HloInstructionPattern<HloInstructionType, HloInstructionPatternNameImpl<Impl>>
-  WithName(tensorflow::StringPiece name) const {
+  WithName(absl::string_view name) const {
     return HloInstructionPattern<HloInstructionType,
                                  HloInstructionPatternNameImpl<Impl>>(
         HloInstructionPatternNameImpl<Impl>(impl_, name), matched_inst_);
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index 39fe3c7835..150af0cd93 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -19,20 +19,19 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
 
-using tensorflow::str_util::Lowercase;
-
 // Minimum supported CUDA compute capability is 3.5.
 constexpr int kMinCudaComputeCapabilityMajor = 3;
 constexpr int kMinCudaComputeCapabilityMinor = 5;
@@ -43,7 +42,7 @@ constexpr char kInterpreter[] = "interpreter";
 namespace {
 
 string CanonicalPlatformName(const string& name) {
-  string platform_str = Lowercase(name);
+  string platform_str = absl::AsciiStrToLower(name);
   // "cpu" and "host" mean the same thing.
   if (platform_str == "cpu") {
     platform_str = "host";
@@ -94,7 +93,7 @@ PlatformUtil::GetSupportedPlatforms() {
   }
 
   // Multiple platforms present and we can't pick a reasonable default.
-  string platforms_string = tensorflow::str_util::Join(
+  string platforms_string = absl::StrJoin(
       platforms, ", ",
       [](string* out, const se::Platform* p) { out->append(p->Name()); });
   return InvalidArgument(
@@ -110,15 +109,15 @@ PlatformUtil::GetSupportedPlatforms() {
     return platforms[0];
   } else if (platforms.size() == 2) {
     for (int i = 0; i < 2; i++) {
-      if (Lowercase(platforms[i]->Name()) == kInterpreter &&
-          Lowercase(platforms[1 - i]->Name()) != kInterpreter) {
+      if (absl::AsciiStrToLower(platforms[i]->Name()) == kInterpreter &&
+          absl::AsciiStrToLower(platforms[1 - i]->Name()) != kInterpreter) {
         return platforms[1 - i];
       }
     }
   }
 
   // Multiple platforms present and we can't pick a reasonable default.
-  string platforms_string = tensorflow::str_util::Join(
+  string platforms_string = absl::StrJoin(
       platforms, ", ",
       [](string* out, const se::Platform* p) { out->append(p->Name()); });
   return InvalidArgument(
@@ -132,7 +131,7 @@ PlatformUtil::GetSupportedPlatforms() {
   string platform_str = CanonicalPlatformName(platform_name);
   TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms());
   for (se::Platform* platform : platforms) {
-    if (Lowercase(platform->Name()) == platform_str) {
+    if (absl::AsciiStrToLower(platform->Name()) == platform_str) {
       return platform;
     }
   }
@@ -146,7 +145,7 @@ PlatformUtil::GetSupportedPlatforms() {
   TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms());
   std::vector<se::Platform*> matched;
   for (se::Platform* platform : platforms) {
-    if (Lowercase(platform->Name()) != platform_name) {
+    if (absl::AsciiStrToLower(platform->Name()) != platform_name) {
       matched.push_back(platform);
     }
   }
@@ -157,7 +156,7 @@ PlatformUtil::GetSupportedPlatforms() {
   if (matched.size() == 1) {
     return matched[0];
   }
-  string matched_string = tensorflow::str_util::Join(
+  string matched_string = absl::StrJoin(
       matched, ", ",
       [](string* out, const se::Platform* p) { out->append(p->Name()); });
   return InvalidArgument(
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
index afde3cf95c..256b231e3a 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
@@ -59,7 +59,7 @@ class ReducePrecisionInsertion : public HloPassInterface {
 
   ~ReducePrecisionInsertion() override{};
 
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "reduce-precision-insertion";
   }
 
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index ca86c5d13e..4df746fca9 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -38,6 +38,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 
 #include <algorithm>
+
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -374,7 +376,7 @@ StatusOr<bool> TryReshapeMoveOnCandidates(
 
     removed = false;
     for (auto operand : nontrivial_operands) {
-      if (c_any_of(operand->users(), [&](HloInstruction* user) {
+      if (absl::c_any_of(operand->users(), [&](HloInstruction* user) {
             return !reshape_candidates->count(user);
           })) {
         for (auto* user : operand->users()) {
diff --git a/tensorflow/compiler/xla/service/reshape_mover.h b/tensorflow/compiler/xla/service/reshape_mover.h
index 1f59e3b314..1e86a0823a 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.h
+++ b/tensorflow/compiler/xla/service/reshape_mover.h
@@ -26,7 +26,7 @@ namespace xla {
 // them inputward also.
 class ReshapeMover : public HloPassInterface {
  public:
-  tensorflow::StringPiece name() const override { return "reshape-mover"; }
+  absl::string_view name() const override { return "reshape-mover"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 };
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index ccb9fb3e3a..a395dd5333 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
@@ -28,13 +28,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-
-namespace op = xla::testing::opcode_matchers;
 
 namespace xla {
 namespace {
-using ReshapeMoverTest = HloVerifiedTestBase;
+
+namespace op = xla::testing::opcode_matchers;
+
+class ReshapeMoverTest : public HloVerifiedTestBase {
+ public:
+  ReshapeMoverTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+};
 
 TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) {
   HloComputation::Builder builder(TestName());
diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc
new file mode 100644
index 0000000000..338f0c09e9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/scatter_expander.cc
@@ -0,0 +1,351 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/scatter_expander.h"
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+using tensorflow::gtl::ArraySlice;
+
+// Transposes the given scatter_indices such that the index_vector_dim becomes
+// the most-minor dimension.
+static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
+    HloInstruction* scatter_indices, int64 index_vector_dim) {
+  const Shape& scatter_indices_shape = scatter_indices->shape();
+
+  if (scatter_indices_shape.dimensions_size() == index_vector_dim) {
+    return scatter_indices;
+  }
+
+  if (index_vector_dim == (scatter_indices_shape.dimensions_size() - 1)) {
+    return scatter_indices;
+  }
+
+  std::vector<int64> permutation;
+  permutation.reserve(scatter_indices_shape.dimensions_size());
+  for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) {
+    if (i != index_vector_dim) {
+      permutation.push_back(i);
+    }
+  }
+  permutation.push_back(index_vector_dim);
+  return MakeTransposeHlo(scatter_indices, permutation);
+}
+
+// Canonicalizes the scatter_indices tensor in order to keep them uniform while
+// performing the scatter operation.
+static StatusOr<HloInstruction*> CanonicalizeScatterIndices(
+    HloInstruction* scatter_indices, int64 index_vector_dim) {
+  // Transpose the non-index-vector dimensions to the front.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * transposed_scatter_indices,
+      TransposeIndexVectorDimToLast(scatter_indices, index_vector_dim));
+  bool indices_are_scalar =
+      index_vector_dim == scatter_indices->shape().dimensions_size();
+
+  // The number of dimensions in scatter_indices that are index dimensions.
+  const int64 index_dims_in_scatter_indices = indices_are_scalar ? 0 : 1;
+
+  // If there is only one index (i.e. scatter_indices has rank 1 and this
+  // scatter is really just a dynamic update slice) add a leading degenerate
+  // dimension for uniformity.  Otherwise create a "collapsed" leading dimension
+  // that subsumes all of the non-index-vector dimensions.
+  const Shape& shape = transposed_scatter_indices->shape();
+  if (shape.dimensions_size() == index_dims_in_scatter_indices) {
+    return PrependDegenerateDims(transposed_scatter_indices, 1);
+  } else {
+    // Collapse all but the dimensions (0 or 1) in scatter_indices containing
+    // the index vectors.
+    return CollapseFirstNDims(
+        transposed_scatter_indices,
+        shape.dimensions_size() - index_dims_in_scatter_indices);
+  }
+}
+
+// Permutes the `updates` tensor such that all the scatter dims appear in the
+// major dimensions and all the window dimensions appear in the minor
+// dimensions.
+static StatusOr<HloInstruction*> PermuteScatterAndWindowDims(
+    HloInstruction* updates, ArraySlice<int64> update_window_dims) {
+  std::vector<int64> permutation;
+  const int64 updates_rank = ShapeUtil::Rank(updates->shape());
+  permutation.reserve(updates_rank);
+
+  for (int64 i = 0; i < updates_rank; ++i) {
+    bool is_scatter_dim = !absl::c_binary_search(update_window_dims, i);
+    if (is_scatter_dim) {
+      permutation.push_back(i);
+    }
+  }
+  for (auto window_dim : update_window_dims) {
+    permutation.push_back(window_dim);
+  }
+
+  return MakeTransposeHlo(updates, permutation);
+}
+
+// Expands or contracts the scatter indices in the updates tensor.
+static StatusOr<HloInstruction*> AdjustScatterDims(
+    const Shape& scatter_indices_shape, HloInstruction* updates,
+    int64 index_vector_dim) {
+  int64 num_scatter_dims = scatter_indices_shape.dimensions_size();
+  if (index_vector_dim < scatter_indices_shape.dimensions_size()) {
+    --num_scatter_dims;
+  }
+  if (num_scatter_dims == 0) {
+    // If there are no scatter dims, this must be a dynamic-update-slice kind of
+    // scatter. In this case, we prepend a degenerate dimension to work
+    // uniformly in the while loop.
+    return PrependDegenerateDims(updates, 1);
+  }
+  return CollapseFirstNDims(updates, num_scatter_dims);
+}
+
+// Expands an index vector from the scatter_indices tensor into a vector that
+// can be used to dynamic-update-slice to perform the scatter update.
+static StatusOr<HloInstruction*> ExpandIndexVectorIntoOperandSpace(
+    HloInstruction* index_vector, const ScatterDimensionNumbers& dim_numbers,
+    int64 operand_rank) {
+  HloComputation* computation = index_vector->parent();
+  const Shape& index_shape = index_vector->shape();
+  HloInstruction* zero =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateFromDimensions(index_shape.element_type(), {1})));
+
+  // We extract out individual components from the smaller index and concatenate
+  // them (interspersing zeros as needed) into the larger index.
+  std::vector<HloInstruction*> expanded_index_components;
+
+  for (int i = 0; i < operand_rank; i++) {
+    int64 index_vector_dim_index =
+        FindIndex(dim_numbers.scatter_dims_to_operand_dims(), i);
+    if (index_vector_dim_index !=
+        dim_numbers.scatter_dims_to_operand_dims_size()) {
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * component_to_concat,
+          MakeSliceHlo(index_vector, /*start_indices=*/{index_vector_dim_index},
+                       /*limit_indices=*/{index_vector_dim_index + 1},
+                       /*strides=*/{1}));
+      expanded_index_components.push_back(component_to_concat);
+    } else {
+      expanded_index_components.push_back(zero);
+    }
+  }
+
+  return MakeConcatHlo(expanded_index_components, /*dimension=*/0);
+}
+
+// Body of the while loop that performs the scatter operation using other HLOs.
+static StatusOr<std::vector<HloInstruction*>> ScatterLoopBody(
+    HloInstruction* scatter, HloInstruction* induction_var,
+    const std::vector<HloInstruction*>& loop_state) {
+  const ScatterDimensionNumbers& dim_numbers =
+      scatter->scatter_dimension_numbers();
+  CHECK_EQ(loop_state.size(), 3);
+  HloInstruction* operand = loop_state[0];
+  HloInstruction* scatter_indices = loop_state[1];
+  HloInstruction* updates = loop_state[2];
+
+  bool has_scalar_indices = scatter_indices->shape().dimensions_size() == 1;
+  CHECK_EQ(has_scalar_indices,
+           dim_numbers.index_vector_dim() ==
+               scatter->operand(1)->shape().dimensions_size());
+
+  // Build a vector form of the induction variable of the while loop.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * induction_var_as_vector,
+      MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
+                       /*result_shape_bounds=*/{1}));
+
+  // Pick the index to scatter from scatter_indices based on the induction_var
+  // and transform that to an index into the `operand` space.
+  HloInstruction* index_vector;
+  if (has_scalar_indices) {
+    TF_ASSIGN_OR_RETURN(
+        index_vector,
+        MakeDynamicSliceHlo(scatter_indices, induction_var_as_vector, {1}));
+  } else {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_into_scatter_indices,
+        PadVectorWithZeros(induction_var_as_vector,
+                           /*zeros_to_prepend=*/0, /*zeros_to_append=*/1));
+    int index_vector_size = scatter_indices->shape().dimensions(1);
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * index_vector_2d,
+        MakeDynamicSliceHlo(scatter_indices, index_into_scatter_indices,
+                            {1, index_vector_size}));
+    TF_ASSIGN_OR_RETURN(index_vector,
+                        ElideDegenerateDims(index_vector_2d, {0}));
+  }
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * scatter_slice_start,
+      ExpandIndexVectorIntoOperandSpace(index_vector, dim_numbers,
+                                        operand->shape().dimensions_size()));
+
+  // Extract the slice to be used to update from `updates` tensor for the
+  // induction_var corresponding to this iteration of the while loop.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * index_into_updates,
+      PadVectorWithZeros(
+          induction_var_as_vector, /*zeros_to_prepend=*/0,
+          /*zeros_to_append=*/updates->shape().dimensions_size() - 1));
+  std::vector<int64> update_slice_bounds(updates->shape().dimensions().begin(),
+                                         updates->shape().dimensions().end());
+  update_slice_bounds[0] = 1;
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * update_slice,
+      MakeDynamicSliceHlo(updates, index_into_updates, update_slice_bounds));
+  TF_ASSIGN_OR_RETURN(HloInstruction * update_slice_for_scatter,
+                      ElideDegenerateDims(update_slice, {0}));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * update_slice_with_dims_inserted,
+      InsertDegenerateDims(update_slice_for_scatter,
+                           AsInt64Slice(dim_numbers.inserted_window_dims())));
+
+  // Extact the slice to update from `operand` tensor.
+  const Shape& update_slice_shape = update_slice_with_dims_inserted->shape();
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * operand_slice_to_update,
+      MakeDynamicSliceHlo(operand, scatter_slice_start,
+                          AsInt64Slice(update_slice_shape.dimensions())));
+
+  // Compute the new value for the slice to be updated in `operand` tensor by
+  // combining the existing value and the update value using the update
+  // computation.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * updated_operand_slice,
+      MakeMapHlo({operand_slice_to_update, update_slice_with_dims_inserted},
+                 scatter->to_apply()));
+
+  // Write the updated value of the slice into `operand` tensor.
+  TF_ASSIGN_OR_RETURN(HloInstruction * updated_operand,
+                      MakeDynamicUpdateSliceHlo(operand, updated_operand_slice,
+                                                scatter_slice_start));
+
+  return StatusOr<std::vector<HloInstruction*>>{
+      {updated_operand, scatter_indices, updates}};
+}
+
+// High Level Algorithm.
+//
+// 1. Canonicalize the scatter_indices tensor such that it has rank 2, where
+//    each row is an index into the operand.
+// 2. Canonicalize the updates tensor such that is has rank `num_window_dims+1`
+//    and the scatter dim is the most-major dimension.
+// 3. Iterate over the set of indices in the canonicalized scatter_indices
+//    tensor using a while loop, updating the operand for each such index. Each
+//    iteration of this while loop performs the following:
+//      a. Pick the index from scatter_indices for this iteration.
+//      b. Transfrom this index into an index into the operand space.
+//      c. Extract the slice to be used to update from the updates tensor.
+//      d. Extract the slice to update from the operand tensor.
+//      e. Compute the new value for the slice to update by combining the slices
+//         from c. and d. using the update_computation of scatter.
+//      f. Write the updated value of the slice into the operand tensor.
+
+StatusOr<HloInstruction*> ScatterExpander::ExpandScatter(
+    HloInstruction* scatter) {
+  HloInstruction* operand = scatter->mutable_operand(0);
+  HloInstruction* scatter_indices = scatter->mutable_operand(1);
+  HloInstruction* updates = scatter->mutable_operand(2);
+  const ScatterDimensionNumbers& dim_numbers =
+      scatter->scatter_dimension_numbers();
+
+  // If the updates tensor is empty, there is no need to update the operand. We
+  // can return the operand as is.
+  if (ShapeUtil::IsZeroElementArray(updates->shape())) {
+    return operand;
+  }
+
+  // Compute the trip count for the while loop to be used for scatter. This
+  // should be the number of indices we should scatter into the operand.
+  const Shape& scatter_indices_shape = scatter_indices->shape();
+  int64 scatter_loop_trip_count = 1;
+  for (int64 i = 0, e = scatter_indices_shape.dimensions_size(); i < e; i++) {
+    if (i != dim_numbers.index_vector_dim()) {
+      scatter_loop_trip_count *= scatter_indices_shape.dimensions(i);
+    }
+  }
+  if (!IsInt32(scatter_loop_trip_count)) {
+    return Unimplemented(
+        "Scatter operations with more than 2147483647 scatter indices are not "
+        "supported. This error occurred for %s.",
+        scatter->ToString().c_str());
+  }
+
+  // Canonicalize the scatter_indices, after which the size of its most-major
+  // dimension must be same as the while loop trip count.
+  TF_ASSIGN_OR_RETURN(HloInstruction * canonical_scatter_indices,
+                      CanonicalizeScatterIndices(
+                          scatter_indices, dim_numbers.index_vector_dim()));
+  CHECK_EQ(scatter_loop_trip_count,
+           canonical_scatter_indices->shape().dimensions(0));
+
+  // Canonicalize the updates, after which the size of its most-major dimension
+  // must be same as the while loop trip count.
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * canonical_updates,
+      PermuteScatterAndWindowDims(
+          updates, AsInt64Slice(dim_numbers.update_window_dims())));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * adjusted_canonical_updates,
+      AdjustScatterDims(scatter_indices->shape(), canonical_updates,
+                        dim_numbers.index_vector_dim()));
+  CHECK_EQ(scatter_loop_trip_count,
+           adjusted_canonical_updates->shape().dimensions(0));
+
+  // The while loop that implements the scatter operation.
+  StatusOr<std::vector<HloInstruction*>> scatter_loop_result_status =
+      WhileUtil::MakeCountedLoop(
+          scatter->parent(), scatter_loop_trip_count,
+          {operand, canonical_scatter_indices, adjusted_canonical_updates},
+          [&](HloInstruction* induction_var,
+              const std::vector<HloInstruction*>& loop_state) {
+            return ScatterLoopBody(scatter, induction_var, loop_state);
+          });
+  TF_ASSIGN_OR_RETURN(std::vector<HloInstruction*> scatter_loop_result,
+                      scatter_loop_result_status);
+  return scatter_loop_result.front();
+}
+
+StatusOr<bool> ScatterExpander::Run(HloModule* module) {
+  std::vector<HloInstruction*> scatter_instrs;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    for (HloInstruction* instr : computation->instructions()) {
+      if (instr->opcode() == HloOpcode::kScatter) {
+        scatter_instrs.push_back(instr);
+      }
+    }
+  }
+
+  for (auto instr : scatter_instrs) {
+    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandScatter(instr));
+    TF_RETURN_IF_ERROR(
+        instr->parent()->ReplaceInstruction(instr, expanded_root));
+  }
+
+  return !scatter_instrs.empty();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/service/scatter_expander.h
index bfcdfc62f9..14f062c89c 100644
--- a/tensorflow/compiler/xla/ptr_util.h
+++ b/tensorflow/compiler/xla/service/scatter_expander.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,23 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_
-#define TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_
 
-// As this was moved to tensorflow/core/util, provide indirections here to
-// maintain current functionality of the library.
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
-#include <stddef.h>
+namespace xla {
 
-#include <memory>
-#include <type_traits>
-#include <utility>
+class ScatterExpander : public HloPassInterface {
+ public:
+  absl::string_view name() const override { return "scatter_expander"; }
+  StatusOr<bool> Run(HloModule* module) override;
 
-#include "tensorflow/core/util/ptr_util.h"
+ private:
+  StatusOr<HloInstruction*> ExpandScatter(HloInstruction* scatter);
+};
 
-namespace xla {
-using tensorflow::MakeUnique;
-using tensorflow::WrapUnique;
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SCATTER_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 1dbf540d13..d39a5191b8 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -20,10 +20,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -46,7 +47,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -55,8 +55,8 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/ptr_util.h"
 
+using absl::StrCat;
 using ::tensorflow::strings::Printf;
-using ::tensorflow::strings::StrCat;
 
 namespace xla {
 
@@ -245,7 +245,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
     const ExecutionOptions* execution_options) {
-  auto config = MakeUnique<HloModuleConfig>(program_shape);
+  auto config = absl::make_unique<HloModuleConfig>(program_shape);
   ComputationLayout* computation_layout =
       config->mutable_entry_computation_layout();
   if (program_shape.parameters_size() != argument_shapes.size()) {
@@ -326,7 +326,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     if (directory_path.empty() && execution_directory_path.empty()) {
       continue;
     }
-    auto hlo_snapshot = MakeUnique<HloSnapshot>();
+    auto hlo_snapshot = absl::make_unique<HloSnapshot>();
     *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i];
     if (!directory_path.empty()) {
       string filename =
@@ -409,7 +409,8 @@ Service::ExecuteParallelAndRegisterResult(
       streams.push_back(std::move(stream));
 
       if (replica == 0 && profile != nullptr) {
-        timers.push_back(MakeUnique<se::Timer>(streams.back()->parent()));
+        timers.push_back(
+            absl::make_unique<se::Timer>(streams.back()->parent()));
         streams.back()
             ->InitTimer(timers.back().get())
             .ThenStartTimer(timers.back().get());
@@ -800,7 +801,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
       module_proto.name().c_str());
 
   // Dump computation proto state if flag is set.
-  auto hlo_snapshot = MakeUnique<HloSnapshot>();
+  auto hlo_snapshot = absl::make_unique<HloSnapshot>();
   const string& directory_path =
       module_config->debug_options().xla_dump_computations_to();
   const string& execution_directory_path =
@@ -954,7 +955,7 @@ namespace {
 // shape and DeviceMemoryBase values of the clone are identical to the original.
 std::unique_ptr<ShapedBuffer> CloneShapedBufferOnDevice(
     const ShapedBuffer& shaped_buffer, int device_ordinal) {
-  auto clone = MakeUnique<ShapedBuffer>(
+  auto clone = absl::make_unique<ShapedBuffer>(
       shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(),
       shaped_buffer.platform(), device_ordinal);
   clone->buffers() = shaped_buffer.buffers();
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index a4ea2b28f4..6a22f8bef4 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -21,6 +21,10 @@ limitations under the License.
 #include <set>
 #include <string>
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -28,28 +32,24 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/math/math_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
-using tensorflow::str_util::Join;
-using tensorflow::strings::Printf;
-
 namespace xla {
-
 namespace {
 
+using absl::StrJoin;
+using tensorflow::strings::Printf;
+
 // Returns true if no element is present in slice more than once.
 bool AllUnique(tensorflow::gtl::ArraySlice<int64> slice) {
   return std::set<int64>(slice.begin(), slice.end()).size() == slice.size();
 }
 
-Status ExpectArray(const Shape& shape, tensorflow::StringPiece op_type) {
+Status ExpectArray(const Shape& shape, absl::string_view op_type) {
   if (!ShapeUtil::IsArray(shape)) {
     return InvalidArgument("Expected array argument for %s, but got %s.",
                            std::string(op_type).c_str(),
@@ -233,10 +233,12 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   switch (opcode) {
     case HloOpcode::kFloor:
     case HloOpcode::kCeil:
+    case HloOpcode::kRoundNearestAfz:
       if (!ShapeUtil::ElementIsFloating(shape)) {
         return InvalidArgument(
-            "Expected element type in shape to be floating for floor/ceil "
-            "operation; got %s.",
+            "Expected element type in shape to be floating for %s operation; "
+            "got %s.",
+            HloOpcodeString(opcode).c_str(),
             PrimitiveType_Name(shape.element_type()).c_str());
       }
       return shape;
@@ -250,8 +252,9 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
       if (!ShapeUtil::ElementIsFloating(shape) &&
           !ShapeUtil::ElementIsComplex(shape)) {
         return InvalidArgument(
-            "Expected element type in shape to be floating or complex for "
-            "sin/cos/exp/log/tanh operation; got %s.",
+            "Expected element type in shape to be floating or complex for %s "
+            "operation; got %s.",
+            HloOpcodeString(opcode).c_str(),
             PrimitiveType_Name(shape.element_type()).c_str());
       }
       return shape;
@@ -264,19 +267,51 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
       } else {
         return InvalidArgument(
             "Expected element type in shape to be floating or complex for "
-            "real/imag operation; got %s.",
+            "%s operation; got %s.",
+            HloOpcodeString(opcode).c_str(),
             PrimitiveType_Name(shape.element_type()).c_str());
       }
     case HloOpcode::kAbs:
       if (ShapeUtil::ElementIsComplex(shape)) {
         return ShapeUtil::ChangeElementType(
             shape, primitive_util::ComplexComponentType(shape.element_type()));
+      } else if (ShapeUtil::ElementIsSigned(shape)) {
+        return shape;
+      } else {
+        return InvalidArgument(
+            "Expected element type in shape to be floating or complex for "
+            "%s operation; got %s.",
+            HloOpcodeString(opcode).c_str(),
+            PrimitiveType_Name(shape.element_type()).c_str());
       }
-      return shape;
     case HloOpcode::kClz:
+      if (!ShapeUtil::ElementIsIntegral(shape)) {
+        return InvalidArgument(
+            "Expected an integral element type in argument to Clz "
+            "operation; got %s.",
+            PrimitiveType_Name(shape.element_type()).c_str());
+      }
+      return shape;
     case HloOpcode::kNegate:
-    case HloOpcode::kRoundNearestAfz:
+      if (!ShapeUtil::ElementIsIntegral(shape) &&
+          !ShapeUtil::ElementIsFloating(shape) &&
+          !ShapeUtil::ElementIsComplex(shape)) {
+        return InvalidArgument(
+            "Expected element type in shape to be integral, floating or "
+            "complex for %s operation; got %s.",
+            HloOpcodeString(opcode).c_str(),
+            PrimitiveType_Name(shape.element_type()).c_str());
+      }
+      return shape;
     case HloOpcode::kSign:
+      if (!ShapeUtil::ElementIsSigned(shape) &&
+          !ShapeUtil::ElementIsComplex(shape)) {
+        return InvalidArgument(
+            "Expected element type in shape to be signed or complex for "
+            "%s operation; got %s.",
+            HloOpcodeString(opcode).c_str(),
+            PrimitiveType_Name(shape.element_type()).c_str());
+      }
       return shape;
 
     case HloOpcode::kNot:
@@ -878,16 +913,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
       HloOpcodeString(opcode).c_str(), ShapeUtil::HumanString(lhs).c_str(),
       ShapeUtil::HumanString(rhs).c_str(),
-      Join(broadcast_dimensions, ", ").c_str());
+      StrJoin(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
 
-  TF_RETURN_IF_ERROR(
-      ExpectArray(lhs, tensorflow::strings::StrCat("lhs of binary operation ",
-                                                   HloOpcodeString(opcode))));
-  TF_RETURN_IF_ERROR(
-      ExpectArray(rhs, tensorflow::strings::StrCat("rhs of binary operation ",
-                                                   HloOpcodeString(opcode))));
+  TF_RETURN_IF_ERROR(ExpectArray(
+      lhs, absl::StrCat("lhs of binary operation ", HloOpcodeString(opcode))));
+  TF_RETURN_IF_ERROR(ExpectArray(
+      rhs, absl::StrCat("rhs of binary operation ", HloOpcodeString(opcode))));
   switch (opcode) {
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
@@ -1058,7 +1091,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
         "%s.",
-        Join(pieces, ", ").c_str());
+        StrJoin(pieces, ", ").c_str());
   }
 
   // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@@ -1075,7 +1108,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     if (dimensions[i] != i) {
       return InvalidArgument(
           "Map requires monotonically increasing dimension numbers; got: %s.",
-          Join(dimensions, ", ").c_str());
+          StrJoin(dimensions, ", ").c_str());
     }
   }
 
@@ -1530,7 +1563,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferConvolveShape(
     const Shape& lhs, const Shape& rhs, const Window& window,
-    const ConvolutionDimensionNumbers& dnums) {
+    const ConvolutionDimensionNumbers& dnums, int64 feature_group_count) {
   TF_RETURN_IF_ERROR(ExpectArray(lhs, "lhs of convolution"));
   TF_RETURN_IF_ERROR(ExpectArray(rhs, "rhs of convolution"));
 
@@ -1640,12 +1673,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   const int64 kernel_output_features =
       rhs.dimensions(dnums.kernel_output_feature_dimension());
 
-  if (input_features != kernel_input_features) {
+  if (input_features != kernel_input_features * feature_group_count) {
     return InvalidArgument(
         "Expected LHS feature dimension (value %lld) to match RHS "
-        "input feature dimension (value %lld); got <conv>(%s, %s)\n"
+        "input feature dimension * feature_group_count (value %lld); "
+        "got <conv>(%s, %s)\n"
         "Dimension numbers: {%s}.",
-        input_features, kernel_input_features,
+        input_features, kernel_input_features * feature_group_count,
         ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str(), dnums.DebugString().c_str());
   }
@@ -1975,14 +2009,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
         "{%s}; strides: {%s}.",
         message.c_str(), ShapeUtil::HumanString(arg).c_str(),
-        Join(starts, ",").c_str(), Join(limits, ",").c_str(),
-        Join(strides, ",").c_str());
+        StrJoin(starts, ",").c_str(), StrJoin(limits, ",").c_str(),
+        StrJoin(strides, ",").c_str());
   };
   TF_RETURN_IF_ERROR(ExpectArray(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
-      ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
-      Join(limits, ", ").c_str());
+      ShapeUtil::HumanString(arg).c_str(), StrJoin(starts, ", ").c_str(),
+      StrJoin(limits, ", ").c_str());
 
   if (starts.size() != limits.size()) {
     return error(Printf("slice start and limit sizes differ: %zu vs %zu",
@@ -2045,7 +2079,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
       "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
       ShapeUtil::HumanString(operand_shape).c_str(),
       ShapeUtil::HumanString(start_indices_shape).c_str(),
-      Join(slice_sizes, ", ").c_str());
+      StrJoin(slice_sizes, ", ").c_str());
 
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
@@ -2342,7 +2376,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     return InvalidArgument(
         "Reshape dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
-        Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
+        StrJoin(dimensions, ",").c_str(),
+        ShapeUtil::HumanString(operand).c_str());
   }
 
   return inferred_shape;
@@ -2462,8 +2497,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   if (arg_shapes.size() != to_apply.parameters_size()) {
     string computation_signature = ShapeUtil::HumanString(to_apply);
     string argument_shapes =
-        Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
-          tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
+        StrJoin(arg_shapes, ", ", [](string* out, const Shape* shape) {
+          absl::StrAppend(out, ShapeUtil::HumanString(*shape));
         });
     return InvalidArgument(
         "Call applied function arity must match number of arguments; got: "
@@ -2491,201 +2526,199 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 static Status ValidateGatherDimensionNumbers(
     const Shape& input_shape,
-    tensorflow::gtl::ArraySlice<int64> gather_indices_shape,
+    tensorflow::gtl::ArraySlice<int64> start_indices_shape,
     const GatherDimensionNumbers& dim_numbers) {
-  if (!c_is_sorted(dim_numbers.output_window_dims())) {
+  if (!absl::c_is_sorted(dim_numbers.offset_dims())) {
     return InvalidArgument(
         "Output window dimensions in gather op must be ascending; got: %s.",
-        Join(dim_numbers.output_window_dims(), ", ").c_str());
+        StrJoin(dim_numbers.offset_dims(), ", ").c_str());
   }
 
-  if (c_adjacent_find(dim_numbers.output_window_dims()) !=
-      dim_numbers.output_window_dims().end()) {
+  if (absl::c_adjacent_find(dim_numbers.offset_dims()) !=
+      dim_numbers.offset_dims().end()) {
     return InvalidArgument(
         "Output window dimensions in gather op must not repeat; got: %s.",
-        Join(dim_numbers.output_window_dims(), ", ").c_str());
+        StrJoin(dim_numbers.offset_dims(), ", ").c_str());
   }
 
-  const int64 output_window_dim_count = dim_numbers.output_window_dims_size();
+  const int64 output_offset_dim_count = dim_numbers.offset_dims_size();
   const int64 output_shape_rank =
-      output_window_dim_count + gather_indices_shape.size() - 1;
+      output_offset_dim_count + start_indices_shape.size() - 1;
 
-  for (int i = 0; i < dim_numbers.output_window_dims_size(); ++i) {
-    int64 window_index = dim_numbers.output_window_dims(i);
-    if (window_index < 0 || window_index >= output_shape_rank) {
+  for (int i = 0; i < dim_numbers.offset_dims_size(); ++i) {
+    int64 offset_dim = dim_numbers.offset_dims(i);
+    if (offset_dim < 0 || offset_dim >= output_shape_rank) {
       return InvalidArgument(
-          "Window index %d in gather op is out of bounds; got %lld, but should "
+          "Offset dimension %d in gather op is out of bounds; got %lld, but "
+          "should "
           "have been in [0,%lld).",
-          i, window_index, output_shape_rank);
+          i, offset_dim, output_shape_rank);
     }
   }
 
-  if (dim_numbers.gather_dims_to_operand_dims_size() !=
-      gather_indices_shape[dim_numbers.index_vector_dim()]) {
+  if (dim_numbers.start_index_map_size() !=
+      start_indices_shape[dim_numbers.index_vector_dim()]) {
     return InvalidArgument(
-        "Gather op has %d elements in gather_dims_to_operand_dims and the "
-        "bound of dimension index_vector_dim=%lld of gather_indices is "
+        "Gather op has %d elements in start_index_map and the "
+        "bound of dimension index_vector_dim=%lld of start_indices is "
         "%lld. These two numbers must be equal.",
-        dim_numbers.gather_dims_to_operand_dims_size(),
-        dim_numbers.index_vector_dim(),
-        gather_indices_shape[dim_numbers.index_vector_dim()]);
+        dim_numbers.start_index_map_size(), dim_numbers.index_vector_dim(),
+        start_indices_shape[dim_numbers.index_vector_dim()]);
   }
 
-  for (int i = 0; i < dim_numbers.gather_dims_to_operand_dims_size(); i++) {
-    int64 gather_dim_to_input_dim = dim_numbers.gather_dims_to_operand_dims(i);
-    if (gather_dim_to_input_dim < 0 ||
-        gather_dim_to_input_dim >= input_shape.dimensions_size()) {
+  for (int i = 0; i < dim_numbers.start_index_map_size(); i++) {
+    int64 operand_dim_for_start_index_i = dim_numbers.start_index_map(i);
+    if (operand_dim_for_start_index_i < 0 ||
+        operand_dim_for_start_index_i >= input_shape.dimensions_size()) {
       return InvalidArgument(
-          "Invalid gather_dims_to_operand_dims mapping; domain is [0, %d), "
-          "got: %d->%lld.",
-          input_shape.dimensions_size(), i, gather_dim_to_input_dim);
+          "Invalid start_index_map; domain is [0, %d), got: %d->%lld.",
+          input_shape.dimensions_size(), i, operand_dim_for_start_index_i);
     }
   }
 
-  std::vector<int64> sorted_gather_dims_to_operand_dims(
-      dim_numbers.gather_dims_to_operand_dims().begin(),
-      dim_numbers.gather_dims_to_operand_dims().end());
+  std::vector<int64> sorted_start_index_map(
+      dim_numbers.start_index_map().begin(),
+      dim_numbers.start_index_map().end());
 
-  c_sort(sorted_gather_dims_to_operand_dims);
+  absl::c_sort(sorted_start_index_map);
 
-  if (c_adjacent_find(sorted_gather_dims_to_operand_dims) !=
-      sorted_gather_dims_to_operand_dims.end()) {
+  if (absl::c_adjacent_find(sorted_start_index_map) !=
+      sorted_start_index_map.end()) {
     return InvalidArgument(
-        "Repeated dimensions are not allowed in gather_dims_to_operand_dims; "
+        "Repeated dimensions are not allowed in start_index_map; "
         "got: %s.",
-        Join(dim_numbers.gather_dims_to_operand_dims(), ", ").c_str());
+        StrJoin(dim_numbers.start_index_map(), ", ").c_str());
   }
 
-  for (int64 elided_dim : dim_numbers.elided_window_dims()) {
-    if (elided_dim < 0 || elided_dim >= input_shape.dimensions_size()) {
+  for (int64 collapsed_dim : dim_numbers.collapsed_slice_dims()) {
+    if (collapsed_dim < 0 || collapsed_dim >= input_shape.dimensions_size()) {
       return InvalidArgument(
-          "Invalid elided_window_dims set in gather op; valid range is [0, "
+          "Invalid collapsed_slice_dims set in gather op; valid range is [0, "
           "%d), got: %lld.",
-          input_shape.dimensions_size(), elided_dim);
+          input_shape.dimensions_size(), collapsed_dim);
     }
   }
 
-  if (!c_is_sorted(dim_numbers.elided_window_dims())) {
+  if (!absl::c_is_sorted(dim_numbers.collapsed_slice_dims())) {
     return InvalidArgument(
-        "elided_window_dims in gather op must be sorted; got: %s",
-        Join(dim_numbers.elided_window_dims(), ", ").c_str());
+        "collapsed_slice_dims in gather op must be sorted; got: %s",
+        StrJoin(dim_numbers.collapsed_slice_dims(), ", ").c_str());
   }
 
-  if (c_adjacent_find(dim_numbers.elided_window_dims()) !=
-      dim_numbers.elided_window_dims().end()) {
+  if (absl::c_adjacent_find(dim_numbers.collapsed_slice_dims()) !=
+      dim_numbers.collapsed_slice_dims().end()) {
     return InvalidArgument(
-        "Repeated dimensions not allowed in elided_window_dims in gather op; "
+        "Repeated dimensions not allowed in collapsed_slice_dims in gather op; "
         "got: %s.",
-        Join(dim_numbers.elided_window_dims(), ", ").c_str());
+        StrJoin(dim_numbers.collapsed_slice_dims(), ", ").c_str());
   }
 
   return Status::OK();
 }
 
 /*static*/ StatusOr<Shape> ShapeInference::InferGatherShape(
-    const Shape& input_shape, const Shape& gather_indices_shape,
+    const Shape& input_shape, const Shape& start_indices_shape,
     const GatherDimensionNumbers& gather_dim_numbers,
-    tensorflow::gtl::ArraySlice<int64> window_bounds) {
+    tensorflow::gtl::ArraySlice<int64> slice_sizes) {
   TF_RETURN_IF_ERROR(
       ExpectArray(input_shape, "input tensor operand gather op"));
   TF_RETURN_IF_ERROR(
-      ExpectArray(gather_indices_shape, "gather indices operand of gather op"));
+      ExpectArray(start_indices_shape, "gather indices operand of gather op"));
 
-  if (!ShapeUtil::ElementIsIntegral(gather_indices_shape)) {
+  if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
     return InvalidArgument(
         "Gather indices parameter must be an integral tensor; got %s.",
-        ShapeUtil::HumanString(gather_indices_shape).c_str());
+        ShapeUtil::HumanString(start_indices_shape).c_str());
   }
 
   // We implicitly reshape gather indices of shape P[A,B,C] to P[A,B,C,1] if
   // index_vector_dim is rank(P).  The bounds of this expanded shape is
-  // stored in expanded_gather_indices_shape.
+  // stored in expanded_start_indices_shape.
 
-  if (gather_indices_shape.dimensions_size() <
+  if (start_indices_shape.dimensions_size() <
           gather_dim_numbers.index_vector_dim() ||
       gather_dim_numbers.index_vector_dim() < 0) {
     return InvalidArgument(
-        "Gather index leaf dimension must be within [0, rank(gather_indices) + "
-        "1). rank(gather_indices) is %d and gather index leaf dimension is "
+        "Gather index leaf dimension must be within [0, rank(start_indices) + "
+        "1). rank(start_indices) is %d and gather index leaf dimension is "
         "%lld.",
-        gather_indices_shape.dimensions_size(),
+        start_indices_shape.dimensions_size(),
         gather_dim_numbers.index_vector_dim());
   }
 
-  std::vector<int64> expanded_gather_indices_shape;
-  expanded_gather_indices_shape.reserve(gather_indices_shape.dimensions_size());
-  c_copy(gather_indices_shape.dimensions(),
-         std::back_inserter(expanded_gather_indices_shape));
-  if (expanded_gather_indices_shape.size() ==
+  std::vector<int64> expanded_start_indices_shape;
+  expanded_start_indices_shape.reserve(start_indices_shape.dimensions_size());
+  absl::c_copy(start_indices_shape.dimensions(),
+               std::back_inserter(expanded_start_indices_shape));
+  if (expanded_start_indices_shape.size() ==
       gather_dim_numbers.index_vector_dim()) {
-    expanded_gather_indices_shape.push_back(1);
+    expanded_start_indices_shape.push_back(1);
   }
 
   TF_RETURN_IF_ERROR(ValidateGatherDimensionNumbers(
-      input_shape, expanded_gather_indices_shape, gather_dim_numbers));
+      input_shape, expanded_start_indices_shape, gather_dim_numbers));
 
-  if (window_bounds.size() != input_shape.dimensions_size()) {
+  if (slice_sizes.size() != input_shape.dimensions_size()) {
     return InvalidArgument(
-        "Gather op must have one window bound for every input dimension; got: "
-        "len(window_bounds)=%lu, input_shape.rank=%d.",
-        window_bounds.size(), input_shape.dimensions_size());
+        "Gather op must have one slice size for every input dimension; got: "
+        "len(slice_sizes)=%lu, input_shape.rank=%d.",
+        slice_sizes.size(), input_shape.dimensions_size());
   }
 
-  if (window_bounds.size() !=
-      gather_dim_numbers.output_window_dims_size() +
-          gather_dim_numbers.elided_window_dims_size()) {
+  if (slice_sizes.size() !=
+      gather_dim_numbers.offset_dims_size() +
+          gather_dim_numbers.collapsed_slice_dims_size()) {
     return InvalidArgument(
-        "All components of the window index in a gather op must either be a "
-        "output window index or explicitly elided; got len(window_bounds)=%lu, "
-        "output_window_bounds=%s, elided_window_bounds=%s.",
-        window_bounds.size(),
-        Join(gather_dim_numbers.output_window_dims(), ",").c_str(),
-        Join(gather_dim_numbers.elided_window_dims(), ",").c_str());
+        "All components of the offset index in a gather op must either be a "
+        "offset dimension or explicitly collapsed; got len(slice_sizes)=%lu, "
+        "output_slice_sizes=%s, collapsed_slice_dims=%s.",
+        slice_sizes.size(),
+        StrJoin(gather_dim_numbers.offset_dims(), ",").c_str(),
+        StrJoin(gather_dim_numbers.collapsed_slice_dims(), ",").c_str());
   }
 
-  for (int i = 0; i < window_bounds.size(); i++) {
-    int64 window_bound = window_bounds[i];
-    int64 corresponding_input_bound = input_shape.dimensions(i);
-    if (window_bound < 0 || window_bound > corresponding_input_bound) {
+  for (int i = 0; i < slice_sizes.size(); i++) {
+    int64 slice_size = slice_sizes[i];
+    int64 corresponding_input_size = input_shape.dimensions(i);
+    if (slice_size < 0 || slice_size > corresponding_input_size) {
       return InvalidArgument(
-          "Window bound at index %d in gather op is out of range, must be "
-          "within "
-          "[0, %lld), got %lld.",
-          i, corresponding_input_bound + 1, window_bound);
+          "Slice size at index %d in gather op is out of range, must be "
+          "within [0, %lld), got %lld.",
+          i, corresponding_input_size + 1, slice_size);
     }
   }
 
-  for (int i = 0; i < gather_dim_numbers.elided_window_dims_size(); i++) {
-    if (window_bounds[gather_dim_numbers.elided_window_dims(i)] != 1) {
+  for (int i = 0; i < gather_dim_numbers.collapsed_slice_dims_size(); i++) {
+    if (slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)] != 1) {
       return InvalidArgument(
-          "Gather op can only elide window indices with bound 1, but bound is "
+          "Gather op can only collapse slice dims with bound 1, but bound is "
           "%lld for index %lld at position %d.",
-          window_bounds[gather_dim_numbers.elided_window_dims(i)],
-          gather_dim_numbers.elided_window_dims(i), i);
+          slice_sizes[gather_dim_numbers.collapsed_slice_dims(i)],
+          gather_dim_numbers.collapsed_slice_dims(i), i);
     }
   }
 
-  int64 result_rank = gather_dim_numbers.output_window_dims_size() +
-                      (expanded_gather_indices_shape.size() - 1);
-  int64 window_dims_seen = 0;
+  int64 result_rank = gather_dim_numbers.offset_dims_size() +
+                      (expanded_start_indices_shape.size() - 1);
+  int64 offset_dims_seen = 0;
   int64 gather_dims_seen = 0;
   std::vector<int64> output_dim_bounds;
   output_dim_bounds.reserve(result_rank);
   for (int64 i = 0; i < result_rank; i++) {
     int64 current_bound;
     bool is_window_index =
-        c_binary_search(gather_dim_numbers.output_window_dims(), i);
+        absl::c_binary_search(gather_dim_numbers.offset_dims(), i);
     if (is_window_index) {
-      while (c_binary_search(gather_dim_numbers.elided_window_dims(),
-                             window_dims_seen)) {
-        window_dims_seen++;
+      while (absl::c_binary_search(gather_dim_numbers.collapsed_slice_dims(),
+                                   offset_dims_seen)) {
+        offset_dims_seen++;
       }
-      current_bound = window_bounds[window_dims_seen++];
+      current_bound = slice_sizes[offset_dims_seen++];
     } else {
       if (gather_dims_seen == gather_dim_numbers.index_vector_dim()) {
         gather_dims_seen++;
       }
-      current_bound = expanded_gather_indices_shape[gather_dims_seen++];
+      current_bound = expanded_start_indices_shape[gather_dims_seen++];
     }
 
     output_dim_bounds.push_back(current_bound);
@@ -2701,16 +2734,16 @@ Status ValidateScatterDimensionNumbers(
     tensorflow::gtl::ArraySlice<int64> scatter_indices_shape,
     const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) {
   // Validate update_window_dims in ScatterDimensionNumbers.
-  if (!c_is_sorted(dim_numbers.update_window_dims())) {
+  if (!absl::c_is_sorted(dim_numbers.update_window_dims())) {
     return InvalidArgument(
         "update_window_dims in scatter op must be sorted; got: %s.",
-        Join(dim_numbers.update_window_dims(), ", ").c_str());
+        StrJoin(dim_numbers.update_window_dims(), ", ").c_str());
   }
-  if (c_adjacent_find(dim_numbers.update_window_dims()) !=
+  if (absl::c_adjacent_find(dim_numbers.update_window_dims()) !=
       dim_numbers.update_window_dims().end()) {
     return InvalidArgument(
         "update_window_dims in scatter op must not repeat; got: %s.",
-        Join(dim_numbers.update_window_dims(), ", ").c_str());
+        StrJoin(dim_numbers.update_window_dims(), ", ").c_str());
   }
   const int64 updates_rank = ShapeUtil::Rank(updates_shape);
   for (int64 window_dim : dim_numbers.update_window_dims()) {
@@ -2723,16 +2756,16 @@ Status ValidateScatterDimensionNumbers(
   }
 
   // Validate inserted_window_dims in ScatterDimensionNumbers.
-  if (!c_is_sorted(dim_numbers.inserted_window_dims())) {
+  if (!absl::c_is_sorted(dim_numbers.inserted_window_dims())) {
     return InvalidArgument(
         "inserted_window_dims in scatter op must be sorted; got: %s.",
-        Join(dim_numbers.inserted_window_dims(), ", ").c_str());
+        StrJoin(dim_numbers.inserted_window_dims(), ", ").c_str());
   }
-  if (c_adjacent_find(dim_numbers.inserted_window_dims()) !=
+  if (absl::c_adjacent_find(dim_numbers.inserted_window_dims()) !=
       dim_numbers.inserted_window_dims().end()) {
     return InvalidArgument(
         "inserted_window_dims in scatter op must not repeat; got: %s.",
-        Join(dim_numbers.inserted_window_dims(), ", ").c_str());
+        StrJoin(dim_numbers.inserted_window_dims(), ", ").c_str());
   }
   for (int64 inserted_dim : dim_numbers.inserted_window_dims()) {
     if (inserted_dim < 0 || inserted_dim >= operand_shape.dimensions_size()) {
@@ -2768,13 +2801,13 @@ Status ValidateScatterDimensionNumbers(
   std::vector<int64> sorted_scatter_dims_to_operand_dims(
       dim_numbers.scatter_dims_to_operand_dims().begin(),
       dim_numbers.scatter_dims_to_operand_dims().end());
-  c_sort(sorted_scatter_dims_to_operand_dims);
-  if (c_adjacent_find(sorted_scatter_dims_to_operand_dims) !=
+  absl::c_sort(sorted_scatter_dims_to_operand_dims);
+  if (absl::c_adjacent_find(sorted_scatter_dims_to_operand_dims) !=
       sorted_scatter_dims_to_operand_dims.end()) {
     return InvalidArgument(
         "Repeated dimensions not allowed in scatter_dims_to_operand_dims; "
         "got: %s.",
-        Join(dim_numbers.scatter_dims_to_operand_dims(), ", ").c_str());
+        StrJoin(dim_numbers.scatter_dims_to_operand_dims(), ", ").c_str());
   }
 
   return Status::OK();
@@ -2836,32 +2869,32 @@ Status ValidateScatterDimensionNumbers(
       scatter_dim_numbers));
 
   int64 inserted_dims_seen = 0;
-  std::vector<int64> max_update_window_bounds;
+  std::vector<int64> max_update_slice_sizes;
   for (int i = 0; i < operand_shape.dimensions_size(); ++i) {
     if (inserted_dims_seen < scatter_dim_numbers.inserted_window_dims_size() &&
         scatter_dim_numbers.inserted_window_dims(inserted_dims_seen) == i) {
       ++inserted_dims_seen;
     } else {
-      max_update_window_bounds.push_back(operand_shape.dimensions(i));
+      max_update_slice_sizes.push_back(operand_shape.dimensions(i));
     }
   }
   for (int i = 0; i < scatter_dim_numbers.update_window_dims_size(); ++i) {
     auto update_window_dim = scatter_dim_numbers.update_window_dims(i);
     if (updates_shape.dimensions(update_window_dim) >
-        max_update_window_bounds[i]) {
+        max_update_slice_sizes[i]) {
       return InvalidArgument(
           "Bounds of the window dimensions of updates must not exceed the "
           "bounds of the corresponding dimensions of operand. For dimension "
           "%lld, updates bound is %lld, operand bound is %lld.",
           update_window_dim, updates_shape.dimensions(update_window_dim),
-          max_update_window_bounds[i]);
+          max_update_slice_sizes[i]);
     }
   }
 
   int64 scatter_dims_seen = 0;
   for (int64 i = 0; i < ShapeUtil::Rank(updates_shape); ++i) {
     bool is_update_window_dim =
-        c_binary_search(scatter_dim_numbers.update_window_dims(), i);
+        absl::c_binary_search(scatter_dim_numbers.update_window_dims(), i);
     if (is_update_window_dim) {
       continue;
     }
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index c185b0a1bd..4974ac9916 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -112,7 +112,8 @@ class ShapeInference {
   // filter (rhs) to lhs in the way specified by the fields on window.
   static StatusOr<Shape> InferConvolveShape(
       const Shape& lhs, const Shape& rhs, const Window& window,
-      const ConvolutionDimensionNumbers& dimension_numbers);
+      const ConvolutionDimensionNumbers& dimension_numbers,
+      int64 feature_group_count = 1);
 
   // Infers the shape produced by the given FFT type on the given operand.
   static StatusOr<Shape> InferFftShape(
@@ -275,9 +276,9 @@ class ShapeInference {
   // with the given input shape, gather indices shape and gather dimension
   // numbers.
   static StatusOr<Shape> InferGatherShape(
-      const Shape& input_shape, const Shape& gather_indices_shape,
+      const Shape& input_shape, const Shape& start_indices_shape,
       const GatherDimensionNumbers& gather_dim_numbers,
-      tensorflow::gtl::ArraySlice<int64> window_bounds);
+      tensorflow::gtl::ArraySlice<int64> slice_sizes);
 
   // Helper that validates the given input shape, scatter indices shape, updates
   // shape, and scatter dimension numbers that constitute a scatter operation,
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index a73fa181cd..4ed8fc6b86 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -1654,11 +1654,11 @@ TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGather) {
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_vector_32_,
                               HloGatherInstruction::MakeGatherDimNumbers(
-                                  /*output_window_dims=*/{0},
-                                  /*elided_window_dims=*/{1},
-                                  /*gather_dims_to_operand_dims=*/{1},
+                                  /*offset_dims=*/{0},
+                                  /*collapsed_slice_dims=*/{1},
+                                  /*start_index_map=*/{1},
                                   /*index_vector_dim=*/1),
-                              /*window_bounds=*/{64, 1}));
+                              /*slice_sizes=*/{64, 1}));
   EXPECT_TRUE(
       ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {64, 32})))
       << ShapeUtil::HumanString(gather_shape);
@@ -1669,11 +1669,11 @@ TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGatherV2) {
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_vector_32_,
                               HloGatherInstruction::MakeGatherDimNumbers(
-                                  /*output_window_dims=*/{1},
-                                  /*elided_window_dims=*/{0},
-                                  /*gather_dims_to_operand_dims=*/{0},
+                                  /*offset_dims=*/{1},
+                                  /*collapsed_slice_dims=*/{0},
+                                  /*start_index_map=*/{0},
                                   /*index_vector_dim=*/1),
-                              /*window_bounds=*/{1, 48}));
+                              /*slice_sizes=*/{1, 48}));
   EXPECT_TRUE(
       ShapeUtil::Equal(gather_shape, ShapeUtil::MakeShape(F32, {32, 48})))
       << ShapeUtil::HumanString(gather_shape);
@@ -1684,11 +1684,11 @@ TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGatherNd) {
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_4d_tensor_10_9_8_7_1_,
                               HloGatherInstruction::MakeGatherDimNumbers(
-                                  /*output_window_dims=*/{4},
-                                  /*elided_window_dims=*/{0},
-                                  /*gather_dims_to_operand_dims=*/{0},
+                                  /*offset_dims=*/{4},
+                                  /*collapsed_slice_dims=*/{0},
+                                  /*start_index_map=*/{0},
                                   /*index_vector_dim=*/4),
-                              /*window_bounds=*/{1, 48}));
+                              /*slice_sizes=*/{1, 48}));
   EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
                                ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 48})))
       << ShapeUtil::HumanString(gather_shape);
@@ -1700,11 +1700,11 @@ TEST_F(ScatterGatherShapeInferenceTest, TensorFlowBatchDynamicSlice) {
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
           HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*offset_dims=*/{4, 5, 6, 7, 8},
+              /*collapsed_slice_dims=*/{},
+              /*start_index_map=*/{0, 1, 2, 3, 4},
               /*index_vector_dim=*/4),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+          /*slice_sizes=*/{30, 29, 28, 27, 26}));
   EXPECT_TRUE(ShapeUtil::Equal(
       gather_shape,
       ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26})))
@@ -1717,11 +1717,11 @@ TEST_F(ScatterGatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) {
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_,
           HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*offset_dims=*/{4, 5, 6, 7, 8},
+              /*collapsed_slice_dims=*/{},
+              /*start_index_map=*/{0, 1, 2, 3, 4},
               /*index_vector_dim=*/2),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+          /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   EXPECT_TRUE(ShapeUtil::Equal(
       gather_shape,
@@ -1735,11 +1735,11 @@ TEST_F(ScatterGatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) {
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_5_10_9_7_6_,
           HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{4, 5, 6, 7, 8},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+              /*offset_dims=*/{4, 5, 6, 7, 8},
+              /*collapsed_slice_dims=*/{},
+              /*start_index_map=*/{0, 1, 2, 3, 4},
               /*index_vector_dim=*/0),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+          /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   EXPECT_TRUE(ShapeUtil::Equal(
       gather_shape,
@@ -1749,16 +1749,15 @@ TEST_F(ScatterGatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) {
 
 TEST_F(ScatterGatherShapeInferenceTest, NoOutputGatherDims) {
   // This is equivalent to a dynamic slice.
-  TF_ASSERT_OK_AND_ASSIGN(
-      Shape gather_shape,
-      ShapeInference::InferGatherShape(
-          f32_5d_tensor_50_49_48_47_46_, s64_vector_5_,
-          HloGatherInstruction::MakeGatherDimNumbers(
-              /*output_window_dims=*/{0, 1, 2, 3, 4},
-              /*elided_window_dims=*/{},
-              /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
-              /*index_vector_dim=*/0),
-          /*window_bounds=*/{30, 29, 28, 27, 26}));
+  TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape,
+                          ShapeInference::InferGatherShape(
+                              f32_5d_tensor_50_49_48_47_46_, s64_vector_5_,
+                              HloGatherInstruction::MakeGatherDimNumbers(
+                                  /*offset_dims=*/{0, 1, 2, 3, 4},
+                                  /*collapsed_slice_dims=*/{},
+                                  /*start_index_map=*/{0, 1, 2, 3, 4},
+                                  /*index_vector_dim=*/0),
+                              /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
   EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
                                ShapeUtil::MakeShape(F32, {30, 29, 28, 27, 26})))
@@ -1772,11 +1771,11 @@ TEST_F(ScatterGatherShapeInferenceTest, ScalarGatherIndices) {
                           ShapeInference::InferGatherShape(
                               f32_5d_tensor_50_49_48_47_46_, s64_scalar_,
                               HloGatherInstruction::MakeGatherDimNumbers(
-                                  /*output_window_dims=*/{0, 1, 2, 3},
-                                  /*elided_window_dims=*/{0},
-                                  /*gather_dims_to_operand_dims=*/{0},
+                                  /*offset_dims=*/{0, 1, 2, 3},
+                                  /*collapsed_slice_dims=*/{0},
+                                  /*start_index_map=*/{0},
                                   /*index_vector_dim=*/0),
-                              /*window_bounds=*/{1, 30, 29, 28, 27}));
+                              /*slice_sizes=*/{1, 30, 29, 28, 27}));
 
   EXPECT_TRUE(ShapeUtil::Equal(gather_shape,
                                ShapeUtil::MakeShape(F32, {30, 29, 28, 27})))
@@ -1787,11 +1786,11 @@ TEST_F(ScatterGatherShapeInferenceTest, TupleShapedTensorInput) {
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       tuple_shape_, s64_vector_32_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{0},
-          /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{1},
+          /*offset_dims=*/{0},
+          /*collapsed_slice_dims=*/{1},
+          /*start_index_map=*/{1},
           /*index_vector_dim=*/1),
-      /*window_bounds=*/{64, 1});
+      /*slice_sizes=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("Expected array argument for input"))
@@ -1802,11 +1801,11 @@ TEST_F(ScatterGatherShapeInferenceTest, TupleShapedGatherIndicesInput) {
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       s64_vector_32_, tuple_shape_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{0},
-          /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{1},
+          /*offset_dims=*/{0},
+          /*collapsed_slice_dims=*/{1},
+          /*start_index_map=*/{1},
           /*index_vector_dim=*/0),
-      /*window_bounds=*/{64, 1});
+      /*slice_sizes=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("Expected array argument for gather indices"))
@@ -1817,11 +1816,11 @@ TEST_F(ScatterGatherShapeInferenceTest, FloatingPointGatherIndicesInput) {
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       s64_vector_32_, vector_32_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{0},
-          /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{1},
+          /*offset_dims=*/{0},
+          /*collapsed_slice_dims=*/{1},
+          /*start_index_map=*/{1},
           /*index_vector_dim=*/0),
-      /*window_bounds=*/{64, 1});
+      /*slice_sizes=*/{64, 1});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("Gather indices parameter must be an integral tensor"))
@@ -1833,11 +1832,11 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 8, 7},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 8, 7},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
@@ -1850,11 +1849,11 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 7},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 7},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
@@ -1867,14 +1866,14 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 99, 100, 101},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 99, 100, 101},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Window index 2 in gather op is out of bounds"))
+              HasSubstr("Offset dimension 2 in gather op is out of bounds"))
       << statusor.status();
 }
 
@@ -1883,14 +1882,14 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 9},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 9},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Window index 4 in gather op is out of bounds"))
+              HasSubstr("Offset dimension 4 in gather op is out of bounds"))
       << statusor.status();
 }
 
@@ -1899,16 +1898,16 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{4},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{4},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
-      HasSubstr("All components of the window index in a gather op must either "
-                "be a output window index or explicitly elided"))
+      HasSubstr("All components of the offset index in a gather op must either "
+                "be a offset dimension or explicitly collapsed"))
       << statusor.status();
 }
 
@@ -1917,14 +1916,14 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{0, 1, 2, 3, 19},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{0, 1, 2, 3, 19},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Invalid elided_window_dims set in gather op; valid "
+              HasSubstr("Invalid collapsed_slice_dims set in gather op; valid "
                         "range is [0, 5), got: 19"))
       << statusor.status();
 }
@@ -1934,16 +1933,15 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{0, 1, 2, 3, 3},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{0, 1, 2, 3, 3},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
-  EXPECT_THAT(
-      statusor.status().error_message(),
-      HasSubstr(
-          "Repeated dimensions not allowed in elided_window_dims in gather op"))
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Repeated dimensions not allowed in "
+                        "collapsed_slice_dims in gather op"))
       << statusor.status();
 }
 
@@ -1952,17 +1950,16 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
-  EXPECT_THAT(
-      statusor.status().error_message(),
-      HasSubstr("Gather op has 4 elements in gather_dims_to_operand_dims and "
-                "the bound of dimension index_vector_dim=4 of "
-                "gather_indices is 5. These two numbers must be equal."))
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Gather op has 4 elements in start_index_map and "
+                        "the bound of dimension index_vector_dim=4 of "
+                        "start_indices is 5. These two numbers must be equal."))
       << statusor.status();
 }
 
@@ -1971,16 +1968,14 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 7},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 7},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
-  EXPECT_THAT(
-      statusor.status().error_message(),
-      HasSubstr("Invalid gather_dims_to_operand_dims mapping; domain is "
-                "[0, 5), got: 4->7"))
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("Invalid start_index_map; domain is [0, 5), got: 4->7"))
       << statusor.status();
 }
 
@@ -1989,16 +1984,15 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 3},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 3},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
-      HasSubstr(
-          "Repeated dimensions are not allowed in gather_dims_to_operand_dims"))
+      HasSubstr("Repeated dimensions are not allowed in start_index_map"))
       << statusor.status();
 }
 
@@ -2007,14 +2001,14 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{2, 1},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{2, 1},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{1, 1, 28, 27, 26});
+      /*slice_sizes=*/{1, 1, 28, 27, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("elided_window_dims in gather op must be sorted"))
+              HasSubstr("collapsed_slice_dims in gather op must be sorted"))
       << statusor.status();
 }
 
@@ -2023,15 +2017,15 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7},
-          /*elided_window_dims=*/{2},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7},
+          /*collapsed_slice_dims=*/{2},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 1, 300, 26});
+      /*slice_sizes=*/{30, 29, 1, 300, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Window bound at index 3 in gather op is out of range, "
-                        "must be within [0, 48), got 300"))
+              HasSubstr("Slice size at index 3 in gather op is out of range, "
+                        "must be within [0, 48), got 300."))
       << statusor.status();
 }
 
@@ -2040,16 +2034,15 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 26});
+      /*slice_sizes=*/{30, 29, 28, 26});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(
       statusor.status().error_message(),
-      HasSubstr(
-          "Gather op must have one window bound for every input dimension"))
+      HasSubstr("Gather op must have one slice size for every input dimension"))
       << statusor.status();
 }
 
@@ -2058,15 +2051,15 @@ TEST_F(ScatterGatherShapeInferenceTest,
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7},
-          /*elided_window_dims=*/{1},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7},
+          /*collapsed_slice_dims=*/{1},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/4),
-      /*window_bounds=*/{30, 29, 28, 26, 20});
+      /*slice_sizes=*/{30, 29, 28, 26, 20});
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
-              HasSubstr("Gather op can only elide window indices with bound 1, "
-                        "but bound is 29 for index 1 at position 0"))
+              HasSubstr("Gather op can only collapse slice dims with bound 1, "
+                        "but bound is 29 for index 1 at position 0."))
       << statusor.status();
 }
 
@@ -2074,16 +2067,16 @@ TEST_F(ScatterGatherShapeInferenceTest, OutOfBoundsGatherIndicesLeafDim) {
   StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_,
       HloGatherInstruction::MakeGatherDimNumbers(
-          /*output_window_dims=*/{4, 5, 6, 7, 8},
-          /*elided_window_dims=*/{},
-          /*gather_dims_to_operand_dims=*/{0, 1, 2, 3, 4},
+          /*offset_dims=*/{4, 5, 6, 7, 8},
+          /*collapsed_slice_dims=*/{},
+          /*start_index_map=*/{0, 1, 2, 3, 4},
           /*index_vector_dim=*/32),
-      /*window_bounds=*/{30, 29, 28, 27, 26});
+      /*slice_sizes=*/{30, 29, 28, 27, 26});
 
   ASSERT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("Gather index leaf dimension must be within [0, "
-                        "rank(gather_indices) + 1)"))
+                        "rank(start_indices) + 1)"))
       << statusor.status();
 }
 
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 7d7dcac10b..5c12dc37b7 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -18,8 +18,9 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -76,7 +77,7 @@ void ShapedBuffer::clear() {
 }
 
 string ShapedBuffer::ToString() const {
-  string s = tensorflow::strings::StrCat(
+  string s = absl::StrCat(
       "ShapedBuffer(", platform_->Name(), ":", device_ordinal(),
       "), on-host shape=" + ShapeUtil::HumanStringWithLayout(on_host_shape()),
       ", on-device shape=" +
diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
index 0fc2436679..d69e6362e9 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -34,7 +35,7 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) {
   xla::StreamExecutorMemoryAllocator allocator(platform, executors);
   const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {});
   const int kDeviceOrdinal = 0;
-  auto scoped_buffer = tensorflow::MakeUnique<xla::ScopedShapedBuffer>(
+  auto scoped_buffer = absl::make_unique<xla::ScopedShapedBuffer>(
       shape, shape, &allocator, kDeviceOrdinal);
   std::unique_ptr<xla::ShapedBuffer> buffer = std::move(scoped_buffer);
   buffer = nullptr;
diff --git a/tensorflow/compiler/xla/service/source_map_util.h b/tensorflow/compiler/xla/service/source_map_util.h
index 18e2651abb..84607cd012 100644
--- a/tensorflow/compiler/xla/service/source_map_util.h
+++ b/tensorflow/compiler/xla/service/source_map_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
-#define TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SOURCE_MAP_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SOURCE_MAP_UTIL_H_
 
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/status.h"
@@ -43,4 +43,4 @@ Status InvalidParameterArgument(const OpMetadata& op_metadata,
 }  // namespace source_map_util
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SOURCE_MAP_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/stream_pool.cc b/tensorflow/compiler/xla/service/stream_pool.cc
index c0582c6a2d..5d1cd1c442 100644
--- a/tensorflow/compiler/xla/service/stream_pool.cc
+++ b/tensorflow/compiler/xla/service/stream_pool.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/stream_pool.h"
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -35,7 +35,7 @@ StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor) {
 
   if (!stream) {
     // Create a new stream.
-    stream = MakeUnique<se::Stream>(executor);
+    stream = absl::make_unique<se::Stream>(executor);
     stream->Init();
     VLOG(1) << stream->DebugStreamPointers()
             << " StreamPool created new stream";
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 32d368a904..0c577ec67a 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -27,7 +29,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/notification.h"
 
-using ::tensorflow::strings::StrCat;
+using absl::StrCat;
 
 namespace xla {
 /* static */ tensorflow::mutex
@@ -61,7 +63,7 @@ StatusOr<std::unique_ptr<Literal>> TransferManager::TransferLiteralFromDevice(
   if (!s.ok()) {
     return s;
   }
-  return MakeUnique<Literal>(std::move(literal));
+  return absl::make_unique<Literal>(std::move(literal));
 }
 
 Status TransferManager::TransferLiteralFromDevice(
@@ -120,7 +122,7 @@ StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
   if (!s.ok()) {
     return s;
   }
-  return MakeUnique<Literal>(std::move(literal));
+  return absl::make_unique<Literal>(std::move(literal));
 }
 
 Status TransferManager::TransferArrayToDevice(
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 475a2e5c14..f77690a462 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -152,6 +152,26 @@ class TransferManager {
       const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
       int device_ordinal);
 
+  // The given ShapedBuffer holds a handle to allocated memory, but it is not
+  // in the general case legal to immediately copy or access that allocated
+  // memory because queued operations on the device may alias that memory.
+  // Memory ordering is enforced by the Stream's happens-before relationship
+  // which allows eager deallocation and reallocation of buffers host-side even
+  // if the device hasn't finished with them.
+  //
+  // In certain cases, it can be known that a ShapedBuffer does not have any
+  // conflicting accesses on the device and thus is eligible to be accessed at
+  // any time from the host.
+  //
+  // This function returns true if device_buffer can be accessed immediately
+  // without waiting for the Stream's previously enqueued items. This only
+  // returns true if all subbuffers in device_buffer can be accessed
+  // immediately.
+  virtual bool CanShapedBufferBeAccessedNow(
+      se::StreamExecutor* executor, const ShapedBuffer& device_buffer) const {
+    return false;
+  }
+
   /////
   // The TransferManager class also serves as a point to register objects for
   // the various platforms.
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 49e1f87319..530f40e4b2 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -109,6 +109,7 @@ Status FoldTransposeIntoDot(InstructionOperandsPair pair) {
 
   std::unique_ptr<HloInstruction> new_dot = HloInstruction::CreateDot(
       dot->shape(), new_lhs, new_rhs, new_dim_numbers);
+  new_dot->set_precision_config(dot->precision_config());
   return dot->parent()->ReplaceWithNewInstruction(dot, std::move(new_dot));
 }
 
@@ -178,6 +179,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   auto new_conv = HloInstruction::CreateConvolve(
       convolution.shape(), new_lhs, new_rhs, convolution.window(), new_dnums);
+  new_conv->set_precision_config(convolution.precision_config());
   TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction(
       &convolution, std::move(new_conv)));
 
diff --git a/tensorflow/compiler/xla/service/transpose_folding.h b/tensorflow/compiler/xla/service/transpose_folding.h
index 71e8446452..3e5aa2db60 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.h
+++ b/tensorflow/compiler/xla/service/transpose_folding.h
@@ -49,7 +49,7 @@ class TransposeFolding : public HloPassInterface {
   explicit TransposeFolding(
       TransposableGemmOperandsFn transposable_gemm_operands,
       TransposableConvOperandsFn transposable_conv_operands);
-  tensorflow::StringPiece name() const override { return "transpose-folding"; }
+  absl::string_view name() const override { return "transpose-folding"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 0447807a41..cb07b8d4d3 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -19,6 +19,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -26,17 +29,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
 string BufferAlias::ToString() const {
-  return tensorflow::strings::StrCat("BufferAlias(", instruction_->name(), "[",
-                                     tensorflow::str_util::Join(index_, ","),
-                                     "])");
+  return absl::StrCat("BufferAlias(", instruction_->name(), "[",
+                      absl::StrJoin(index_, ","), "])");
 }
 
 std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias) {
@@ -441,7 +441,7 @@ PointsToSet& TuplePointsToAnalysis::CreateEmptyPointsToSet(
   PerInstruction* pi = PerInst(instruction);
   CHECK(pi->points_to_set == nullptr)
       << "instruction should not have been present in the map.";
-  auto set = MakeUnique<PointsToSet>(&instruction->shape());
+  auto set = absl::make_unique<PointsToSet>(&instruction->shape());
   pi->points_to_set = std::move(set);
   // Return *set using the iterator returned by emplace.
   return *pi->points_to_set;
@@ -495,8 +495,7 @@ StatusOr<const LogicalBuffer*> TuplePointsToAnalysis::GetBufferDefinedAt(
   if (buffers.size() != 1 || buffers[0]->instruction() != instruction) {
     return FailedPrecondition(
         "instruction %s does not define buffer at index {%s}",
-        instruction->name().c_str(),
-        tensorflow::str_util::Join(index, ",").c_str());
+        instruction->name().c_str(), absl::StrJoin(index, ",").c_str());
   }
   return buffers[0];
 }
@@ -562,8 +561,7 @@ string TuplePointsToAnalysis::ToString() const {
   for (const auto* computation : module_->MakeNonfusionComputations()) {
     const char* entry =
         computation == module_->entry_computation() ? "entry " : "";
-    tensorflow::strings::StrAppend(&output, entry, "computation ",
-                                   computation->name(), ":\n");
+    absl::StrAppend(&output, entry, "computation ", computation->name(), ":\n");
     for (const HloInstruction* instruction :
          computation->MakeInstructionPostOrder()) {
       InstructionToString(instruction, &output);
@@ -575,12 +573,11 @@ string TuplePointsToAnalysis::ToString() const {
     }
   }
 
-  tensorflow::strings::StrAppend(&output, "LogicalBuffers:\n");
+  absl::StrAppend(&output, "LogicalBuffers:\n");
   for (const auto& b : logical_buffer_analysis_->logical_buffers()) {
-    tensorflow::strings::StrAppend(&output, "  buffer ", b->ToString(), ":\n");
+    absl::StrAppend(&output, "  buffer ", b->ToString(), ":\n");
     for (const BufferAlias& alias : logical_buffer_aliases_.at(b->id())) {
-      tensorflow::strings::StrAppend(&output, "    alias ", alias.ToString(),
-                                     "\n");
+      absl::StrAppend(&output, "    alias ", alias.ToString(), "\n");
     }
   }
   return output;
@@ -589,20 +586,18 @@ string TuplePointsToAnalysis::ToString() const {
 void TuplePointsToAnalysis::InstructionToString(
     const HloInstruction* instruction, string* output) const {
   const string prefix = instruction->IsFused() ? "    " : "";
-  tensorflow::strings::StrAppend(output, prefix, "  instruction ",
-                                 instruction->ToShortString(), ":\n");
+  absl::StrAppend(output, prefix, "  instruction ",
+                  instruction->ToShortString(), ":\n");
   const PointsToSet& points_to_set = GetPointsToSet(instruction);
   points_to_set.ForEachElement([&prefix, &output](
                                    const ShapeIndex& index,
                                    const PointsToSet::BufferList& points_to) {
-    tensorflow::strings::StrAppend(
-        output, prefix, "    {", tensorflow::str_util::Join(index, ","), "}: ",
-        tensorflow::str_util::Join(
-            points_to, ", ",
-            [](string* out, const LogicalBuffer* source) {
-              out->append(source->ToString());
-            }),
-        "\n");
+    absl::StrAppend(output, prefix, "    {", absl::StrJoin(index, ","), "}: ",
+                    absl::StrJoin(points_to, ", ",
+                                  [](string* out, const LogicalBuffer* source) {
+                                    out->append(source->ToString());
+                                  }),
+                    "\n");
   });
 }
 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index 686bb05328..62c7bb685d 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -109,7 +110,7 @@ class PointsToSet {
   // Add a tuple source instruction for the given index.
   void add_tuple_source(const ShapeIndex& index, HloInstruction* tuple);
 
-  using BufferList = tensorflow::gtl::InlinedVector<const LogicalBuffer*, 1>;
+  using BufferList = absl::InlinedVector<const LogicalBuffer*, 1>;
 
   // Return the list of logical buffers for the subshape at index.
   const BufferList& element(const ShapeIndex& index) const {
@@ -203,7 +204,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   // logical buffer The buffer alias set is the inverse of the points-to set.
   // That is, LogicalBuffer B is in the points-to set of instruction I at index
   // N iff instruction I, index N is a BufferAlias of B.
-  using BufferAliasVector = tensorflow::gtl::InlinedVector<BufferAlias, 1>;
+  using BufferAliasVector = absl::InlinedVector<BufferAlias, 1>;
   const BufferAliasVector& GetBufferAliases(const LogicalBuffer& buffer) const;
 
   // Returns the number of logical buffers in the module
@@ -226,8 +227,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   // instructions produce a single buffer (the top-level buffer), some produce
   // no buffers (eg bitcast), and some produce more than one buffer (eg,
   // tuple-shaped parameters).
-  using BufferDefinitionVector =
-      tensorflow::gtl::InlinedVector<const LogicalBuffer*, 1>;
+  using BufferDefinitionVector = absl::InlinedVector<const LogicalBuffer*, 1>;
   const BufferDefinitionVector& GetBuffersDefinedByInstruction(
       const HloInstruction* instruction) const;
 
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h
index 7509501883..8c91d6e69d 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.h
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.h
@@ -30,7 +30,7 @@ class TupleSimplifier : public HloPassInterface {
   TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {}
   explicit TupleSimplifier(bool exclude_entry_computation);
   ~TupleSimplifier() override {}
-  tensorflow::StringPiece name() const override { return "tuple-simplifier"; }
+  absl::string_view name() const override { return "tuple-simplifier"; }
 
   // Run tuple simplification on the given computation. Returns whether the
   // computation was changed.
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc
index af2cb6dc2a..7e4ac92a7c 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.cc
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 namespace xla {
 
-using tensorflow::gtl::nullopt;
-using tensorflow::gtl::optional;
+using absl::nullopt;
+using absl::optional;
 
 // Finds and returns the non-constant operand in instr.
 //
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.h b/tensorflow/compiler/xla/service/while_loop_analysis.h
index bf59813e8c..bf497f4892 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.h
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
 
@@ -25,8 +25,8 @@ namespace xla {
 // nullopt otherwise. max_value_returned limits the number of steps that are
 // evaluated while trying to brute force a loop trip count, trip counts larger
 // than max_value_returned result in nullopt.
-tensorflow::gtl::optional<int64> ComputeWhileLoopTripCount(
-    HloInstruction *while_op, int64 max_value_returned = 128);
+absl::optional<int64> ComputeWhileLoopTripCount(HloInstruction *while_op,
+                                                int64 max_value_returned = 128);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
index 62af45128a..aab1180662 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -32,7 +33,7 @@ static Status ReplaceUsesWhileKeepingLoopInvariance(
 
   std::vector<HloInstruction*> users;
   users.reserve(old_instr->user_count());
-  c_copy(old_instr->users(), std::back_inserter(users));
+  absl::c_copy(old_instr->users(), std::back_inserter(users));
 
   for (auto* user : users) {
     for (int64 i = 0, e = user->operand_count(); i < e; i++) {
@@ -108,10 +109,10 @@ StatusOr<bool> WhileLoopConstantSinking::Run(HloModule* module) {
     //
     // This will let us sink the constant into the outer while first and then
     // into the inner while in a single run of this pass.
-    c_copy_if(comp->instructions(), std::back_inserter(while_instrs),
-              [](const HloInstruction* instr) {
-                return instr->opcode() == HloOpcode::kWhile;
-              });
+    absl::c_copy_if(comp->instructions(), std::back_inserter(while_instrs),
+                    [](const HloInstruction* instr) {
+                      return instr->opcode() == HloOpcode::kWhile;
+                    });
   }
 
   for (HloInstruction* while_instr : while_instrs) {
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
index 21fb8568a8..2dba7d7f75 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
@@ -54,7 +54,7 @@ class WhileLoopConstantSinking : public HloPassInterface {
  public:
   ~WhileLoopConstantSinking() override = default;
 
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "while-loop-invariant-code-motion";
   }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index 266039d2ff..0e7667de83 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -206,7 +206,8 @@ body {
   p_body.0 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=0
   p_body.1 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=1
 
-  outfeed = token[] outfeed(p_body.0)
+  token = token[] after-all()
+  outfeed = token[] outfeed(p_body.0, token)
   ROOT root = (f32[2],f32[2],f32[2]) tuple(p_body.0, p_body.1, p_body.1)
 }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index 09ddcffb22..f4098f28b3 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -14,18 +14,19 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
+#include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/tuple_util.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace xla {
 
+using absl::InlinedVector;
 using tensorflow::gtl::FlatMap;
 using tensorflow::gtl::FlatSet;
-using tensorflow::gtl::InlinedVector;
 
 // Copies `to_hoist` to the computation containing `while_instr`, hoisting its
 // operands as needed.  All of its transitive operands are expected to be either
@@ -65,8 +66,8 @@ static void CreateLoopInvariantCopy(
       };
 
       InlinedVector<HloInstruction*, 4> new_operands;
-      c_transform(old_instruction->operands(), std::back_inserter(new_operands),
-                  get_new_operand);
+      absl::c_transform(old_instruction->operands(),
+                        std::back_inserter(new_operands), get_new_operand);
 
       HloInstruction* new_instruction =
           parent_of_while->AddInstruction(old_instruction->CloneWithNewOperands(
@@ -197,7 +198,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
              op->opcode() == HloOpcode::kConstant;
     };
 
-    if (!c_all_of(instruction->operands(), is_invariant)) {
+    if (!absl::c_all_of(instruction->operands(), is_invariant)) {
       continue;
     }
 
@@ -257,10 +258,10 @@ StatusOr<bool> WhileLoopInvariantCodeMotion::Run(HloModule* module) {
   bool changed = false;
   std::vector<HloInstruction*> while_instrs;
   for (auto* comp : module->computations()) {
-    c_copy_if(comp->instructions(), std::back_inserter(while_instrs),
-              [](const HloInstruction* instr) {
-                return instr->opcode() == HloOpcode::kWhile;
-              });
+    absl::c_copy_if(comp->instructions(), std::back_inserter(while_instrs),
+                    [](const HloInstruction* instr) {
+                      return instr->opcode() == HloOpcode::kWhile;
+                    });
   }
 
   for (HloInstruction* while_instr : while_instrs) {
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
index 8e6cc87875..2cdf20ce80 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
@@ -38,7 +38,7 @@ class WhileLoopInvariantCodeMotion : public HloPassInterface {
       : hoist_constants_(hoist_constants) {}
   ~WhileLoopInvariantCodeMotion() override = default;
 
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "while-loop-invariant-code-motion";
   }
   StatusOr<bool> Run(HloModule* module) override;
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index 32e69c335b..e14014b961 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -28,6 +28,10 @@ namespace op = xla::testing::opcode_matchers;
 
 class WhileLoopInvariantCodeMotionTest : public HloVerifiedTestBase {
  public:
+  WhileLoopInvariantCodeMotionTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+
   // Makes a computation which has one parameter, of the given shape, and always
   // returns PRED[]{true}.  This is useful as a dummy loop condition.
   HloComputation* MakeAlwaysTrueComputation(const Shape& param_shape,
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index dd8697e680..6a7bfe3f12 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -14,17 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/gtl/optional.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 
-using tensorflow::gtl::nullopt;
-using tensorflow::gtl::optional;
+using absl::optional;
 
 // Determines whether the given instruction is a send/recv node, or has a
 // subcomputation which contains a send/recv node.
@@ -237,12 +236,11 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
             << "Instruction " << user->ToString(print_no_metadata)
             << " should be unused (except by root of while body), but has "
                "users: {"
-            << tensorflow::str_util::Join(
-                   user->users(), ", ",
-                   [&](string* out, const HloInstruction* instr) {
-                     tensorflow::strings::StrAppend(
-                         out, instr->ToString(print_no_metadata));
-                   })
+            << absl::StrJoin(user->users(), ", ",
+                             [&](string* out, const HloInstruction* instr) {
+                               absl::StrAppend(
+                                   out, instr->ToString(print_no_metadata));
+                             })
             << "}";
 
         replacements.emplace(user, nullptr);
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h
index 3d3e1d60f2..78024f14dc 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.h
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h
@@ -33,9 +33,7 @@ namespace xla {
 class WhileLoopSimplifier : public HloPassInterface {
  public:
   ~WhileLoopSimplifier() override {}
-  tensorflow::StringPiece name() const override {
-    return "simplify-while-loops";
-  }
+  absl::string_view name() const override { return "simplify-while-loops"; }
   StatusOr<bool> Run(HloModule* module) override;
 };
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index 2e1571943e..cfe4104f6d 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -15,11 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 namespace {
@@ -27,6 +28,11 @@ namespace {
 namespace op = xla::testing::opcode_matchers;
 
 class WhileLoopSimplifierTest : public HloVerifiedTestBase {
+ public:
+  WhileLoopSimplifierTest()
+      : HloVerifiedTestBase(/*layout_sensitive=*/false,
+                            /*allow_mixed_precision=*/false) {}
+
  protected:
   // Makes an HloModule that contains a loop with `num_iters` iteration.
   void MakeModuleWithSimpleLoop(int num_iters);
@@ -64,10 +70,8 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
   }
   )";
 
-  string hlo_string = tensorflow::str_util::StringReplace(
-      hlo_string_template, "{{LOOP_BOUND}}",
-      tensorflow::strings::StrCat(42 + num_iters),
-      /*replace_all=*/true);
+  string hlo_string = absl::StrReplaceAll(
+      hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}});
   ParseAndVerifyModule(hlo_string);
 }
 
@@ -103,10 +107,8 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
   }
   )";
 
-  string hlo_string = tensorflow::str_util::StringReplace(
-      hlo_string_template, "{{LOOP_BOUND}}",
-      tensorflow::strings::StrCat(42 + num_iters),
-      /*replace_all=*/true);
+  string hlo_string = absl::StrReplaceAll(
+      hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}});
   ParseAndVerifyModule(hlo_string);
 }
 
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index 1ef17b9d7d..e8f76ff745 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -14,15 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/while_util.h"
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/tuple_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 
-using tensorflow::strings::StrCat;
+using absl::StrCat;
 
 static StatusOr<HloComputation*> WidenWhileCondition(
     HloComputation* narrow_condition, const Shape& wide_shape) {
@@ -206,7 +207,7 @@ static StatusOr<HloInstruction*> MakeInitTupleFromInitValues(
   HloInstruction* zero = computation->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
   init_values_with_indvar.push_back(zero);
-  c_copy(init_values, std::back_inserter(init_values_with_indvar));
+  absl::c_copy(init_values, std::back_inserter(init_values_with_indvar));
   return computation->AddInstruction(
       HloInstruction::CreateTuple(init_values_with_indvar));
 }
@@ -215,8 +216,9 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) {
   std::vector<Shape> loop_state_shape_components;
   loop_state_shape_components.reserve(init_values.size() + 1);
   loop_state_shape_components.push_back(ShapeUtil::MakeShape(S32, {}));
-  c_transform(init_values, std::back_inserter(loop_state_shape_components),
-              [](HloInstruction* instr) { return instr->shape(); });
+  absl::c_transform(init_values,
+                    std::back_inserter(loop_state_shape_components),
+                    [](HloInstruction* instr) { return instr->shape(); });
   return ShapeUtil::MakeTupleShape(loop_state_shape_components);
 }
 
diff --git a/tensorflow/compiler/xla/service/while_util_test.cc b/tensorflow/compiler/xla/service/while_util_test.cc
index 2ccb919acf..5e69419333 100644
--- a/tensorflow/compiler/xla/service/while_util_test.cc
+++ b/tensorflow/compiler/xla/service/while_util_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_util.h"
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -206,7 +207,7 @@ ENTRY main {
   auto is_while = [](const HloInstruction* instr) {
     return instr->opcode() == HloOpcode::kWhile;
   };
-  EXPECT_EQ(c_count_if(main->instructions(), is_while), 1);
+  EXPECT_EQ(absl::c_count_if(main->instructions(), is_while), 1);
 }
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
index 8763e588c4..a7f0e207eb 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
@@ -24,7 +24,7 @@ namespace xla {
 class ZeroSizedHloElimination : public HloPassInterface {
  public:
   StatusOr<bool> Run(HloModule* module) override;
-  tensorflow::StringPiece name() const override {
+  absl::string_view name() const override {
     return "zero_sized_hlo_elimination";
   }
 };
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index c74dd648ad..c793a39c27 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -21,8 +21,9 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -30,7 +31,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/iterator_range.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index c4c958be4a..c8ff55e784 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/shape_tree.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -242,7 +243,7 @@ TEST_F(ShapeTreeTest, InvalidIndexingNestedTuple) {
 TEST_F(ShapeTreeTest, ShapeTreeOfNonCopyableType) {
   ShapeTree<std::unique_ptr<int>> shape_tree{tuple_shape_};
   EXPECT_EQ(shape_tree.element({2}).get(), nullptr);
-  *shape_tree.mutable_element({2}) = MakeUnique<int>(42);
+  *shape_tree.mutable_element({2}) = absl::make_unique<int>(42);
   EXPECT_EQ(*shape_tree.element({2}), 42);
 }
 
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 34869cc507..31ddd57eef 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -22,6 +22,14 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/ascii.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/strings/strip.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/overflow_util.h"
@@ -30,26 +38,22 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/iterator_range.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
 
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 string ShapeIndex::ToString() const { return ShapeIndexView(*this).ToString(); }
 
 string ShapeIndexView::ToString() const {
-  return StrCat("{", tensorflow::str_util::Join(indices_, ","), "}");
+  return StrCat("{", absl::StrJoin(indices_, ","), "}");
 }
 
 bool ShapeIndexView::operator==(const ShapeIndexView& other) const {
@@ -449,14 +453,14 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 namespace {
 
 // Class to memoize the computation of
-//   tensorflow::str_util::Lowercase(PrimitiveType_Name(p))
+//   absl::AsciiStrToLower(PrimitiveType_Name(p))
 // for all PrimitiveType values "p"
 class PrimitiveTypeNameGenerator {
  public:
   PrimitiveTypeNameGenerator() {
     for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
       if (PrimitiveType_IsValid(i)) {
-        lowercase_name_[i] = tensorflow::str_util::Lowercase(
+        lowercase_name_[i] = absl::AsciiStrToLower(
             PrimitiveType_Name(static_cast<PrimitiveType>(i)));
       }
     }
@@ -507,7 +511,7 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
     return text;
   }
   return StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[",
-                tensorflow::str_util::Join(shape.dimensions(), ","), "]");
+                absl::StrJoin(shape.dimensions(), ","), "]");
 }
 
 /* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
@@ -543,30 +547,30 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
                                     : "(unknown)",
                                 ": ", HumanString(shape)));
   }
-  return StrCat("(", tensorflow::str_util::Join(parameters, ", "), ") -> ",
+  return StrCat("(", absl::StrJoin(parameters, ", "), ") -> ",
                 HumanString(program_shape.result()));
 }
 
 namespace {
 // Parses shapes with simple recursive descent structure -- consumes from the
 // front of s and passes that view recursively as required.
-StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
-  tensorflow::str_util::RemoveLeadingWhitespace(s);
+StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
+  *s = StripLeadingAsciiWhitespace(*s);
 
-  if (tensorflow::str_util::ConsumePrefix(s, "(")) {  // Tuple.
+  if (absl::ConsumePrefix(s, "(")) {  // Tuple.
     std::vector<Shape> shapes;
     bool must_end = false;
     while (true) {
-      if (tensorflow::str_util::ConsumePrefix(s, ")")) {
+      if (absl::ConsumePrefix(s, ")")) {
         break;
       } else if (must_end) {
         return InvalidArgument("Expected end of tuple; got: \"%s\"",
-                               std::string(*s).c_str());
+                               string(*s).c_str());
       }
       shapes.emplace_back();
       TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s));
-      tensorflow::str_util::RemoveLeadingWhitespace(s);
-      must_end = !tensorflow::str_util::ConsumePrefix(s, ",");
+      *s = StripLeadingAsciiWhitespace(*s);
+      must_end = !absl::ConsumePrefix(s, ",");
     }
     return ShapeUtil::MakeTupleShape(shapes);
   }
@@ -575,9 +579,9 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   string dimensions_string;
   string format_string;
   string layout_string;
-  // tensorflow::StringPiece is not compatible with internal RE2 StringPiece, so
+  // absl::string_view is not compatible with internal RE2 StringPiece, so
   // we convert in to the RE2-consumable type and then consume the corresponding
-  // amount from our StringPiece type.
+  // amount from our string_view type.
   static LazyRE2 shape_pattern = {
       "^(\\w*\\d*)\\[([\\d,]*)\\](?:\\s*(dense|sparse)?\\s*{([\\d,]+)})?"};
   tensorflow::RegexpStringPiece s_consumable(s->data(), s->size());
@@ -585,12 +589,12 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
                    &dimensions_string, &format_string, &layout_string)) {
     size_t consumed = s->size() - s_consumable.size();
     s->remove_prefix(consumed);
-    auto string_to_int64 = [&s](const string& input) -> StatusOr<int64> {
+    auto string_to_int64 = [&s](absl::string_view input) -> StatusOr<int64> {
       int64 element;
-      if (!tensorflow::strings::safe_strto64(input.c_str(), &element)) {
+      if (!absl::SimpleAtoi(input, &element)) {
         return InvalidArgument(
             "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"",
-            input.c_str(), std::string(*s).c_str());
+            string(input).c_str(), string(*s).c_str());
       }
       return element;
     };
@@ -598,7 +602,7 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
     auto comma_list_to_int64s =
         [string_to_int64](const string& input) -> StatusOr<std::vector<int64>> {
       std::vector<int64> results;
-      for (const string& piece : tensorflow::str_util::Split(input, ',')) {
+      for (const auto& piece : absl::StrSplit(input, ',', absl::SkipEmpty())) {
         TF_ASSIGN_OR_RETURN(int64 element, string_to_int64(piece));
         results.push_back(element);
       }
@@ -645,16 +649,15 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   }
 
   return InvalidArgument("Invalid shape string to parse: \"%s\"",
-                         std::string(*s).c_str());
+                         string(*s).c_str());
 }
 }  // namespace
 
-/* static */ StatusOr<Shape> ShapeUtil::ParseShapeString(
-    tensorflow::StringPiece s) {
+/* static */ StatusOr<Shape> ShapeUtil::ParseShapeString(absl::string_view s) {
   TF_ASSIGN_OR_RETURN(Shape shape, ParseShapeStringInternal(&s));
   if (!s.empty()) {
     return InvalidArgument("Invalid shape string to parse: \"%s\"",
-                           std::string(s).c_str());
+                           string(s).c_str());
   }
   return shape;
 }
@@ -1014,12 +1017,13 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
 }
 
 /* static */ int64 ShapeUtil::GetLeafCount(const Shape& shape) {
+  if (!IsTuple(shape)) {
+    return 1;
+  }
   int64 count = 0;
-  ForEachSubshape(shape, [&](const Shape&, const ShapeIndex& index) {
-    if (IsLeafIndex(shape, index)) {
-      ++count;
-    }
-  });
+  for (const Shape& subshape : shape.tuple_shapes()) {
+    count += GetLeafCount(subshape);
+  }
   return count;
 }
 
@@ -1171,8 +1175,7 @@ Status ForEachMutableSubshapeHelper(
     CHECK(TransposeIsBitcast(shape, new_shape, InversePermutation(permutation)))
         << "shape=" << HumanStringWithLayout(shape)
         << ", new_shape=" << HumanStringWithLayout(new_shape)
-        << ", permutation={" << tensorflow::str_util::Join(permutation, ",")
-        << "}";
+        << ", permutation={" << absl::StrJoin(permutation, ",") << "}";
   }
   return new_shape;
 }
@@ -1459,7 +1462,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
          check_input_unit_indices(output_shape, input_shape);
 }
 
-/* static */ tensorflow::gtl::optional<Shape> ShapeUtil::AlignLayouts(
+/* static */ absl::optional<Shape> ShapeUtil::AlignLayouts(
     const Shape& input_shape, const Shape& output_shape) {
   CHECK(IsArray(input_shape));
   CHECK(IsArray(output_shape));
@@ -1498,7 +1501,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     if (input_dimension_product < output_dimension_product ||
         j == output_rank) {
       if (i == input_rank) {
-        return tensorflow::gtl::nullopt;
+        return absl::nullopt;
       }
       dimension_to_alignment_index[i] = alignment.size() - 1;
       input_dimension_product *= input_shape.dimensions(i);
@@ -1509,7 +1512,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     }
   }
   if (input_dimension_product != output_dimension_product) {
-    return tensorflow::gtl::nullopt;
+    return absl::nullopt;
   }
   // We also need to store an end element so that we know where the last
   // alignment part ends.
@@ -1553,7 +1556,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     for (int64 j = 0; j < num_non_trivial_dimensions_in_alignment_part;
          ++i, ++j) {
       if (i == input_rank) {
-        return tensorflow::gtl::nullopt;
+        return absl::nullopt;
       }
       // Skip trivial dimensions with a bound of 1.
       if (input_shape.dimensions(input_dimension_numbers[i]) == 1) {
@@ -1566,7 +1569,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
       if (dimension_to_alignment_index[input_dimension_numbers[i]] !=
               current_alignment_index ||
           input_dimension_numbers[i] > current_dimension_number) {
-        return tensorflow::gtl::nullopt;
+        return absl::nullopt;
       }
       current_dimension_number = input_dimension_numbers[i];
     }
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index d6f17fc965..84f36e48a0 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -22,6 +22,8 @@ limitations under the License.
 #include <initializer_list>
 #include <string>
 
+#include "absl/container/inlined_vector.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -31,8 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
@@ -74,7 +74,7 @@ class ShapeIndex {
   // push_front is O(n^2), but shapes don't usually have a ton of dimensions.
   void push_front(int64 value) { indices_.insert(indices_.begin(), value); }
 
-  using container_type = tensorflow::gtl::InlinedVector<int64, 2>;
+  using container_type = absl::InlinedVector<int64, 2>;
 
   container_type::const_iterator begin() const { return indices_.begin(); }
   container_type::const_iterator end() const { return indices_.end(); }
@@ -228,7 +228,7 @@ class ShapeUtil {
 
   // Parses a ShapeUtil::HumanString-format shape string back into a shape
   // object.
-  static StatusOr<Shape> ParseShapeString(tensorflow::StringPiece s);
+  static StatusOr<Shape> ParseShapeString(absl::string_view s);
 
   // Returns whether the LHS and RHS shapes have the same dimensions; note: does
   // not check element type.
@@ -597,8 +597,8 @@ class ShapeUtil {
   // layout). The layout of 'input_shape' is kept fixed. Returns
   // 'output_shape_with_layout' if such a layout can be found, and an error
   // otherwise.
-  static tensorflow::gtl::optional<Shape> AlignLayouts(
-      const Shape& input_shape, const Shape& output_shape);
+  static absl::optional<Shape> AlignLayouts(const Shape& input_shape,
+                                            const Shape& output_shape);
 
   // Returns a shape with the given dimension deleted.
   // For example:
@@ -737,13 +737,13 @@ class ShapeUtil {
     int64 n = -1;
     std::vector<int64> indexes(base.begin(), base.end());
     const int kNumThreads = tensorflow::port::NumSchedulableCPUs();
-    tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
+    absl::optional<tensorflow::thread::ThreadPool> pool;
     if (parallel) {
       pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads);
     }
 
     while (n < rank) {
-      if (pool != tensorflow::gtl::nullopt) {
+      if (pool != absl::nullopt) {
         pool->Schedule(
             [indexes, &visitor_function] { visitor_function(indexes); });
       } else {
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index e5dd62ae9a..7549ba9c78 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 
 #include <numeric>
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -23,8 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
 namespace {
@@ -849,13 +849,13 @@ TEST(ShapeUtilTest, PermuteDimensionsLayout) {
   std::iota(layout.begin(), layout.end(), 0);
   do {
     Shape s = ShapeUtil::MakeShapeWithLayout(F32, {10, 100, 1000}, layout);
-    SCOPED_TRACE(tensorflow::strings::StrCat("s=", ShapeUtil::HumanString(s)));
+    SCOPED_TRACE(absl::StrCat("s=", ShapeUtil::HumanString(s)));
 
     std::vector<int64> permutation(3);
     std::iota(permutation.begin(), permutation.end(), 0);
     do {
-      SCOPED_TRACE(tensorflow::strings::StrCat(
-          "permutation=", tensorflow::str_util::Join(permutation, ",")));
+      SCOPED_TRACE(
+          absl::StrCat("permutation=", absl::StrJoin(permutation, ",")));
 
       // TransposeIsBitcast takes the inverse of the permutation that
       // PermuteDimensions takes.
diff --git a/tensorflow/compiler/xla/sparse_index_array.h b/tensorflow/compiler/xla/sparse_index_array.h
index f2ce22d672..70fab3bea5 100644
--- a/tensorflow/compiler/xla/sparse_index_array.h
+++ b/tensorflow/compiler/xla/sparse_index_array.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -139,7 +140,7 @@ void SparseIndexArray::SortWithValues(
 
   // Reorder the array elements according to sort_order.  Work through the array
   // and follow cycles so we can do the reorder in-place.
-  tensorflow::gtl::InlinedVector<int64, 8> saved_index(rank());
+  absl::InlinedVector<int64, 8> saved_index(rank());
   for (int64 i = 0; i < num_elements; ++i) {
     // sort_order[i] == -1 indicates the element has already been copied.
     if (sort_order[i] < 0) {
diff --git a/tensorflow/compiler/xla/status_macros.cc b/tensorflow/compiler/xla/status_macros.cc
index a6b1f9004f..b88fe367d7 100644
--- a/tensorflow/compiler/xla/status_macros.cc
+++ b/tensorflow/compiler/xla/status_macros.cc
@@ -17,9 +17,8 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stacktrace.h"
 
@@ -37,8 +36,7 @@ static void LogError(const Status& status, const char* filename, int line,
   if (TF_PREDICT_TRUE(log_severity != tensorflow::NUM_SEVERITIES)) {
     string stack_trace;
     if (should_log_stack_trace) {
-      stack_trace =
-          tensorflow::strings::StrCat("\n", tensorflow::CurrentStackTrace());
+      stack_trace = absl::StrCat("\n", tensorflow::CurrentStackTrace());
     }
     switch (log_severity) {
       case tensorflow::INFO:
@@ -142,17 +140,15 @@ Status MakeErrorStream::Impl::GetStatus() {
   is_done_ = true;
 
   const string& stream_str = stream_.str();
-  const string str =
-      prior_message_handling_ == kAppendToPriorMessage
-          ? tensorflow::strings::StrCat(prior_message_, stream_str)
-          : tensorflow::strings::StrCat(stream_str, prior_message_);
+  const string str = prior_message_handling_ == kAppendToPriorMessage
+                         ? absl::StrCat(prior_message_, stream_str)
+                         : absl::StrCat(stream_str, prior_message_);
   if (TF_PREDICT_FALSE(str.empty())) {
-    return MakeError(file_, line_, code_,
-                     tensorflow::strings::StrCat(
-                         str, "Error without message at ", file_, ":", line_),
-                     true /* should_log */,
-                     tensorflow::ERROR /* log_severity */,
-                     should_log_stack_trace_);
+    return MakeError(
+        file_, line_, code_,
+        absl::StrCat(str, "Error without message at ", file_, ":", line_),
+        true /* should_log */, tensorflow::ERROR /* log_severity */,
+        should_log_stack_trace_);
   } else {
     return MakeError(file_, line_, code_, str, should_log_, log_severity_,
                      should_log_stack_trace_);
diff --git a/tensorflow/compiler/xla/test.h b/tensorflow/compiler/xla/test.h
index 87a8c5f3a5..a657554dc2 100644
--- a/tensorflow/compiler/xla/test.h
+++ b/tensorflow/compiler/xla/test.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPLIER_XLA_TEST_H_
-#define TENSORFLOW_COMPLIER_XLA_TEST_H_
+#ifndef TENSORFLOW_COMPILER_XLA_TEST_H_
+#define TENSORFLOW_COMPILER_XLA_TEST_H_
 
 // This header includes gmock.h and enables the use of gmock matchers in tests
 // in third_party/tensorflow/compiler/xla.
@@ -45,4 +45,4 @@ limitations under the License.
 
 #include "tensorflow/core/platform/test.h"
 
-#endif  // TENSORFLOW_COMPLIER_XLA_TEST_H_
+#endif  // TENSORFLOW_COMPILER_XLA_TEST_H_
diff --git a/tensorflow/compiler/xla/test_helpers.h b/tensorflow/compiler/xla/test_helpers.h
index 8918350135..3ede5e6e38 100644
--- a/tensorflow/compiler/xla/test_helpers.h
+++ b/tensorflow/compiler/xla/test_helpers.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include <list>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 42d52aee78..6b29d833da 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -43,6 +43,7 @@ cc_library(
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = True,
 )
@@ -98,6 +99,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -113,7 +115,6 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:backend",
@@ -127,6 +128,9 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -144,6 +148,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -187,7 +192,6 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:global_data",
@@ -201,6 +205,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -274,6 +280,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -385,6 +392,8 @@ xla_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -551,6 +560,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -665,6 +675,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -683,7 +694,6 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -691,6 +701,7 @@ xla_test(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -709,6 +720,19 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "scatter_test",
+    srcs = ["scatter_test.cc"],
+    deps = [
+        ":client_library_test_base",
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 # Repeat dot_operation_runtime_test with single-threaded eigen.
 xla_test(
     name = "dot_operation_single_threaded_runtime_test",
@@ -727,7 +751,6 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -735,6 +758,7 @@ xla_test(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -798,6 +822,7 @@ CONVOLUTION_TEST_DEPS = [
     "//tensorflow/compiler/xla/client:padding",
     "//tensorflow/compiler/xla/client:xla_builder",
     "//tensorflow/compiler/xla/tests:client_library_test_base",
+    "//tensorflow/compiler/xla/tests:hlo_test_base",
     "//tensorflow/compiler/xla/tests:literal_test_util",
     "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     "//tensorflow/core:lib",
@@ -809,7 +834,10 @@ xla_test(
     timeout = "long",
     srcs = ["convolution_test.cc"],
     shard_count = 25,
-    deps = CONVOLUTION_TEST_DEPS,
+    deps = CONVOLUTION_TEST_DEPS + [
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+    ],
 )
 
 xla_test(
@@ -819,7 +847,10 @@ xla_test(
     backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]},
     backends = ["gpu"],
     shard_count = 25,
-    deps = CONVOLUTION_TEST_DEPS,
+    deps = CONVOLUTION_TEST_DEPS + [
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+    ],
 )
 
 xla_test(
@@ -870,6 +901,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -903,6 +935,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -979,6 +1012,8 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1052,6 +1087,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1105,6 +1141,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1133,6 +1170,8 @@ xla_test_library(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1140,6 +1179,7 @@ xla_test(
     name = "reduce_window_test",
     timeout = "long",
     srcs = [],
+    shard_count = 20,
     tags = [
         "enable_for_xla_interpreter",
         "optonly",
@@ -1195,6 +1235,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1205,12 +1246,12 @@ xla_test(
         "enable_for_xla_interpreter",
     ],
     deps = [
-        ":client_library_test_base",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1221,12 +1262,12 @@ xla_test(
         "enable_for_xla_interpreter",
     ],
     deps = [
-        ":client_library_test_base",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1270,6 +1311,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1335,6 +1377,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1396,6 +1439,8 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1465,6 +1510,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1525,17 +1571,16 @@ xla_test(
     ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -1620,6 +1665,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1632,7 +1678,6 @@ xla_test(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:global_data",
@@ -1643,6 +1688,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1736,6 +1782,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1757,6 +1804,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/stream_executor",
+        "@com_google_absl//absl/memory",
         "@llvm//:core",
     ],
 )
@@ -1808,6 +1856,7 @@ xla_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -1820,13 +1869,9 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
-        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1834,6 +1879,8 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1860,7 +1907,6 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1868,6 +1914,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -1994,6 +2041,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2035,6 +2083,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -2061,6 +2110,8 @@ tf_cc_test(
 xla_test(
     name = "test_utils_test",
     srcs = ["test_utils_test.cc"],
+    # There is nothing backend specific in this test, so just pick an arbitrary backend.
+    backends = ["cpu"],
     deps = [
         ":local_client_test_base",
         ":test_utils",
@@ -2069,6 +2120,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
 )
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 74f2e36f82..577fd1ab3b 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -35,11 +35,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace {
 
+using tensorflow::gtl::ArraySlice;
+
 class ArrayElementwiseOpTest : public ClientLibraryTestBase {
  public:
   ErrorSpec error_spec_{0.0001, 0.0001};
@@ -293,6 +296,22 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) {
   ComputeAndCompareR1<int64>(&b, expected, {lhs_data.get(), rhs_data.get()});
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, CmpTwoConstantU64s) {
+  XlaBuilder b(TestName());
+
+  std::vector<uint64> lhs{static_cast<uint64>(0x8000000000000000ULL)};
+  std::unique_ptr<Literal> lhs_literal = LiteralUtil::CreateR1<uint64>({lhs});
+  auto lhs_param = Parameter(&b, 0, lhs_literal->shape(), "lhs_param");
+
+  std::vector<uint64> rhs{static_cast<uint64>(0x7FFFFFFFFFFFFFFFULL)};
+  std::unique_ptr<Literal> rhs_literal = LiteralUtil::CreateR1<uint64>({rhs});
+  auto rhs_param = Parameter(&b, 1, rhs_literal->shape(), "rhs_param");
+
+  Lt(lhs_param, rhs_param);
+
+  ComputeAndCompare(&b, {std::move(*lhs_literal), std::move(*rhs_literal)});
+}
+
 TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
   const int count = GetParam();
   XlaBuilder builder(TestName());
@@ -411,7 +430,64 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementF32s) {
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
-XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
+class IntegerDivideOpTest : public ArrayElementwiseOpTest {
+ protected:
+  template <typename T>
+  void TestDivRem(ArraySlice<T> dividends, ArraySlice<T> divisors,
+                  ArraySlice<T> quotients, ArraySlice<T> remainders) {
+    {
+      XlaBuilder builder(TestName());
+      XlaOp dividend;
+      XlaOp divisor;
+      auto dividend_data =
+          CreateR1Parameter<T>(dividends, 0, "dividend", &builder, &dividend);
+      auto divisor_data =
+          CreateR1Parameter<T>(divisors, 1, "divisor", &builder, &divisor);
+      Div(dividend, divisor);
+
+      ComputeAndCompareR1<T>(&builder, quotients,
+                             {dividend_data.get(), divisor_data.get()});
+    }
+
+    // Test with a compile-time constant divisor.
+    {
+      XlaBuilder builder(TestName());
+      XlaOp dividend;
+      auto dividend_data =
+          CreateR1Parameter<T>(dividends, 0, "dividend", &builder, &dividend);
+      Div(dividend, ConstantR1<T>(&builder, divisors));
+
+      ComputeAndCompareR1<T>(&builder, quotients, {dividend_data.get()});
+    }
+
+    {
+      XlaBuilder builder(TestName());
+      XlaOp dividend;
+      XlaOp divisor;
+      auto dividend_data =
+          CreateR1Parameter<T>(dividends, 0, "dividend", &builder, &dividend);
+      auto divisor_data =
+          CreateR1Parameter<T>(divisors, 1, "divisor", &builder, &divisor);
+      Rem(dividend, divisor);
+
+      ComputeAndCompareR1<T>(&builder, remainders,
+                             {dividend_data.get(), divisor_data.get()});
+    }
+
+    // Test with a compile-time constant divisor.
+    {
+      XlaBuilder builder(TestName());
+      XlaOp dividend;
+      auto dividend_data =
+          CreateR1Parameter<T>(dividends, 0, "dividend", &builder, &dividend);
+      Rem(dividend, ConstantR1<T>(&builder, divisors));
+
+      ComputeAndCompareR1<T>(&builder, remainders, {dividend_data.get()});
+    }
+  }
+};
+
+XLA_TEST_F(IntegerDivideOpTest, DivS32s) {
   // clang-format off
   // Some interesting values to test.
   std::vector<int32> vals = {
@@ -435,58 +511,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
     }
   }
 
-  {
-    XlaBuilder builder(TestName());
-    XlaOp dividend;
-    XlaOp divisor;
-    auto dividend_data =
-        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
-    auto divisor_data =
-        CreateR1Parameter<int32>(divisors, 1, "divisor", &builder, &divisor);
-    Div(dividend, divisor);
-
-    ComputeAndCompareR1<int32>(&builder, quotients,
-                               {dividend_data.get(), divisor_data.get()});
-  }
-
-  // Test with a compile-time constant divisor.
-  {
-    XlaBuilder builder(TestName());
-    XlaOp dividend;
-    auto dividend_data =
-        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
-    Div(dividend, ConstantR1<int32>(&builder, divisors));
-
-    ComputeAndCompareR1<int32>(&builder, quotients, {dividend_data.get()});
-  }
-
-  {
-    XlaBuilder builder(TestName());
-    XlaOp dividend;
-    XlaOp divisor;
-    auto dividend_data =
-        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
-    auto divisor_data =
-        CreateR1Parameter<int32>(divisors, 1, "divisor", &builder, &divisor);
-    Rem(dividend, divisor);
-
-    ComputeAndCompareR1<int32>(&builder, remainders,
-                               {dividend_data.get(), divisor_data.get()});
-  }
+  TestDivRem<int32>(dividends, divisors, quotients, remainders);
+}
 
-  // Test with a compile-time constant divisor.
-  {
-    XlaBuilder builder(TestName());
-    XlaOp dividend;
-    auto dividend_data =
-        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
-    Rem(dividend, ConstantR1<int32>(&builder, divisors));
+XLA_TEST_F(IntegerDivideOpTest, SignedOverflow) {
+  std::vector<int32> dividends = {5, INT32_MIN}, divisors = {0, -1},
+                     quotients = {-1, INT32_MIN}, remainders = {5, 0};
 
-    ComputeAndCompareR1<int32>(&builder, remainders, {dividend_data.get()});
-  }
+  TestDivRem<int32>(dividends, divisors, quotients, remainders);
 }
 
-XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
+XLA_TEST_F(IntegerDivideOpTest, DivU32s) {
   // clang-format off
   // Some interesting values to test.
   std::vector<uint32> vals = {
@@ -506,53 +541,14 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
     }
   }
 
-  {
-    XlaBuilder builder(TestName());
-    XlaOp dividend;
-    XlaOp divisor;
-    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
-                                                   &builder, &dividend);
-    auto divisor_data =
-        CreateR1Parameter<uint32>(divisors, 1, "divisor", &builder, &divisor);
-    Div(dividend, divisor);
-
-    ComputeAndCompareR1<uint32>(&builder, quotients,
-                                {dividend_data.get(), divisor_data.get()});
-  }
-
-  {
-    XlaBuilder builder(TestName());
-    XlaOp dividend;
-    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
-                                                   &builder, &dividend);
-    Div(dividend, ConstantR1<uint32>(&builder, divisors));
-
-    ComputeAndCompareR1<uint32>(&builder, quotients, {dividend_data.get()});
-  }
-
-  {
-    XlaBuilder builder(TestName());
-    XlaOp dividend;
-    XlaOp divisor;
-    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
-                                                   &builder, &dividend);
-    auto divisor_data =
-        CreateR1Parameter<uint32>(divisors, 1, "divisor", &builder, &divisor);
-    Rem(dividend, divisor);
-
-    ComputeAndCompareR1<uint32>(&builder, remainders,
-                                {dividend_data.get(), divisor_data.get()});
-  }
+  TestDivRem<uint32>(dividends, divisors, quotients, remainders);
+}
 
-  {
-    XlaBuilder builder(TestName());
-    XlaOp dividend;
-    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
-                                                   &builder, &dividend);
-    Rem(dividend, ConstantR1<uint32>(&builder, divisors));
+XLA_TEST_F(IntegerDivideOpTest, UnsignedOverflow) {
+  std::vector<int32> dividends = {5}, divisors = {0}, quotients = {-1},
+                     remainders = {5};
 
-    ComputeAndCompareR1<uint32>(&builder, remainders, {dividend_data.get()});
-  }
+  TestDivRem<int32>(dividends, divisors, quotients, remainders);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantC64s) {
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 24b17b7100..ac90a3adb6 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
@@ -41,7 +42,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/math/math_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -382,7 +382,7 @@ struct BatchNormTestParam {
 
   friend ::std::ostream& operator<<(::std::ostream& os,
                                     const BatchNormTestParam& p) {
-    os << "bounds={" << tensorflow::str_util::Join(p.bounds, ", ") << "}, ";
+    os << "bounds={" << absl::StrJoin(p.bounds, ", ") << "}, ";
     os << "feature_index=" << p.feature_index << ", ";
     os << "random_value_mean=" << p.random_value_mean << ", ";
     os << "random_value_var=" << p.random_value_var;
diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc
index c7b94b5bba..74d4d2eb10 100644
--- a/tensorflow/compiler/xla/tests/broadcast_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 59d917054b..9cd974fd9b 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -17,18 +17,18 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -196,8 +196,8 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
         AsInt64Slice(expected.shape().dimensions()), minor_to_major);
     TF_ASSIGN_OR_RETURN(auto actual,
                         ExecuteAndTransfer(computation, arguments, &layout));
-    verify_output(*actual, tensorflow::strings::StrCat(
-                               "Test with output layout: ",
+    verify_output(*actual,
+                  absl::StrCat("Test with output layout: ",
                                ShapeUtil::HumanStringWithLayout(layout)));
   } while (std::next_permutation(minor_to_major.begin(), minor_to_major.end()));
   return Status::OK();
@@ -258,7 +258,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
             output_with_layout));
     string error_message = "Test with input layouts: ";
     for (const auto& str : layout_strings) {
-      tensorflow::strings::StrAppend(&error_message, str, " ");
+      absl::StrAppend(&error_message, str, " ");
     }
     verify_output(*actual, error_message);
     return Status::OK();
@@ -391,7 +391,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
 }
 
 void ClientLibraryTestBase::ComputeAndCompareR1U8(
-    XlaBuilder* builder, tensorflow::StringPiece expected,
+    XlaBuilder* builder, absl::string_view expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -546,7 +546,7 @@ XlaComputation ClientLibraryTestBase::CreateScalarReluSensitivity() {
 
 std::unique_ptr<Array2D<float>> ClientLibraryTestBase::CreatePatternedMatrix(
     int rows, int cols, float offset) {
-  auto array = MakeUnique<Array2D<float>>(rows, cols);
+  auto array = absl::make_unique<Array2D<float>>(rows, cols);
   for (int64 row = 0; row < rows; ++row) {
     for (int64 col = 0; col < cols; ++col) {
       (*array)(row, col) = col + (row * 1000.0f) + offset;
@@ -561,7 +561,7 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols,
                                                             int cols_padded) {
   CHECK_GE(rows_padded, rows);
   CHECK_GE(cols_padded, cols);
-  auto array = MakeUnique<Array2D<float>>(rows_padded, cols_padded, 0.0);
+  auto array = absl::make_unique<Array2D<float>>(rows_padded, cols_padded, 0.0);
   for (int64 row = 0; row < rows; ++row) {
     for (int64 col = 0; col < cols; ++col) {
       (*array)(row, col) = col + (row * 1000.0f);
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index b04a3b105c..ac96d3e325 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -21,6 +21,8 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
@@ -30,13 +32,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/bitmap.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
@@ -202,7 +202,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   // Compare the result of the computation to a strings. In XLA strings are
   // represented using rank-1 U8 shapes.
   void ComputeAndCompareR1U8(
-      XlaBuilder* builder, tensorflow::StringPiece expected,
+      XlaBuilder* builder, absl::string_view expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   // Convenience method for running a built computation, transferring the
@@ -613,7 +613,7 @@ template <typename NativeT>
 std::unique_ptr<Array2D<NativeT>> ClientLibraryTestBase::CreatePseudorandomR2(
     const int rows, const int cols, NativeT min_value, NativeT max_value,
     uint32 seed) {
-  auto result = MakeUnique<Array2D<NativeT>>(rows, cols);
+  auto result = absl::make_unique<Array2D<NativeT>>(rows, cols);
   PseudorandomGenerator<NativeT> generator(min_value, max_value, seed);
   for (int y = 0; y < rows; ++y) {
     for (int x = 0; x < cols; ++x) {
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 5a06d061f0..8226b6de3f 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/match.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -145,8 +145,8 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
     EXPECT_FALSE(IsConstant(computation, &b));
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(),
-                                                  "depends on a parameter"))
+    EXPECT_TRUE(
+        absl::StrContains(value.status().ToString(), "depends on a parameter"))
         << value.status();
   }
 }
@@ -161,8 +161,8 @@ TEST_F(ComputeConstantTest, IndirectParamMissing) {
     EXPECT_FALSE(IsConstant(computation, &b));
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(),
-                                                  "depends on a parameter"))
+    EXPECT_TRUE(
+        absl::StrContains(value.status().ToString(), "depends on a parameter"))
         << value.status();
   }
 }
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 1adc68cc48..7a203d6873 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -447,11 +448,11 @@ std::vector<float> GetInterestingF16ConversionTestCases() {
 XLA_TEST_F(ConvertTest, ConvertR1F16ToR1F32) {
   std::vector<float> test_cases = GetInterestingF16ConversionTestCases();
   std::vector<half> input;
-  c_transform(test_cases, std::back_inserter(input),
-              [](float f) { return Eigen::half(f); });
+  absl::c_transform(test_cases, std::back_inserter(input),
+                    [](float f) { return Eigen::half(f); });
   std::vector<float> expected_output;
-  c_transform(input, std::back_inserter(expected_output),
-              [](Eigen::half h) { return static_cast<float>(h); });
+  absl::c_transform(input, std::back_inserter(expected_output),
+                    [](Eigen::half h) { return static_cast<float>(h); });
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<GlobalData> dot_lhs_handle,
@@ -470,8 +471,8 @@ XLA_TEST_F(ConvertTest, ConvertR1F16ToR1F32) {
 XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F16) {
   std::vector<float> input = GetInterestingF16ConversionTestCases();
   std::vector<half> expected_output;
-  c_transform(input, std::back_inserter(expected_output),
-              [](float f) { return Eigen::half(f); });
+  absl::c_transform(input, std::back_inserter(expected_output),
+                    [](float f) { return Eigen::half(f); });
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<GlobalData> dot_lhs_handle,
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index 7b6bbc4f57..38b6da4fa9 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -17,11 +17,11 @@ limitations under the License.
 #include <array>
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -88,9 +88,9 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) {
 
 XLA_TEST_F(ConvolutionDimensionNumbersTest,
            TwoConvsWithDifferentDimensionNumbers) {
-  auto input_array = MakeUnique<Array4D<float>>(2, 3, 5, 5);
+  auto input_array = absl::make_unique<Array4D<float>>(2, 3, 5, 5);
   input_array->FillWithMultiples(0.1);
-  auto weight_array = MakeUnique<Array4D<float>>(4, 3, 1, 1);
+  auto weight_array = absl::make_unique<Array4D<float>>(4, 3, 1, 1);
   weight_array->FillWithMultiples(0.2);
   auto weight_data =
       client_
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 5ed8122e00..d2c6478b02 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
@@ -26,16 +28,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -70,16 +70,16 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
     const int kKernelSizeY = 2;
     const int kOutputActivationSizeZ = 256;
     const int kMiniBatchSize = 4;
-    auto alhs =
-        MakeUnique<Array4D<T>>(kMiniBatchSize, kInputActivationSizeZ,
-                               kInputActivationSizeY, kInputActivationSizeX);
+    auto alhs = absl::make_unique<Array4D<T>>(
+        kMiniBatchSize, kInputActivationSizeZ, kInputActivationSizeY,
+        kInputActivationSizeX);
     alhs->FillWithMultiples(static_cast<T>(1.0f));
     ASSERT_EQ(3, alhs->width());
     ASSERT_EQ(3, alhs->height());
 
-    auto arhs =
-        MakeUnique<Array4D<T>>(kOutputActivationSizeZ, kInputActivationSizeZ,
-                               kKernelSizeY, kKernelSizeX);
+    auto arhs = absl::make_unique<Array4D<T>>(kOutputActivationSizeZ,
+                                              kInputActivationSizeZ,
+                                              kKernelSizeY, kKernelSizeX);
     Array2D<T> rhs_raster({
         {1.0f, 0.0f},  // row 0
         {0.0f, 0.0f},  // row 1
@@ -465,7 +465,7 @@ void iota_int_init_value(std::vector<T>& values, int init_value) {
 }
 
 template <typename T>
-class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest {
+class Convolve2D_1x3x3x5_3x3x5x3_Valid : public ConvolutionTest {
  public:
   void RunTest() {
     XlaBuilder builder(TestName());
@@ -520,8 +520,139 @@ class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest {
   }
 };
 
-TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x5x5_Valid, TestTypes);
-TYPED_TEST(Convolve2D_1x3x3x5_3x3x5x5_Valid, Types) { this->RunTest(); }
+TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x5x3_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x3x3x5_3x3x5x3_Valid, Types) { this->RunTest(); }
+
+template <typename T>
+class Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 3, 3, 5};
+    std::vector<int64> filter_dims = {3, 3, 1, 15};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/5);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1->Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(16029), static_cast<T>(16218), static_cast<T>(16407),
+         static_cast<T>(17172), static_cast<T>(17370), static_cast<T>(17568),
+         static_cast<T>(18369), static_cast<T>(18576), static_cast<T>(18783),
+         static_cast<T>(19620), static_cast<T>(19836), static_cast<T>(20052),
+         static_cast<T>(20925), static_cast<T>(21150), static_cast<T>(21375)});
+    auto expected_r4 = expected_r1->Reshape({1, 1, 1, 15}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(*input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(*filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, *expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 2, 2, 6};
+    std::vector<int64> filter_dims = {2, 2, 2, 12};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/3);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1->Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1->Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(5076), static_cast<T>(5160), static_cast<T>(5244),
+         static_cast<T>(5328), static_cast<T>(6164), static_cast<T>(6264),
+         static_cast<T>(6364), static_cast<T>(6464), static_cast<T>(7380),
+         static_cast<T>(7496), static_cast<T>(7612), static_cast<T>(7728)});
+    auto expected_r4 = expected_r1->Reshape({1, 1, 1, 12}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(*input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(*filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, *expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, Types) {
+  this->RunTest();
+}
 
 // Test fixture to run convolution tests with and without convolution
 // canonicalization enabled.
@@ -765,5 +896,44 @@ XLA_TEST_F(ConvolutionTest, NoCudnnAlgorithmPicker) {
                      std::move(*LiteralUtil::CreateFromArray(filter_data))});
 }
 
+class ConvolutionHloTest : public HloTestBase {};
+
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64Forward)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %arg0 = f64[3,56,56,16] parameter(0)
+  %arg1 = f64[3,3,3,64] parameter(1)
+  ROOT %conv = f64[54,54,16,64] convolution(%arg0, %arg1), window={size=3x3}, dim_labels=f01b_i01o->01bf
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardFilter)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %arg0 = f64[2,5,8,1] parameter(0)
+  %arg1 = f64[2,5,8,2] parameter(1)
+  ROOT %conv = f64[4,4,1,2] convolution(%arg0, %arg1), window={size=5x8 pad=1_2x1_2}, dim_labels=f01b_i01o->01bf
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardInput)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %output = f64[4,5,16,16] parameter(0)
+  %kernel = f64[5,3,7,7] parameter(1)
+  %reverse = f64[5,3,7,7] reverse(f64[5,3,7,7] %kernel), dimensions={2,3}
+  ROOT %convolution = f64[4,3,16,16] convolution(%output, %reverse), window={size=7x7 pad=3_3x3_3}, dim_labels=bf01_io01->bf01
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 5ef273e5a2..50a9ebc1e9 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 13c777835e..6f7fc0e6e5 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 0e9e92ed99..5873516442 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -261,16 +262,14 @@ string PrintDotTestParam(
     const ::testing::TestParamInfo<DotTestParam>& test_param) {
   const DotTestParam& param = test_param.param;
   if (param.has_addend) {
-    return tensorflow::strings::StrCat(param.m, "x", param.k, "x", param.n,
-                                       "_MajorToMinor",
-                                       param.dot_lhs_row_major ? "T" : "F",
-                                       param.dot_rhs_row_major ? "T" : "F",
-                                       param.addend_row_major ? "T" : "F");
+    return absl::StrCat(param.m, "x", param.k, "x", param.n, "_MajorToMinor",
+                        param.dot_lhs_row_major ? "T" : "F",
+                        param.dot_rhs_row_major ? "T" : "F",
+                        param.addend_row_major ? "T" : "F");
   } else {
-    return tensorflow::strings::StrCat(param.m, "x", param.k, "x", param.n,
-                                       "_MajorToMinor",
-                                       param.dot_lhs_row_major ? "T" : "F",
-                                       param.dot_rhs_row_major ? "T" : "F");
+    return absl::StrCat(param.m, "x", param.k, "x", param.n, "_MajorToMinor",
+                        param.dot_lhs_row_major ? "T" : "F",
+                        param.dot_rhs_row_major ? "T" : "F");
   }
 }
 
diff --git a/tensorflow/compiler/xla/tests/floor_ceil_test.cc b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
index 39cc6c5927..4a835a8e21 100644
--- a/tensorflow/compiler/xla/tests/floor_ceil_test.cc
+++ b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
@@ -16,13 +16,13 @@ limitations under the License.
 #include <limits>
 #include <string>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -39,8 +39,7 @@ class FloorCeilTest : public ClientLibraryTestBase {
   // Runs a computation and comparison on expected vs f(input)
   void TestR1F32(tensorflow::gtl::ArraySlice<float> input,
                  tensorflow::gtl::ArraySlice<float> expected, Function f) {
-    LOG(INFO) << "input: {" << tensorflow::str_util::Join(expected, ", ")
-              << "}";
+    LOG(INFO) << "input: {" << absl::StrJoin(expected, ", ") << "}";
     XlaBuilder builder(TestName());
     auto c = ConstantR1<float>(&builder, input);
     if (f == kCeil) {
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index 792be0d3fc..341124170a 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -22,13 +22,13 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index b77bece85a..205d417f0c 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -25,13 +25,13 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using tensorflow::gtl::nullopt;
+using absl::nullopt;
 
 class GatherOperationTest : public HloTestBase {
  protected:
   void RunTest(const string& hlo_text, Literal* operand,
-               Literal* gather_indices) {
-    RunTest(hlo_text, {operand, gather_indices});
+               Literal* start_indices) {
+    RunTest(hlo_text, {operand, start_indices});
   }
 
   void RunTest(const string& hlo_text,
@@ -52,18 +52,17 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[2,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 3}
+      slice_sizes={1, 3}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherV2) {
@@ -74,18 +73,17 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherMultipleBatchDims) {
@@ -96,18 +94,18 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,3,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdMultipleBatchDims_0) {
@@ -118,18 +116,18 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=2,
-      window_bounds={1, 1}
+      slice_sizes={1, 1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdMultipleBatchDims_1) {
@@ -140,18 +138,18 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2,2] parameter(1)
   ROOT gather = s32[2,1,1,2] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=2,
-      window_bounds={1, 1}
+      slice_sizes={1, 1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherNd) {
@@ -162,20 +160,20 @@ ENTRY main {
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, TensorFlowGatherNdNonDefaultIndexVectorDim) {
@@ -186,20 +184,20 @@ ENTRY main {
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, DynamicSlice) {
@@ -210,18 +208,17 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[1,1] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={0,1},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({1, 1});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({1, 1});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, BatchDynamicSlice) {
@@ -232,18 +229,18 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   ROOT gather = s32[2,1,1] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, ZeroDimBounds) {
@@ -254,17 +251,16 @@ ENTRY main {
   operand = s32[3,0] parameter(0)
   indices = s32[2] parameter(1)
   ROOT gather = s32[2,0] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 0}
+      slice_sizes={1, 0}
 }
 )";
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, OutOfBoundsIndex) {
@@ -278,19 +274,19 @@ ENTRY main {
   operand = s32[3,3]{1,0} parameter(0)
   indices = s32[6,2]{1,0} parameter(1)
   gather = s32[6,1,1]{2,1,0} gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   ROOT result = s32[6]{0} reshape(gather)
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR2<int32>(
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR2<int32>(
       {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483647, 1}, {1, 2}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, OutOfBoundsUnsignedIndex) {
@@ -304,19 +300,19 @@ ENTRY main {
   operand = s32[3,3]{1,0} parameter(0)
   indices = u32[6,2]{1,0} parameter(1)
   gather = s32[6,1,1]{2,1,0} gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   ROOT result = s32[6]{0} reshape(gather)
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR2<uint32>(
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR2<uint32>(
       {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483648u, 1}, {1, 2}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, NegativeIndex) {
@@ -330,19 +326,19 @@ ENTRY main {
   operand = s32[3,3]{1,0} parameter(0)
   indices = s32[6,2]{1,0} parameter(1)
   gather = s32[6,1,1]{2,1,0} gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   ROOT result = s32[6]{0} reshape(gather)
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR2<int32>(
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR2<int32>(
       {{2, -1}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, NegativeIndexIntoUnsignedOperand) {
@@ -356,19 +352,19 @@ ENTRY main {
   operand = u32[3,3]{1,0} parameter(0)
   indices = s32[6,2]{1,0} parameter(1)
   gather = u32[6,1,1]{2,1,0} gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   ROOT result = u32[6]{0} reshape(gather)
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<uint32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR2<int32>(
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR2<int32>(
       {{2, -1}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, OneScalarIndex) {
@@ -379,17 +375,17 @@ ENTRY main {
   operand = s32[2,3,2]{2,1,0} parameter(0)
   index = s32[] parameter(1)
   ROOT gather = s32[1,3,2]{2,1,0} gather(operand, index),
-      output_window_dims={0,1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0},
+      offset_dims={0,1,2},
+      collapsed_slice_dims={},
+      start_index_map={0},
       index_vector_dim=0,
-      window_bounds={1,3,2}
+      slice_sizes={1,3,2}
 }
 )";
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR3<int32>(
       {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR0<int32>(1);
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR0<int32>(1);
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, ScalarResult) {
@@ -400,16 +396,16 @@ ENTRY main {
   operand = s32[4]{0} parameter(0)
   index = s32[] parameter(1)
   ROOT gather = s32[] gather(operand, index),
-      output_window_dims={},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=0,
-      window_bounds={1}
+      slice_sizes={1}
 }
 )";
   std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({1, 2, 3, 4});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR0<int32>(1);
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR0<int32>(1);
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, ZeroSizedResult) {
@@ -420,17 +416,17 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[0] parameter(1)
   ROOT gather = s32[0,3] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0},
-      gather_dims_to_operand_dims={0},
+      offset_dims={1},
+      collapsed_slice_dims={0},
+      start_index_map={0},
       index_vector_dim=1,
-      window_bounds={1, 3}
+      slice_sizes={1, 3}
 }
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices = LiteralUtil::CreateR1<int32>({});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherV2) {
@@ -441,11 +437,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   gather = s32[3,2] gather(operand, indices),
-      output_window_dims={0},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=1,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[3,2] broadcast(one), dimensions={}
   ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted)
@@ -453,9 +449,8 @@ ENTRY main {
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({0, 2});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({0, 2});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherMultipleBatchDims) {
@@ -466,11 +461,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,3,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={1},
-      gather_dims_to_operand_dims={1},
+      offset_dims={1},
+      collapsed_slice_dims={1},
+      start_index_map={1},
       index_vector_dim=2,
-      window_bounds={3, 1}
+      slice_sizes={3, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,3,2] broadcast(one), dimensions={}
   ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted)
@@ -478,9 +473,9 @@ ENTRY main {
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNdMultipleBatchDims) {
@@ -491,11 +486,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=2,
-      window_bounds={1, 1}
+      slice_sizes={1, 1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -503,9 +498,9 @@ ENTRY main {
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR3<int32>({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNd) {
@@ -516,11 +511,11 @@ ENTRY main {
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=1,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -530,9 +525,9 @@ ENTRY main {
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest,
@@ -544,11 +539,11 @@ ENTRY main {
   operand = s32[3,3,2] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,2] gather(operand, indices),
-      output_window_dims={1},
-      elided_window_dims={0,1},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1,2}
+      slice_sizes={1,1,2}
   one = s32[] constant(1)
   one_broadcasted = s32[2,2] broadcast(one), dimensions={}
   ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted)
@@ -558,9 +553,9 @@ ENTRY main {
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
                                     {{-7, 7}, {-8, 8}, {-9, 9}}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedDynamicSlice) {
@@ -571,11 +566,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2] parameter(1)
   gather = s32[1,1] gather(operand, indices),
-      output_window_dims={0,1},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={0,1},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   one = s32[] constant(1)
   one_broadcasted = s32[1,1] broadcast(one), dimensions={}
   ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted)
@@ -583,9 +578,8 @@ ENTRY main {
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
-      LiteralUtil::CreateR1<int32>({1, 1});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  std::unique_ptr<Literal> start_indices = LiteralUtil::CreateR1<int32>({1, 1});
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 XLA_TEST_F(GatherOperationTest, FusedBatchDynamicSlice) {
@@ -596,11 +590,11 @@ ENTRY main {
   operand = s32[3,3] parameter(0)
   indices = s32[2,2] parameter(1)
   gather = s32[2,1,1] gather(operand, indices),
-      output_window_dims={1,2},
-      elided_window_dims={},
-      gather_dims_to_operand_dims={0,1},
+      offset_dims={1,2},
+      collapsed_slice_dims={},
+      start_index_map={0,1},
       index_vector_dim=0,
-      window_bounds={1,1}
+      slice_sizes={1,1}
   one = s32[] constant(1)
   one_broadcasted = s32[2,1,1] broadcast(one), dimensions={}
   ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted)
@@ -608,9 +602,9 @@ ENTRY main {
 )";
   std::unique_ptr<Literal> operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
-  std::unique_ptr<Literal> gather_indices =
+  std::unique_ptr<Literal> start_indices =
       LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
-  RunTest(hlo_text, operand.get(), gather_indices.get());
+  RunTest(hlo_text, operand.get(), start_indices.get());
 }
 
 class GatherClientLibraryTest : public ClientLibraryTestBase {};
@@ -622,11 +616,11 @@ XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) {
   //   operand = s32[3,3] parameter(0)
   //   indices = s32[2] parameter(1)
   //   ROOT gather = s32[2,3] gather(operand, indices),
-  //       output_window_dims={1},
-  //       elided_window_dims={0},
-  //       gather_dims_to_operand_dims={0},
+  //       offset_dims={1},
+  //       collapsed_slice_dims={0},
+  //       start_index_map={0},
   //       index_vector_dim=1,
-  //       window_bounds={1, 3}
+  //       slice_sizes={1, 3}
   // }
 
   XlaBuilder builder("gather_basic");
@@ -637,9 +631,9 @@ XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) {
   auto operand = Parameter(&builder, 0, operand_shape, "operand");
   auto indices = Parameter(&builder, 1, indices_shape, "indices");
   GatherDimensionNumbers dim_numbers;
-  dim_numbers.add_output_window_dims(1);
-  dim_numbers.add_elided_window_dims(0);
-  dim_numbers.add_gather_dims_to_operand_dims(0);
+  dim_numbers.add_offset_dims(1);
+  dim_numbers.add_collapsed_slice_dims(0);
+  dim_numbers.add_start_index_map(0);
   dim_numbers.set_index_vector_dim(1);
   Gather(operand, indices, dim_numbers, {1, 3});
 
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index f05d1a8b9d..93ea144438 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -20,12 +20,15 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,9 +42,9 @@ namespace xla {
 
 namespace {
 
-using tensorflow::StringPiece;
+using absl::optional;
+using absl::string_view;
 using tensorflow::gtl::ArraySlice;
-using tensorflow::gtl::optional;
 
 constexpr char kInterpreter[] = "interpreter";
 
@@ -83,24 +86,42 @@ ProgramShape GetProgramShapeWithLayout(const HloModule& module) {
 
 }  // namespace
 
-HloTestBase::HloTestBase(bool allow_mixed_precision_in_hlo_verifier)
+HloTestBase::HloTestBase(bool verifier_layout_sensitive,
+                         bool allow_mixed_precision_in_hlo_verifier)
     : HloTestBase(GetTestPlatform(), GetReferencePlatform(),
+                  verifier_layout_sensitive,
                   allow_mixed_precision_in_hlo_verifier) {}
 
 HloTestBase::HloTestBase(se::Platform* test_platform,
                          se::Platform* reference_platform,
+                         bool verifier_layout_sensitive,
                          bool allow_mixed_precision_in_hlo_verifier)
     : test_runner_(test_platform), reference_runner_(reference_platform) {
-  hlo_verifier_ =
-      MakeUnique<HloVerifier>(allow_mixed_precision_in_hlo_verifier);
+  hlo_verifier_ = absl::make_unique<HloVerifier>(
+      /*layout_sensitive=*/verifier_layout_sensitive,
+      /*allow_mixed_precision=*/allow_mixed_precision_in_hlo_verifier);
 }
 
-/* static */
 std::unique_ptr<HloModule> HloTestBase::CreateNewModule(const string& name) {
-  return MakeUnique<HloModule>(name, GetModuleConfigForTest());
+  return absl::make_unique<HloModule>(name, GetModuleConfigForTest());
+}
+
+/* static */
+StatusOr<bool> HloTestBase::RunHloPass(HloPassInterface* hlo_pass,
+                                       HloModule* module) {
+  const string module_str_before_run = module->ToProto().ShortDebugString();
+  const auto status_or = hlo_pass->Run(module);
+  if (status_or.status().ok()) {
+    const string module_str_after_run = module->ToProto().ShortDebugString();
+    if (!status_or.ValueOrDie()) {
+      // Check that the proto remains same.
+      EXPECT_EQ(module_str_after_run, module_str_before_run);
+    }
+  }
+  return status_or;
 }
 
-/*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() {
+DebugOptions HloTestBase::GetDebugOptionsForTest() {
   auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
@@ -199,7 +220,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
       MakeFakeArguments(module.get()).ConsumeValueOrDie();
 
   std::vector<Literal*> fake_argument_ptrs;
-  c_transform(
+  absl::c_transform(
       fake_arguments, std::back_inserter(fake_argument_ptrs),
       [](const std::unique_ptr<Literal>& literal) { return literal.get(); });
 
@@ -213,7 +234,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
   const auto& fake_arguments =
       MakeFakeArguments(module.get()).ConsumeValueOrDie();
   std::vector<Literal*> fake_argument_ptrs;
-  c_transform(
+  absl::c_transform(
       fake_arguments, std::back_inserter(fake_argument_ptrs),
       [](const std::unique_ptr<Literal>& literal) { return literal.get(); });
 
@@ -222,8 +243,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompare(
-    const StringPiece hlo_string,
-    const tensorflow::gtl::optional<ErrorSpec>& error,
+    string_view hlo_string, const absl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto module_or_status =
       HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
@@ -236,7 +256,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
                        reference_preprocessor);
 }
 
-::testing::AssertionResult HloTestBase::Run(const StringPiece hlo_string) {
+::testing::AssertionResult HloTestBase::Run(string_view hlo_string) {
   auto module_or_status =
       HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
   if (!module_or_status.ok()) {
@@ -248,7 +268,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
       MakeFakeArguments(module_or_status.ValueOrDie().get())
           .ConsumeValueOrDie();
   std::vector<Literal*> fake_argument_ptrs;
-  c_transform(
+  absl::c_transform(
       fake_arguments, std::back_inserter(fake_argument_ptrs),
       [](const std::unique_ptr<Literal>& literal) { return literal.get(); });
   return test_runner_
@@ -260,7 +280,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompareFromFile(
-    const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+    const string& filename, const absl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto module_or_status =
       HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest());
@@ -273,8 +293,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
-    const StringPiece hlo_string,
-    const tensorflow::gtl::optional<ErrorSpec>& error,
+    string_view hlo_string, const absl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto module_or_status =
       HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
@@ -288,7 +307,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile(
-    const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+    const string& filename, const absl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto module_or_status =
       HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest());
@@ -301,10 +320,10 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
 }
 
 HloComputation* HloTestBase::FindComputation(HloModule* module,
-                                             tensorflow::StringPiece name) {
+                                             absl::string_view name) {
   auto computations = module->computations();
-  auto it = c_find_if(computations,
-                      [&](HloComputation* c) { return c->name() == name; });
+  auto it = absl::c_find_if(
+      computations, [&](HloComputation* c) { return c->name() == name; });
   if (it == computations.end()) {
     return nullptr;
   }
@@ -312,11 +331,11 @@ HloComputation* HloTestBase::FindComputation(HloModule* module,
 }
 
 HloInstruction* HloTestBase::FindInstruction(HloModule* module,
-                                             tensorflow::StringPiece name) {
+                                             absl::string_view name) {
   for (const HloComputation* c : module->computations()) {
     auto instructions = c->instructions();
-    auto it = c_find_if(instructions,
-                        [&](HloInstruction* i) { return i->name() == name; });
+    auto it = absl::c_find_if(
+        instructions, [&](HloInstruction* i) { return i->name() == name; });
     if (it != instructions.end()) {
       return *it;
     }
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 4232eeceb1..06bcc39741 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -72,20 +72,27 @@ class HloTestBase : public ::testing::Test {
   // options from command-line flags. If you want a fresh HloModule object and
   // then add HloComputations to it, it's recommended to use this method in your
   // tests.
-  static std::unique_ptr<HloModule> CreateNewModule(
-      const string& name = TestName());
+  std::unique_ptr<HloModule> CreateNewModule(const string& name = TestName());
+
+  // Runs the hlo_pass with the provided module and returns the result. This
+  // function also verifies that the module remains unchanged when hlo_pass
+  // returns false as the StatusOr value.
+  static StatusOr<bool> RunHloPass(HloPassInterface* hlo_pass,
+                                   HloModule* module);
 
  protected:
   // This uses the interpreter backend as the reference backend and
   // automatically finds another supported backend as the test backend. If the
   // interpreter is the only supported backend, it will be both the test backend
   // and the reference backend.
-  HloTestBase(bool allow_mixed_precision_in_hlo_verifier = true);
+  HloTestBase(bool verifier_layout_sensitive = false,
+              bool allow_mixed_precision_in_hlo_verifier = true);
 
   // If your test doesn't use interpreter as the reference backend, you can use
   // this constructor. Note that your test target is responsible for linking in
   // both needed backends.
   HloTestBase(se::Platform* test_platform, se::Platform* reference_platform,
+              bool verifier_layout_sensitive = false,
               bool allow_mixed_precision_in_hlo_verifier = true);
 
   ~HloTestBase() override {}
@@ -93,10 +100,13 @@ class HloTestBase : public ::testing::Test {
   // Populates debug options from command-line flags and adjusts the options for
   // testing. It is recommended to use this when you need to pass in
   // DebugOptions, e.g. when creating a module from a string or a file.
-  static DebugOptions GetDebugOptionsForTest();
+  //
+  // This function is virtual so tests can specify an alternative set of debug
+  // options (e.g. disabling additional passes).
+  virtual DebugOptions GetDebugOptionsForTest();
 
   // Gets an HloModuleConfig with options appropriate for tests.
-  static HloModuleConfig GetModuleConfigForTest() {
+  HloModuleConfig GetModuleConfigForTest() {
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsForTest());
     return config;
@@ -131,7 +141,7 @@ class HloTestBase : public ::testing::Test {
   ::testing::AssertionResult RunAndCompare(
       std::unique_ptr<HloModule> module,
       const tensorflow::gtl::ArraySlice<Literal*> arguments,
-      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
 
@@ -140,22 +150,20 @@ class HloTestBase : public ::testing::Test {
   ::testing::AssertionResult RunAndCompareNoHloPasses(
       std::unique_ptr<HloModule> module,
       const tensorflow::gtl::ArraySlice<Literal*> arguments,
-      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
 
   // Executes an hlo module with fake inputs and compares the results.
   ::testing::AssertionResult RunAndCompare(
-      std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::optional<ErrorSpec>& error,
+      std::unique_ptr<HloModule> module, const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
 
   // Same as above, except that the module will be executed without Hlo
   // optimization.
   ::testing::AssertionResult RunAndCompareNoHloPasses(
-      std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::optional<ErrorSpec>& error,
+      std::unique_ptr<HloModule> module, const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
 
@@ -163,23 +171,23 @@ class HloTestBase : public ::testing::Test {
   // input. Module can be passed in directly, or parsed from an hlo_string,
   // or loaded from a file.
   ::testing::AssertionResult RunAndCompare(
-      const tensorflow::StringPiece hlo_string,
-      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const absl::string_view hlo_string,
+      const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
-  ::testing::AssertionResult Run(const tensorflow::StringPiece hlo_string)
+  ::testing::AssertionResult Run(const absl::string_view hlo_string)
       TF_MUST_USE_RESULT;
   ::testing::AssertionResult RunAndCompareFromFile(
-      const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+      const string& filename, const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
   ::testing::AssertionResult RunAndCompareNoHloPasses(
-      const tensorflow::StringPiece hlo_string,
-      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const absl::string_view hlo_string,
+      const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
   ::testing::AssertionResult RunAndCompareNoHloPassesFromFile(
-      const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+      const string& filename, const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
 
@@ -222,10 +230,8 @@ class HloTestBase : public ::testing::Test {
   //
   // This is useful for tests which create HLOs from a string and then want to
   // inspect a particular computation or instruction.
-  HloComputation* FindComputation(HloModule* module,
-                                  tensorflow::StringPiece name);
-  HloInstruction* FindInstruction(HloModule* module,
-                                  tensorflow::StringPiece name);
+  HloComputation* FindComputation(HloModule* module, absl::string_view name);
+  HloInstruction* FindInstruction(HloModule* module, absl::string_view name);
 
   // Return an HLO verifier constructed for the test backend.
   HloVerifier& verifier() const { return *hlo_verifier_; }
@@ -256,7 +262,7 @@ class HloTestBase : public ::testing::Test {
   StatusOr<::testing::AssertionResult> RunAndCompareInternal(
       std::unique_ptr<HloModule> module,
       const tensorflow::gtl::ArraySlice<Literal*> arguments,
-      const tensorflow::gtl::optional<ErrorSpec>& error, bool run_hlo_passes,
+      const absl::optional<ErrorSpec>& error, bool run_hlo_passes,
       const std::function<void(HloModule*)>& reference_preprocessor);
 };
 
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
index ad1f5b9eed..8f86c528d0 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -24,8 +25,11 @@ limitations under the License.
 
 namespace xla {
 
-HloVerifiedTestBase::HloVerifiedTestBase()
-    : shape_verifier_(MakeUnique<ShapeVerifier>()) {}
+HloVerifiedTestBase::HloVerifiedTestBase(bool layout_sensitive,
+                                         bool allow_mixed_precision)
+    : HloTestBase(
+          /*verifier_layout_sensitive=*/layout_sensitive,
+          /*allow_mixed_precision_in_hlo_verifier=*/allow_mixed_precision) {}
 
 HloVerifiedTestBase::~HloVerifiedTestBase() {
   // We can't call the ASSERT or EXPECT test macros in destructors, so we
@@ -50,8 +54,7 @@ void HloVerifiedTestBase::TearDown() {
 }
 
 void HloVerifiedTestBase::VerifyModule(HloModule* module) {
-  HloVerifier verifier(/*allow_mixed_precision=*/true);
-  xla::StatusOr<bool> mutated = verifier.Run(module);
+  xla::StatusOr<bool> mutated = verifier().Run(module);
   if (!mutated.ok()) {
     ADD_FAILURE() << "HloVerifier failed: " << mutated.status();
   } else {
@@ -72,7 +75,7 @@ HloModule* HloVerifiedTestBase::CreateNewModule(const string& name) {
   return modules_.back().get();
 }
 
-void HloVerifiedTestBase::ParseAndVerifyModule(tensorflow::StringPiece hlo_text,
+void HloVerifiedTestBase::ParseAndVerifyModule(absl::string_view hlo_text,
                                                const HloModuleConfig& config) {
   CHECK(!module_) << "Called ParseModule when test already has a module.";
   TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text, config));
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
index 5b28c01c36..cc6967feed 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
@@ -29,7 +29,8 @@ namespace xla {
 // performs verification on that module on tear-down.
 class HloVerifiedTestBase : public HloTestBase {
  protected:
-  HloVerifiedTestBase();
+  explicit HloVerifiedTestBase(bool layout_sensitive,
+                               bool allow_mixed_precision);
   ~HloVerifiedTestBase() override;
 
   // Constructs a default shape verifier.
@@ -44,32 +45,28 @@ class HloVerifiedTestBase : public HloTestBase {
   // Returns the default HloModule, lazily creating it if necessary via
   // HloTestBase::CreateNewModule().
   HloModule& module();
-  void ParseAndVerifyModule(tensorflow::StringPiece hlo_text,
+  void ParseAndVerifyModule(absl::string_view hlo_text,
                             const HloModuleConfig& config = HloModuleConfig());
 
-  // Sets the shape-size function used during hlo verification. If this isn't
-  // called, a default ShapeVerifier is used instead.
-  void SetShapeVerifier(std::unique_ptr<ShapeVerifier> shape_verifier) {
-    shape_verifier_ = std::move(shape_verifier);
-  }
-
   // Creates a new module for a test, and stores it in modules_ so it can be
   // verified. Intentionally hides HloTestBase::CreateNewModule, to prevent
   // creation of unverified modules.
   HloModule* CreateNewModule(const string& name = TestName());
 
+ private:
+  void VerifyModule(HloModule* module);
+
   // It is confusing to store modules created by module() and CreateNewModule()
   // in different fields, but it allows us to migrate tests to
   // HloVerifiedTestBase more easily, so it's a win because we can verify more
   // modules. See b/80488902.
- private:
+  //
   // Lazily populated. Access via module().
   std::unique_ptr<HloModule> module_;
   // Populated by calls to CreateNewModule.
   std::vector<std::unique_ptr<HloModule>> modules_;
-  std::unique_ptr<ShapeVerifier> shape_verifier_;
+
   bool tear_down_called_ = false;
-  static void VerifyModule(HloModule* module);
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index cde1dcd9cd..a4e3a998fc 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -94,7 +94,7 @@ void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual,
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
     const LiteralSlice& expected, const LiteralSlice& actual,
-    const tensorflow::gtl::optional<ErrorSpec>& error) {
+    const absl::optional<ErrorSpec>& error) {
   if (error.has_value()) {
     VLOG(1) << "Expects near";
     return StatusToAssertion(literal_comparison::Near(
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 31a099c15f..3dad91951e 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <random>
 #include <string>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
@@ -33,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -146,7 +146,7 @@ class LiteralTestUtil {
   // will be compared recursively.
   static ::testing::AssertionResult NearOrEqual(
       const LiteralSlice& expected, const LiteralSlice& actual,
-      const tensorflow::gtl::optional<ErrorSpec>& error) TF_MUST_USE_RESULT;
+      const absl::optional<ErrorSpec>& error) TF_MUST_USE_RESULT;
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(LiteralTestUtil);
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index f297b2b847..4151bfae03 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -20,9 +20,9 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -80,7 +80,7 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
   std::vector<string> results;
   TF_CHECK_OK(env->GetMatchingPaths(pattern, &results));
 
-  LOG(INFO) << "results: [" << tensorflow::str_util::Join(results, ", ") << "]";
+  LOG(INFO) << "results: [" << absl::StrJoin(results, ", ") << "]";
   EXPECT_EQ(3, results.size());
   for (const string& result : results) {
     LiteralProto literal_proto;
@@ -105,8 +105,10 @@ TEST(LiteralTestUtilTest, NotEqualHasValuesInMessage) {
   auto actual = LiteralUtil::CreateR1<int32>({4, 5, 6});
   ::testing::AssertionResult result =
       LiteralTestUtil::Equal(*expected, *actual);
-  EXPECT_THAT(result.message(), ::testing::HasSubstr("expected: {1, 2, 3}"));
-  EXPECT_THAT(result.message(), ::testing::HasSubstr("actual:   {4, 5, 6}"));
+  EXPECT_THAT(result.message(),
+              ::testing::HasSubstr("Expected literal:\n{1, 2, 3}"));
+  EXPECT_THAT(result.message(),
+              ::testing::HasSubstr("Actual literal:\n{4, 5, 6}"));
 }
 
 TEST(LiteralTestUtilTest, NearComparatorR1) {
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index e719da54d4..8d65869557 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/llvm_compiler.h"
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
@@ -125,7 +126,7 @@ class LLVMCompilerTest : public ::testing::Test {
   static std::unique_ptr<HloModule> CreateNewModule() {
     HloModuleConfig config;
     config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-    return MakeUnique<HloModule>(TestName(), config);
+    return absl::make_unique<HloModule>(TestName(), config);
   }
 };
 
diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
index 6fc1115097..0487d31409 100644
--- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc
@@ -51,8 +51,9 @@ void LlvmIrGenTestBase::CompileAndVerifyIr(
     std::unique_ptr<HloModule> hlo_module, const string& pattern,
     bool match_optimized_ir) {
   SetIrHook(match_optimized_ir);
-  TF_ASSERT_OK(CompileToExecutable(std::move(hlo_module)).status());
+  Status status = CompileToExecutable(std::move(hlo_module)).status();
   ResetIrHook();
+  TF_ASSERT_OK(status);
 
   StatusOr<bool> filecheck_result = RunFileCheck(ir_, pattern);
   TF_ASSERT_OK(filecheck_result.status());
@@ -73,9 +74,10 @@ void LlvmIrGenTestBase::CompileAheadOfTimeAndVerifyIr(
     std::unique_ptr<HloModule> hlo_module, const AotCompilationOptions& options,
     const string& pattern, bool match_optimized_ir) {
   SetIrHook(match_optimized_ir);
-  TF_ASSERT_OK(
-      CompileToAotCompilationResult(std::move(hlo_module), options).status());
+  Status status =
+      CompileToAotCompilationResult(std::move(hlo_module), options).status();
   ResetIrHook();
+  TF_ASSERT_OK(status);
 
   StatusOr<bool> filecheck_result = RunFileCheck(ir_, pattern);
   ASSERT_TRUE(filecheck_result.ok());
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
index e2cd5bcc5a..237a4a361e 100644
--- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -24,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -53,7 +53,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) {
   // deallocation happen on the right allocator.
   ExecutableRunOptions options;
   options.set_allocator(allocator);
-  tensorflow::gtl::optional<ScopedShapedBuffer> result =
+  absl::optional<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {},
                           DefaultExecutableBuildOptions(), options);
 
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index eaddf756db..948b60061e 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index da8c42d465..7956a034f8 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -17,12 +17,13 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -133,7 +134,7 @@ class TestLinspaceMaxParametric
     float from = -128.0, to = 256.0;
     std::unique_ptr<Array2D<T>> alhs =
         MakeLinspaceArray2D<T>(from, to, rows, cols);
-    auto arhs = MakeUnique<Array2D<T>>(rows, cols, static_cast<T>(1.0f));
+    auto arhs = absl::make_unique<Array2D<T>>(rows, cols, static_cast<T>(1.0f));
 
     XlaBuilder builder(
         tensorflow::strings::Printf("max_%lldx%lld_linspace", rows, cols));
@@ -158,7 +159,7 @@ class TestLinspaceMaxParametric
 string PrintTestLinspaceMaxParam(
     const ::testing::TestParamInfo<TestLinspaceMaxParam>& test_param) {
   const TestLinspaceMaxParam& param = test_param.param;
-  return tensorflow::strings::StrCat(param.rows, "r", param.cols, "c");
+  return absl::StrCat(param.rows, "r", param.cols, "c");
 }
 
 #ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index eb06b115da..16b77e965d 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -19,10 +19,11 @@ limitations under the License.
 #include <new>
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -52,12 +53,22 @@ class MultiOutputFusionTest : public HloTestBase {
  protected:
   MultiOutputFusionTest() { error_spec_ = ErrorSpec{0.0001, 1e-2}; }
 
+  // Layout assignment assumes that there are no fusions in the input graph.
+  // Since the purpose of this test is to send pre-fused graphs to XLA, we have
+  // to do layout assignment ourselves.
+  DebugOptions GetDebugOptionsForTest() override {
+    auto opts = HloTestBase::GetDebugOptionsForTest();
+    opts.add_xla_disable_hlo_passes("layout-assignment");
+    return opts;
+  }
+
   void RunTest2D(bool manual_fusion, int64 size) {
     auto builder = HloComputation::Builder(TestName());
     auto hlo_module = CreateNewModule();
 
-    const Shape elem_shape0 = ShapeUtil::MakeShape(F32, {});
-    const Shape elem_shape2 = ShapeUtil::MakeShape(F32, {size, size});
+    const Shape elem_shape0 = ShapeUtil::MakeShapeWithLayout(F32, {}, {});
+    const Shape elem_shape2 =
+        ShapeUtil::MakeShapeWithLayout(F32, {size, size}, {1, 0});
 
     auto const0 = builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(8.0f)));
@@ -100,10 +111,10 @@ class MultiOutputFusionTest : public HloTestBase {
           nullptr);
     }
 
-    Literal arg1(ShapeUtil::MakeShape(F32, {size, size}));
+    Literal arg1(ShapeUtil::MakeShapeWithDescendingLayout(F32, {size, size}));
     arg1.PopulateWithValue<float>(2.5f);
 
-    Literal expect(ShapeUtil::MakeShape(F32, {size, size}));
+    Literal expect(ShapeUtil::MakeShapeWithDescendingLayout(F32, {size, size}));
     expect.PopulateWithValue<float>(size * 1.5f * 3.5f);
     auto actual =
         ExecuteAndTransfer(std::move(hlo_module),
@@ -115,8 +126,10 @@ class MultiOutputFusionTest : public HloTestBase {
     auto builder = HloComputation::Builder(TestName());
     auto hlo_module = CreateNewModule();
 
-    const Shape elem_shape_F32 = ShapeUtil::MakeShape(F32, {size});
-    const Shape elem_shape_U8 = ShapeUtil::MakeShape(F64, {size});
+    const Shape elem_shape_F32 =
+        ShapeUtil::MakeShapeWithDescendingLayout(F32, {size});
+    const Shape elem_shape_U8 =
+        ShapeUtil::MakeShapeWithDescendingLayout(F64, {size});
     auto param0 = builder.AddInstruction(
         HloInstruction::CreateParameter(0, elem_shape_F32, "0"));
     auto param1 = builder.AddInstruction(
@@ -136,12 +149,13 @@ class MultiOutputFusionTest : public HloTestBase {
 
     HloInstruction* reshape =
         builder.AddInstruction(HloInstruction::CreateReshape(
-            ShapeUtil::MakeShape(F32, {size, 1}), add));
+            ShapeUtil::MakeShapeWithDescendingLayout(F32, {size, 1}), add));
     DotDimensionNumbers dot_dnums;
     dot_dnums.add_lhs_contracting_dimensions(0);
     dot_dnums.add_rhs_contracting_dimensions(0);
     HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
-        ShapeUtil::MakeShape(F32, {1}), sub, reshape, dot_dnums));
+        ShapeUtil::MakeShapeWithDescendingLayout(F32, {1}), sub, reshape,
+        dot_dnums));
     auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
 
     if (manual_fusion) {
@@ -161,9 +175,9 @@ class MultiOutputFusionTest : public HloTestBase {
                nullptr);
     }
 
-    Literal input0(ShapeUtil::MakeShape(F32, {size}));
+    Literal input0(ShapeUtil::MakeShapeWithDescendingLayout(F32, {size}));
     input0.PopulateWithValue(2.5f);
-    Literal input1(ShapeUtil::MakeShape(F64, {size}));
+    Literal input1(ShapeUtil::MakeShapeWithDescendingLayout(F64, {size}));
     input1.PopulateWithValue(1.);
 
     Literal expect =
@@ -291,7 +305,7 @@ const char* const kScalarOps = R"(
 
 XLA_TEST_F(MultiOutputFusionTest,
            DISABLED_ON_CPU(MultiOutputReduceFusionMinor)) {
-  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+  const string testcase = absl::StrCat(kScalarOps, R"(
     fused_reduce {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
       c0 = f32[] constant(0)
@@ -323,7 +337,7 @@ XLA_TEST_F(MultiOutputFusionTest,
 
 XLA_TEST_F(MultiOutputFusionTest,
            DISABLED_ON_CPU(MultiOutputReduceFusionMajor)) {
-  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+  const string testcase = absl::StrCat(kScalarOps, R"(
     fused_reduce {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
       c0 = f32[] constant(0)
@@ -355,7 +369,7 @@ XLA_TEST_F(MultiOutputFusionTest,
 
 XLA_TEST_F(MultiOutputFusionTest,
            DISABLED_ON_CPU(MultiOutputReduceFusionScalar)) {
-  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+  const string testcase = absl::StrCat(kScalarOps, R"(
     fused_reduce {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
       c0 = f32[] constant(0)
@@ -388,7 +402,7 @@ XLA_TEST_F(MultiOutputFusionTest,
 
 XLA_TEST_F(MultiOutputFusionTest,
            DISABLED_ON_CPU(MultiOutputReduceFusionMinorWithExtraOutput)) {
-  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+  const string testcase = absl::StrCat(kScalarOps, R"(
     fused_reduce {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
       c0 = f32[] constant(0)
@@ -422,7 +436,7 @@ XLA_TEST_F(MultiOutputFusionTest,
 
 XLA_TEST_F(MultiOutputFusionTest,
            DISABLED_ON_CPU(MultiOutputReduceFusionMajorWithExtraOutput)) {
-  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+  const string testcase = absl::StrCat(kScalarOps, R"(
     fused_reduce {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
       c0 = f32[] constant(0)
@@ -457,7 +471,7 @@ XLA_TEST_F(MultiOutputFusionTest,
 
 XLA_TEST_F(MultiOutputFusionTest,
            DISABLED_ON_CPU(MultiOutputReduceFusionScalarWithExtraOutput)) {
-  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+  const string testcase = absl::StrCat(kScalarOps, R"(
     fused_reduce {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
       c0 = f32[] constant(0)
@@ -494,7 +508,7 @@ XLA_TEST_F(MultiOutputFusionTest,
 
 XLA_TEST_F(MultiOutputFusionTest,
            DISABLED_ON_CPU(MultiOutputReduceFusionNonConstInit)) {
-  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+  const string testcase = absl::StrCat(kScalarOps, R"(
     fused_reduce {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
       init1 = f32[] parameter(1)
@@ -529,7 +543,7 @@ XLA_TEST_F(MultiOutputFusionTest,
 
 XLA_TEST_F(MultiOutputFusionTest,
            DISABLED_ON_CPU(MultiOutputReduceFusionDifferentElementTypes)) {
-  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+  const string testcase = absl::StrCat(kScalarOps, R"(
     fused_reduce (p0: f16[2,2,2]) -> (f32[2,2], f32[2,2], f16[2,2,2]) {
       p0 = f16[2,2,2]{2,1,0} parameter(0)
       convert = f32[2,2,2]{2,1,0} convert(p0)
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index ca21b0b2ba..cbeddffacf 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -16,12 +16,12 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -140,7 +140,7 @@ XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
 
 TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
   XlaBuilder b(TestName());
-  auto input = MakeUnique<Array4D<float>>(1, 1, 3, 2);
+  auto input = absl::make_unique<Array4D<float>>(1, 1, 3, 2);
   Array2D<float> input_xy({
       {1.0f, 2.0f},  // row 0
       {3.0f, 4.0f},  // row 1
@@ -151,7 +151,7 @@ TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
   Pad(AddParam(*input, &b), AddParam(*LiteralUtil::CreateR0<float>(1.5), &b),
       r4_padding_on_dim0_dim1_);
 
-  auto expected = MakeUnique<Array4D<float>>(2, 3, 3, 2);
+  auto expected = absl::make_unique<Array4D<float>>(2, 3, 3, 2);
   expected->Fill(1.5);
   (*expected)(1, 0, 0, 0) = 1.0f;
   (*expected)(1, 0, 0, 1) = 2.0f;
@@ -171,7 +171,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
       AddParam(*LiteralUtil::CreateR0<float>(pad_value), &b),
       r4_padding_on_dim0_dim1_);
 
-  auto expected = MakeUnique<Array4D<float>>(8, 5, 1, 1);
+  auto expected = absl::make_unique<Array4D<float>>(8, 5, 1, 1);
   expected->Fill(pad_value);
   (*expected)(1, 0, 0, 0) = 1.0f;
   (*expected)(1, 2, 0, 0) = 2.0f;
@@ -269,7 +269,7 @@ XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
 
 XLA_TEST_F(PadTest, Pad4DU8Array) {
   XlaBuilder b(TestName());
-  auto input = MakeUnique<Array4D<uint8>>(1, 1, 3, 2);
+  auto input = absl::make_unique<Array4D<uint8>>(1, 1, 3, 2);
   Array2D<uint8> input_xy({
       {1, 2},  // row 0
       {3, 4},  // row 1
@@ -280,7 +280,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) {
   Pad(AddParam(*input, &b), ConstantR0<uint8>(&b, 35),
       r4_padding_on_dim0_dim1_);
 
-  auto expected = MakeUnique<Array4D<uint8>>(2, 3, 3, 2);
+  auto expected = absl::make_unique<Array4D<uint8>>(2, 3, 3, 2);
   expected->Fill(35);
   (*expected)(1, 0, 0, 0) = 1;
   (*expected)(1, 0, 0, 1) = 2;
@@ -301,13 +301,13 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
       Pad(input, ConstantR0<bool>(&b, false), r4_padding_on_dim0_dim1_);
 
   // For the same reason, use Select to convert boolean values to int32.
-  auto zeros = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
-  auto ones = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
+  auto zeros = absl::make_unique<Array4D<int32>>(2, 3, 3, 2);
+  auto ones = absl::make_unique<Array4D<int32>>(2, 3, 3, 2);
   zeros->Fill(0);
   ones->Fill(1);
   Select(padded, AddParam(*ones, &b), AddParam(*zeros, &b));
 
-  auto expected = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
+  auto expected = absl::make_unique<Array4D<int32>>(2, 3, 3, 2);
   expected->Fill(0);
   (*expected)(1, 0, 0, 0) = 1;
   (*expected)(1, 0, 0, 1) = 1;
@@ -321,7 +321,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
 XLA_TEST_P(PadTestFloat, Large2DPad) {
   XlaBuilder b(TestName());
 
-  auto ones = MakeUnique<Array2D<float>>(4, 4);
+  auto ones = absl::make_unique<Array2D<float>>(4, 4);
   ones->Fill(1.0f);
   auto input = AddParam(*ones, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
@@ -342,7 +342,7 @@ XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
 
   constexpr int64 in_rows = 35;
   constexpr int64 in_cols = 35;
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  auto operand = absl::make_unique<Array2D<float>>(in_rows, in_cols);
   operand->FillUnique(0.0f);
   auto input = AddParam(*operand, &b);
 
@@ -368,7 +368,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
   constexpr int64 low_padding = 0;
   int64 high_padding[2] = {5, 7};
   constexpr int64 interior_padding = 0;
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  auto operand = absl::make_unique<Array2D<float>>(in_rows, in_cols);
   operand->FillUnique(1.0f);
   auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
@@ -395,7 +395,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
   int64 low_padding[2] = {-1, -2};
   int64 high_padding[2] = {-3, 4};
   constexpr int64 interior_padding = 0;
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  auto operand = absl::make_unique<Array2D<float>>(in_rows, in_cols);
   operand->FillUnique(1.0f);
   auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
@@ -423,7 +423,7 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
   int64 low_padding[2] = {4, -1};
   int64 high_padding[2] = {-2, -4};
   int64 interior_padding[2] = {1, 2};
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  auto operand = absl::make_unique<Array2D<float>>(in_rows, in_cols);
   operand->FillUnique(1.0f);
   auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
@@ -446,7 +446,7 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
 // Regression test for b/31827337.
 XLA_TEST_P(PadTestFloat, ReducePad) {
   XlaBuilder b(TestName());
-  auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
+  auto ones = absl::make_unique<Array4D<float>>(2, 2, 2, 2);
   ones->Fill(1.0);
   auto input = AddParam(*ones, &b);
 
diff --git a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
index a080dd1732..9af9ea4a22 100644
--- a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
@@ -15,11 +15,11 @@ limitations under the License.
 
 #include <array>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -29,16 +29,13 @@ limitations under the License.
 namespace xla {
 namespace {
 
-namespace str_util = tensorflow::str_util;
-namespace strings = tensorflow::strings;
-
 struct ReduceLayout {
   std::array<int64, 4> input_minor_to_major;
   std::array<int64, 3> output_minor_to_major;
 
   string ToString() const {
-    return strings::StrCat(str_util::Join(input_minor_to_major, "x"), "_",
-                           str_util::Join(output_minor_to_major, "x"));
+    return absl::StrCat(absl::StrJoin(input_minor_to_major, "x"), "_",
+                        absl::StrJoin(output_minor_to_major, "x"));
   }
 };
 
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index 531648fe3e..0916a07f4f 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -57,8 +58,8 @@ static const int mantissa_sizes[] = {23, 10, 23, 10};
 
 string TestDataToString(const ::testing::TestParamInfo<int> data) {
   int i = data.param;
-  return tensorflow::strings::StrCat(exponent_sizes[i], "_exponent_bits_",
-                                     mantissa_sizes[i], "_mantissa_bits");
+  return absl::StrCat(exponent_sizes[i], "_exponent_bits_", mantissa_sizes[i],
+                      "_mantissa_bits");
 }
 
 // The FPVAL macro allows us to write out the binary representation of the
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 2065271a7f..b93d838349 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
@@ -559,9 +560,9 @@ void PrintTo(const BoundsLayout& spec, std::ostream* os) {
   *os << tensorflow::strings::Printf(
       "R%luToR%lu%s_%s_Reduce%s", spec.bounds.size(),
       spec.bounds.size() - spec.reduce_dims.size(),
-      tensorflow::str_util::Join(spec.bounds, "x").c_str(),
-      tensorflow::str_util::Join(spec.layout, "").c_str(),
-      tensorflow::str_util::Join(spec.reduce_dims, "").c_str());
+      absl::StrJoin(spec.bounds, "x").c_str(),
+      absl::StrJoin(spec.layout, "").c_str(),
+      absl::StrJoin(spec.reduce_dims, "").c_str());
 }
 
 // Add-reduces a broadcasted scalar matrix among dimension 1 and 0.
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 92c93f08b2..60167619a4 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -18,6 +18,9 @@ limitations under the License.
 #include <limits>
 #include <memory>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
@@ -357,7 +360,7 @@ XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) {
   std::vector<int64> input_dims(6, 8);
   auto shape = ShapeUtil::MakeShape(F32, input_dims);
 
-  auto arg_literal = MakeUnique<Literal>(shape);
+  auto arg_literal = absl::make_unique<Literal>(shape);
   arg_literal->PopulateWithValue(1.0f);
   const auto input = CreateConstantFromLiteral(*arg_literal, &builder_);
 
@@ -368,7 +371,7 @@ XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) {
   std::vector<int64> output_dims = {6, 8, 6, 6, 8, 8};
   Shape result_shape =
       ShapeUtil::MakeShapeWithLayout(F32, output_dims, output_layout);
-  auto expected = MakeUnique<Literal>(result_shape);
+  auto expected = absl::make_unique<Literal>(result_shape);
   expected->PopulateWithValue(27.0f);
   ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec());
 }
@@ -578,21 +581,20 @@ string R4ReduceWindowTestDataToString(
     const ::testing::TestParamInfo<
         ::testing::tuple<R4ReduceWindowTestData, bool>>& data) {
   const auto& param = ::testing::get<0>(data.param);
-  string str = tensorflow::strings::StrCat(
-      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
-      "__window_bounds_",
-      tensorflow::str_util::Join(param.window_bounds, "x"),            //
-      "__strides_", tensorflow::str_util::Join(param.strides, "x"),    //
-      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),    //
-      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),  //
-      "__layout_", tensorflow::str_util::Join(param.layout, "_"),      //
+  string str = absl::StrCat(
+      "base_bounds_", absl::StrJoin(param.base_bounds, "x"),        //
+      "__window_bounds_", absl::StrJoin(param.window_bounds, "x"),  //
+      "__strides_", absl::StrJoin(param.strides, "x"),              //
+      "__pad_low_", absl::StrJoin(param.pad_low, "x"),              //
+      "__pad_high_", absl::StrJoin(param.pad_high, "x"),            //
+      "__layout_", absl::StrJoin(param.layout, "_"),                //
       (param.reducer == kAdd) ? "_add" : "_max");
   CHECK(param.reducer == kAdd || param.reducer == kMax);
 
   // Test names are not allowed to contain the '-' character.
   std::replace(str.begin(), str.end(), '-', 'n');
   if (::testing::get<1>(data.param)) {
-    str = tensorflow::strings::StrCat(str, "_bfloat16");
+    str = absl::StrCat(str, "_bfloat16");
   }
   return str;
 }
@@ -934,15 +936,15 @@ string R3ReduceWindowTestDataToString(
     const ::testing::TestParamInfo<
         ::testing::tuple<R3ReduceWindowTestData, bool>>& data) {
   const auto& param = ::testing::get<0>(data.param);
-  string str = tensorflow::strings::StrCat(
-      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),
-      "__window_bounds_", tensorflow::str_util::Join(param.window_bounds, "x"),
-      "__strides_", tensorflow::str_util::Join(param.strides, "x"),
-      "__padding_", param.padding == Padding::kSame ? "same" : "valid",
-      "__layout_", param.layout[0], "_", param.layout[1], "_", param.layout[2],
-      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  string str = absl::StrCat(
+      "base_bounds_", absl::StrJoin(param.base_bounds, "x"), "__window_bounds_",
+      absl::StrJoin(param.window_bounds, "x"), "__strides_",
+      absl::StrJoin(param.strides, "x"), "__padding_",
+      param.padding == Padding::kSame ? "same" : "valid", "__layout_",
+      param.layout[0], "_", param.layout[1], "_", param.layout[2], "__reducer_",
+      param.reducer == kAdd ? "add" : "max");
   if (::testing::get<1>(data.param)) {
-    str = tensorflow::strings::StrCat(str, "_bfloat16");
+    str = absl::StrCat(str, "_bfloat16");
   }
   return str;
 }
@@ -1068,17 +1070,16 @@ string R2ReduceWindowTestDataToString(
     const ::testing::TestParamInfo<
         ::testing::tuple<R2ReduceWindowTestData, bool>>& data) {
   const auto& param = ::testing::get<0>(data.param);
-  string str = tensorflow::strings::StrCat(
-      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
-      "__window_bounds_",
-      tensorflow::str_util::Join(param.window_bounds, "x"),          //
-      "__strides_", tensorflow::str_util::Join(param.strides, "x"),  //
-      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),
-      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),
-      "__layout_", param.layout[0], "_", param.layout[1],  //
+  string str = absl::StrCat(
+      "base_bounds_", absl::StrJoin(param.base_bounds, "x"),        //
+      "__window_bounds_", absl::StrJoin(param.window_bounds, "x"),  //
+      "__strides_", absl::StrJoin(param.strides, "x"),              //
+      "__pad_low_", absl::StrJoin(param.pad_low, "x"), "__pad_high_",
+      absl::StrJoin(param.pad_high, "x"), "__layout_", param.layout[0], "_",
+      param.layout[1],  //
       "__reducer_", param.reducer == kAdd ? "add" : "max");
   if (::testing::get<1>(data.param)) {
-    str = tensorflow::strings::StrCat(str, "_bfloat16");
+    str = absl::StrCat(str, "_bfloat16");
   }
   return str;
 }
@@ -1261,21 +1262,27 @@ struct R1ReduceWindowTestData {
      /*pad_low=*/{5},
      /*pad_high=*/{0},
      /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{4096}, /*window_bounds=*/{4096},
+     /*strides=*/{1},
+     /*pad_low=*/{4095},
+     /*pad_high=*/{0},
+     /*reducer=*/Reducer::kMax},
 };
 
 string R1ReduceWindowTestDataToString(
     const ::testing::TestParamInfo<
         ::testing::tuple<R1ReduceWindowTestData, bool>>& data) {
   const auto& param = ::testing::get<0>(data.param);
-  string str = tensorflow::strings::StrCat(
-      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),
-      "__window_bounds_", tensorflow::str_util::Join(param.window_bounds, "x"),
-      "__strides_", tensorflow::str_util::Join(param.strides, "x"),
-      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),
-      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),
-      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  string str =
+      absl::StrCat("base_bounds_", absl::StrJoin(param.base_bounds, "x"),
+                   "__window_bounds_", absl::StrJoin(param.window_bounds, "x"),
+                   "__strides_", absl::StrJoin(param.strides, "x"),
+                   "__pad_low_", absl::StrJoin(param.pad_low, "x"),
+                   "__pad_high_", absl::StrJoin(param.pad_high, "x"),
+                   "__reducer_", param.reducer == kAdd ? "add" : "max");
   if (::testing::get<1>(data.param)) {
-    str = tensorflow::strings::StrCat(str, "_bfloat16");
+    str = absl::StrCat(str, "_bfloat16");
   }
   return str;
 }
@@ -1442,7 +1449,7 @@ ENTRY reduce-window-identity {
 }
 
 )";
-  EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
+  EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt));
 }
 
 XLA_TEST_F(HloTestBase, ReduceWindowS32) {
@@ -1461,7 +1468,7 @@ ENTRY %reduce-window (parameter.0: s32[81,8], parameter.1: s32[]) -> s32[82,8] {
 }
 
 )";
-  EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
+  EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt));
 }
 
 XLA_TEST_F(HloTestBase, ReduceWindowF16) {
@@ -1480,7 +1487,7 @@ ENTRY %reduce-window (parameter.0: f16[81,8], parameter.1: f16[]) -> f16[82,8] {
 }
 
 )";
-  EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt));
+  EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 41e49b4003..60084f143d 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -43,10 +44,8 @@ struct ReverseSpec {
 
   string ToTestCaseName() const {
     return tensorflow::strings::Printf(
-        "reverse_%s_in_dims_%s_%s",
-        tensorflow::str_util::Join(input_dims, "x").c_str(),
-        tensorflow::str_util::Join(reversal, "x").c_str(),
-        use_bfloat16 ? "bf16" : "f32");
+        "reverse_%s_in_dims_%s_%s", absl::StrJoin(input_dims, "x").c_str(),
+        absl::StrJoin(reversal, "x").c_str(), use_bfloat16 ? "bf16" : "f32");
   }
 };
 
diff --git a/tensorflow/compiler/xla/tests/sample_text_test.cc b/tensorflow/compiler/xla/tests/sample_text_test.cc
index b4f2b74e3d..2b03a0b0b2 100644
--- a/tensorflow/compiler/xla/tests/sample_text_test.cc
+++ b/tensorflow/compiler/xla/tests/sample_text_test.cc
@@ -19,18 +19,18 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace {
 
-using tensorflow::gtl::nullopt;
+using absl::nullopt;
 
 class SampleTextTest : public HloTestBase {};
 
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index e42c71eb28..cf2d453f43 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <limits>
 #include <memory>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc
new file mode 100644
index 0000000000..99eeb12e2b
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/scatter_test.cc
@@ -0,0 +1,615 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+using absl::nullopt;
+
+class ScatterTest : public HloTestBase {
+ protected:
+  void RunTest(const string& hlo_text, Literal* operand,
+               Literal* scatter_indices, Literal* updates) {
+    RunTest(hlo_text, {operand, scatter_indices, updates});
+  }
+
+  void RunTest(const string& hlo_text,
+               tensorflow::gtl::ArraySlice<Literal*> args) {
+    HloModuleConfig config;
+    config.set_debug_options(GetDebugOptionsForTest());
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                            ParseHloString(hlo_text, config));
+    EXPECT_TRUE(RunAndCompare(std::move(module), args, nullopt));
+  }
+};
+
+XLA_TEST_F(ScatterTest, TensorFlowScatterV1_Update) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatterV1
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatterV2_Update) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterV2
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[3,2] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={0},
+      inserted_window_dims={1},
+      scatter_dims_to_operand_dims={1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 30}, {40, 60}, {70, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_Add) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatter_Add
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_Mul) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatter_Mul
+
+mul_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT mul = s32[] multiply(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=mul_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_F32) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatter_F32
+
+add_f32 (lhs: f32[], rhs: f32[]) -> f32[] {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(f32[] lhs, f32[] rhs)
+}
+
+ENTRY main {
+  operand = f32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = f32[2,3] parameter(2)
+  ROOT scatter = f32[3,3] scatter(operand, indices, updates),
+      to_apply=add_f32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<float>(
+      {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({2, 1});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<float>({{0.4, 1.1, 0.7}, {2.3, 3.1, 1.6}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_RepeatedIndices) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,3] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({1, 1});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatter_MultipleBatchDims) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterMultipleBatchDims
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,3,2] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={1},
+      inserted_window_dims={1},
+      scatter_dims_to_operand_dims={1},
+      index_vector_dim=2
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10, 30}, {40, 60}, {70, 90}}, {{5, 5}, {5, 5}, {5, 5}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatterNd) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterNd
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3,3,2] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0,1},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                    {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                    {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{-10, 10}, {-40, 40}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, TensorFlowScatterNd_NonDefaultIndexVectorDim) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterNdNonDefaultIndexVectorDim
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3,2] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3,3,2] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0,1},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
+                                    {{-4, 4}, {-5, 5}, {-6, 6}},  //
+                                    {{-7, 7}, {-8, 8}, {-9, 9}}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{0, 0}, {1, 0}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{-10, 10}, {-20, 20}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, DynamicUpdateSlice) {
+  const char* hlo_text = R"(
+HloModule DynamicUpdateSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[1,1] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={0,1},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({1, 1});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR2<int32>({{10}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, BatchDynamicUpdateSlice) {
+  const char* hlo_text = R"(
+HloModule BatchDynamicUpdateSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2,2] parameter(1)
+  updates = s32[2,1,1] parameter(2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR3<int32>({{{10}}, {{20}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, ZeroDimBounds) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatter_ZeroDimBounds
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,0] parameter(0)
+  indices = s32[2] parameter(1)
+  updates = s32[2,0] parameter(2)
+  ROOT scatter = s32[3,0] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR1<int32>({0, 2});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR2<int32>({{}, {}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, NoUpdateWindowDims) {
+  const string hlo_text = R"(
+HloModule Scatter_NoUpdateWindowDims
+
+add_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT add = s32[] add(s32[] lhs, s32[] rhs)
+}
+
+ENTRY main {
+  operand = s32[3] parameter(0)
+  indices = s32[2,2,1] parameter(1)
+  updates = s32[2,2] parameter(2)
+  ROOT scatter = s32[3] scatter(operand, indices, updates),
+      to_apply=add_s32,
+      update_window_dims={},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=2
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
+  std::unique_ptr<Literal> scatter_indices =
+      LiteralUtil::CreateR3<int32>({{{0}, {1}}, {{2}, {1}}});
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR2<int32>({{10, 20}, {30, 40}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, OutOfBoundsIndex) {
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = s32[6,2]{1,0} parameter(1)
+  updates = s32[6,1,1]{2,1,0} parameter(2)
+  ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR2<int32>(
+      {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483647, 1}, {1, 2}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, OutOfBoundsUnsignedIndex) {
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = u32[6,2]{1,0} parameter(1)
+  updates = s32[6,1,1]{2,1,0} parameter(2)
+  ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR2<uint32>(
+      {{2, 7}, {2, 1}, {1, 1}, {5, 1}, {2147483648u, 1}, {1, 2}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, NegativeIndex) {
+  const string hlo_text = R"(
+HloModule BatchDynamicSlice
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3,3]{1,0} parameter(0)
+  indices = s32[6,2]{1,0} parameter(1)
+  updates = s32[6,1,1]{2,1,0} parameter(2)
+  ROOT scatter = s32[3,3]{1,0} scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR2<int32>(
+      {{2, 7}, {2, 1}, {1, 1}, {-500, 1}, {-2147483648, 1}, {1, 2}});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR3<int32>(
+      {{{10}}, {{20}}, {{30}}, {{40}}, {{50}}, {{60}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, OneScalarIndex) {
+  const char* hlo_text = R"(
+HloModule OneScalarIndex
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[2,3,2]{2,1,0} parameter(0)
+  index = s32[] parameter(1)
+  updates = s32[1,3,2]{2,1,0} parameter(2)
+  ROOT scatter = s32[2,3,2]{2,1,0} scatter(operand, index, updates),
+      to_apply=update_s32,
+      update_window_dims={0,1,2},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR3<int32>(
+      {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR0<int32>(1);
+  std::unique_ptr<Literal> updates =
+      LiteralUtil::CreateR3<int32>({{{10, 20}, {30, 40}, {50, 60}}});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, ScalarUpdate) {
+  const char* hlo_text = R"(
+HloModule ScalarUpdate
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[4]{0} parameter(0)
+  index = s32[] parameter(1)
+  updates = s32[] parameter(2)
+  ROOT scatter = s32[4]{0} scatter(operand, index, updates),
+      to_apply=update_s32,
+      update_window_dims={},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=0
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({1, 2, 3, 4});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR0<int32>(1);
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR0<int32>(25);
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+XLA_TEST_F(ScatterTest, EmptyIndices) {
+  const string hlo_text = R"(
+HloModule EmptyIndices
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  operand = s32[3] parameter(0)
+  indices = s32[0] parameter(1)
+  updates = s32[0] parameter(2)
+  ROOT scatter = s32[3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  std::unique_ptr<Literal> operand = LiteralUtil::CreateR1<int32>({1, 2, 3});
+  std::unique_ptr<Literal> scatter_indices = LiteralUtil::CreateR1<int32>({});
+  std::unique_ptr<Literal> updates = LiteralUtil::CreateR1<int32>({});
+  RunTest(hlo_text, operand.get(), scatter_indices.get(), updates.get());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index b8ad6668f8..c57bbbd1e4 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -18,6 +18,9 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -33,8 +36,6 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using ::tensorflow::str_util::Join;
-
 class SliceTest : public ClientLibraryTestBase {};
 
 TEST_F(SliceTest, Slice3x3x3_To_3x3x1_F32) {
@@ -195,7 +196,7 @@ class SliceR1Test : public ClientLibraryTestBase,
   void Run(const R1Spec& spec) {
     // This can't be an std::vector, since you can't grab an ArraySlice of a
     // vector<bool>.
-    tensorflow::gtl::InlinedVector<NativeT, 1> input(spec.input_dim0);
+    absl::InlinedVector<NativeT, 1> input(spec.input_dim0);
     std::iota(input.begin(), input.end(), NativeT());
     auto literal = LiteralUtil::CreateR1<NativeT>(input);
 
@@ -205,7 +206,7 @@ class SliceR1Test : public ClientLibraryTestBase,
           {spec.slice_stride});
 
     // Ditto.
-    tensorflow::gtl::InlinedVector<NativeT, 1> expected;
+    absl::InlinedVector<NativeT, 1> expected;
     for (int i = spec.slice_start; i < spec.slice_limit;
          i += spec.slice_stride) {
       expected.push_back(i);
@@ -448,13 +449,11 @@ struct R4Spec {
 
 string R4SpecToString(const ::testing::TestParamInfo<R4Spec>& data) {
   const R4Spec& spec = data.param;
-  return tensorflow::strings::StrCat(              //
-      "input_", Join(spec.input_dims, "x"),        //
-      "__layout_", Join(spec.input_layout, ""),    //
-      "__starts_", Join(spec.slice_starts, "x"),   //
-      "__limits_", Join(spec.slice_limits, "x"),   //
-      "__strides_", Join(spec.slice_strides, "x")  //
-  );
+  return absl::StrCat("input_", absl::StrJoin(spec.input_dims, "x"),
+                      "__layout_", absl::StrJoin(spec.input_layout, ""),
+                      "__starts_", absl::StrJoin(spec.slice_starts, "x"),
+                      "__limits_", absl::StrJoin(spec.slice_limits, "x"),
+                      "__strides_", absl::StrJoin(spec.slice_strides, "x"));
 }
 
 class SliceR4Test : public ClientLibraryTestBase,
diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc
index be35ec6c6e..a9874a9186 100644
--- a/tensorflow/compiler/xla/tests/test_macros.cc
+++ b/tensorflow/compiler/xla/tests/test_macros.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include <string>
 #include <unordered_map>
 
-#include "tensorflow/core/lib/strings/str_util.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/regexp.h"
 
@@ -44,7 +46,7 @@ ManifestT ReadManifest() {
   string contents((std::istreambuf_iterator<char>(file_stream)),
                   std::istreambuf_iterator<char>());
 
-  std::vector<string> lines = tensorflow::str_util::Split(contents, '\n');
+  std::vector<string> lines = absl::StrSplit(contents, '\n');
   for (string& line : lines) {
     auto comment = line.find("//");
     if (comment != string::npos) {
@@ -53,8 +55,8 @@ ManifestT ReadManifest() {
     if (line.empty()) {
       continue;
     }
-    tensorflow::str_util::StripTrailingWhitespace(&line);
-    std::vector<string> pieces = tensorflow::str_util::Split(line, ' ');
+    absl::StripTrailingAsciiWhitespace(&line);
+    std::vector<string> pieces = absl::StrSplit(line, ' ');
     CHECK_GE(pieces.size(), 1);
     auto& platforms = manifest[pieces[0]];
     for (int64 i = 1; i < pieces.size(); ++i) {
@@ -73,8 +75,7 @@ string PrependDisabledIfIndicated(const string& test_case_name,
   // First try full match: test_case_name.test_name
   // If that fails, try to find just the test_case_name; this would disable all
   // tests in the test case.
-  auto it = manifest.find(
-      tensorflow::strings::StrCat(test_case_name, ".", test_name));
+  auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name));
   if (it == manifest.end()) {
     it = manifest.find(test_case_name);
     if (it == manifest.end()) {
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 2647937013..21c58e075e 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include <cmath>
+
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
 
 namespace xla {
 
@@ -26,89 +29,101 @@ namespace {
 
 template <typename FloatT, typename GeneratorT>
 void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
-                                             std::minstd_rand0* engine) {
+                                             std::minstd_rand0* engine,
+                                             bool no_duplicates) {
   CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
-  // Create uniform numbers between 1 and 1.125 to avoid creating denormal
-  // numbers.
-  std::uniform_real_distribution<GeneratorT> generator(1.0f, 1.125f);
-  const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
-  TF_CHECK_OK(literal->Populate<FloatT>(
-      [&](tensorflow::gtl::ArraySlice<int64> indices) {
-        // Generate a random uniform number from -0.0625 and 0.0625 and bias it
-        // with a position dependent number with mean 0.037109375. These number
-        // should allow for long chains of accumulation without being too close
-        // to zero or too large to accumulate all numbers accurately. Only do
-        // this for large literals where the number of elements is much greater
-        // than 47 otherwise only negative values are produced.
-        //
-        // The value is positionally biased using a product of the indices. Add
-        // one to each index value to avoid collapsing to zero if any of the
-        // indices are zero.
-        int64 index_product = 1;
-        for (int64 i : indices) {
-          index_product *= (1 + i);
-        }
-        const int64 negative_bias = should_index_bias ? 47 : 0;
-        FloatT index_bias =
-            static_cast<FloatT>(index_product % 113 - negative_bias) /
-            static_cast<FloatT>(256.0f);
-        return static_cast<FloatT>(generator(*engine) - 1.0625f) + index_bias;
-      }));
+  if (no_duplicates) {
+    // Duplicates may be generated if the number of elements in the literal
+    // exceeds the number of positive values supported by the type.
+    FloatT next_value = std::numeric_limits<FloatT>::min();
+    for (FloatT& value : literal->data<FloatT>()) {
+      value = next_value;
+      next_value =
+          std::nextafter(next_value, std::numeric_limits<FloatT>::max());
+    }
+    std::shuffle(literal->data<FloatT>().begin(), literal->data<FloatT>().end(),
+                 *engine);
+  } else {
+    std::uniform_real_distribution<GeneratorT> generator(-0.1f, 0.2f);
+    for (FloatT& value : literal->data<FloatT>()) {
+      value = static_cast<FloatT>(generator(*engine));
+    }
+  }
 }
 
 template <typename FloatT>
 void PopulateWithRandomFloatingPointData(Literal* literal,
-                                         std::minstd_rand0* engine) {
+                                         std::minstd_rand0* engine,
+                                         bool no_duplicates) {
   CHECK(engine != nullptr);
-  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine);
+  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine,
+                                                          no_duplicates);
 }
 
 template <>
 void PopulateWithRandomFloatingPointData<half>(Literal* literal,
-                                               std::minstd_rand0* engine) {
+                                               std::minstd_rand0* engine,
+                                               bool no_duplicates) {
+  // no_duplicates is ignored for half types. Unique values can only be
+  // generated for arrays with fewer than ~2**16 elements and no_duplicates is
+  // best-effort anyway.
   CHECK(engine != nullptr);
-  PopulateWithRandomFloatingPointDataImpl<half, float>(literal, engine);
+  std::uniform_real_distribution<float> generator(-0.1f, 0.2f);
+  for (half& value : literal->data<half>()) {
+    value = static_cast<half>(generator(*engine));
+  }
 }
 
-// The standard library does not have a case for bfloat16, unsurprisingly, so we
-// handle that one specially.
 template <>
 void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
-                                                   std::minstd_rand0* engine) {
+                                                   std::minstd_rand0* engine,
+                                                   bool no_duplicates) {
+  // no_duplicates is ignored for bfloat types. Unique values can only be
+  // generated for arrays with fewer than ~2**16 elements and no_duplicates is
+  // best-effort anyway.
   CHECK(engine != nullptr);
-  CHECK_EQ(literal->shape().element_type(), BF16);
-  std::uniform_real_distribution<float> generator(-0.9f, 1.0f);
-  TF_CHECK_OK(literal->Populate<bfloat16>(
-      [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return static_cast<bfloat16>(generator(*engine));
-      }));
+  std::uniform_real_distribution<float> generator(-0.1f, 0.2f);
+  for (bfloat16& value : literal->data<bfloat16>()) {
+    value = static_cast<bfloat16>(generator(*engine));
+  }
 }
 
 template <typename IntT>
-void PopulateWithRandomIntegralData(Literal* literal,
-                                    std::minstd_rand0* engine) {
+void PopulateWithRandomIntegralData(Literal* literal, std::minstd_rand0* engine,
+                                    bool no_duplicates) {
   CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<IntT>());
-  std::uniform_int_distribution<IntT> generator(
-      std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
-  TF_CHECK_OK(literal->Populate<IntT>(
-      [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return generator(*engine);
-      }));
+  if (no_duplicates && ShapeUtil::ElementsIn(literal->shape()) <
+                           std::numeric_limits<IntT>::max()) {
+    std::iota(literal->data<IntT>().begin(), literal->data<IntT>().end(), 0);
+    std::shuffle(literal->data<IntT>().begin(), literal->data<IntT>().end(),
+                 *engine);
+  } else {
+    std::uniform_int_distribution<IntT> generator(
+        std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
+    for (IntT& value : literal->data<IntT>()) {
+      value = generator(*engine);
+    }
+  }
 }
 
 // Similar to MakeFakeLiteral but takes a random number generator engine to
-// enable reusing the engine across randomly generated literals.
+// enable reusing the engine across randomly generated literals. 'no_duplicates'
+// indicates that there should be no duplicate values in each generated
+// array. This is uniqueness is best-effort only. Some types (half and bfloat16)
+// are not supported and uniqueness cannot be guaranteed if the number of
+// elements exceeds the number of different values supported by the type.
 StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
-    const Shape& shape, std::minstd_rand0* engine) {
+    const Shape& shape, std::minstd_rand0* engine, bool no_duplicates) {
   if (ShapeUtil::IsTuple(shape)) {
     std::vector<std::unique_ptr<Literal>> elements;
     for (const Shape& element_shape : shape.tuple_shapes()) {
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
-                          MakeFakeLiteralInternal(element_shape, engine));
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<Literal> element,
+          MakeFakeLiteralInternal(element_shape, engine, no_duplicates));
       elements.push_back(std::move(element));
     }
     return LiteralUtil::MakeTupleOwned(std::move(elements));
@@ -116,43 +131,55 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
   if (engine == nullptr) {
     return Literal::CreateFromShape(shape);
   }
-  auto literal = MakeUnique<Literal>(shape);
+  auto literal = absl::make_unique<Literal>(shape);
   switch (shape.element_type()) {
     case BF16:
-      PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
+      PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine,
+                                                    no_duplicates);
       break;
     case F16:
-      PopulateWithRandomFloatingPointData<half>(literal.get(), engine);
+      PopulateWithRandomFloatingPointData<half>(literal.get(), engine,
+                                                no_duplicates);
       break;
     case F32:
-      PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
+      PopulateWithRandomFloatingPointData<float>(literal.get(), engine,
+                                                 no_duplicates);
       break;
     case F64:
-      PopulateWithRandomFloatingPointData<double>(literal.get(), engine);
+      PopulateWithRandomFloatingPointData<double>(literal.get(), engine,
+                                                  no_duplicates);
       break;
     case S8:
-      PopulateWithRandomIntegralData<int8>(literal.get(), engine);
+      PopulateWithRandomIntegralData<int8>(literal.get(), engine,
+                                           no_duplicates);
       break;
     case U8:
-      PopulateWithRandomIntegralData<uint8>(literal.get(), engine);
+      PopulateWithRandomIntegralData<uint8>(literal.get(), engine,
+                                            no_duplicates);
       break;
     case S16:
-      PopulateWithRandomIntegralData<int16>(literal.get(), engine);
+      PopulateWithRandomIntegralData<int16>(literal.get(), engine,
+                                            no_duplicates);
       break;
     case U16:
-      PopulateWithRandomIntegralData<uint16>(literal.get(), engine);
+      PopulateWithRandomIntegralData<uint16>(literal.get(), engine,
+                                             no_duplicates);
       break;
     case S32:
-      PopulateWithRandomIntegralData<int32>(literal.get(), engine);
+      PopulateWithRandomIntegralData<int32>(literal.get(), engine,
+                                            no_duplicates);
       break;
     case U32:
-      PopulateWithRandomIntegralData<uint32>(literal.get(), engine);
+      PopulateWithRandomIntegralData<uint32>(literal.get(), engine,
+                                             no_duplicates);
       break;
     case S64:
-      PopulateWithRandomIntegralData<int64>(literal.get(), engine);
+      PopulateWithRandomIntegralData<int64>(literal.get(), engine,
+                                            no_duplicates);
       break;
     case U64:
-      PopulateWithRandomIntegralData<uint64>(literal.get(), engine);
+      PopulateWithRandomIntegralData<uint64>(literal.get(), engine,
+                                             no_duplicates);
       break;
     case PRED: {
       std::uniform_int_distribution<int> generator(0, 1);
@@ -208,16 +235,12 @@ bool NeedsInitValue(const HloUse& use) {
 
 // Generate random values that are constrained to the input_shape minus the
 // output_shape so as not to produce wrapping slices, for instance.
-std::unique_ptr<Literal> MakeRandomNonwrappingSliceIndex(
-    const Shape& input_shape, const Shape& slice_shape,
-    std::minstd_rand0* engine) {
-  const int64 rank = ShapeUtil::Rank(input_shape);
-  std::vector<int32> start_indices(rank);
+std::unique_ptr<Literal> MakeRandomIndex(
+    tensorflow::gtl::ArraySlice<int64> index_space, std::minstd_rand0* engine) {
+  std::vector<int32> start_indices(index_space.size());
   if (engine != nullptr) {
-    for (int i = 0; i < rank; ++i) {
-      const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
-                                ShapeUtil::GetDimension(slice_shape, i);
-      std::uniform_int_distribution<int32> generator(0, upper_bound);
+    for (int i = 0; i < index_space.size(); ++i) {
+      std::uniform_int_distribution<int32> generator(0, index_space[i]);
       start_indices[i] = generator(*engine);
     }
   }
@@ -254,6 +277,11 @@ std::vector<HloInstruction*> FindConstrainedUses(
         auto converted_uses = FindConstrainedUses(dataflow, *instruction);
         constrained_uses.insert(constrained_uses.end(), converted_uses.begin(),
                                 converted_uses.end());
+      } else if (opcode == HloOpcode::kSort &&
+                 instruction->operand_count() == 2 && op_num == 0) {
+        // Operand 0 of sort is the array of keys used for key/value
+        // (two-operand) kSort instructions.
+        constrained_uses.push_back(instruction);
       }
     }
   }
@@ -267,56 +295,66 @@ std::vector<HloInstruction*> FindConstrainedUses(
 StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
     const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
-  HloInstruction* needs_index = nullptr;
-  HloInstruction* needs_constant = nullptr;
+  std::vector<int64> index_space;
+  bool no_duplicates = false;
+  bool needs_constant = false;
   ConstantType constant_type = ConstantType::kUnknown;
   for (HloInstruction* use : constrained_uses) {
     switch (use->opcode()) {
       case HloOpcode::kDynamicSlice:
-      case HloOpcode::kDynamicUpdateSlice:
-        if (needs_index != nullptr) {
-          auto needs_index_shape = needs_index->shape();
-          auto use_shape = use->shape();
-          if (needs_index->opcode() == HloOpcode::kDynamicSlice) {
-            needs_index_shape = needs_index->operand(0)->shape();
-          }
-          if (use->opcode() == HloOpcode::kDynamicSlice) {
-            use_shape = use->operand(0)->shape();
+      case HloOpcode::kDynamicUpdateSlice: {
+        const Shape& indexed_shape = use->operand(0)->shape();
+        const Shape& slice_shape = use->opcode() == HloOpcode::kDynamicSlice
+                                       ? use->shape()
+                                       : use->operand(1)->shape();
+        const int64 rank = ShapeUtil::Rank(indexed_shape);
+        if (!index_space.empty()) {
+          TF_RET_CHECK(rank == index_space.size());
+          for (int64 i = 0; i < rank; ++i) {
+            index_space[i] = std::min(
+                index_space[i], ShapeUtil::GetDimension(indexed_shape, i) -
+                                    ShapeUtil::GetDimension(slice_shape, i));
           }
-          if (!ShapeUtil::Equal(needs_index_shape, use_shape)) {
-            return Unimplemented(
-                "Conflicting operand generation slice index constraints\n");
+        } else {
+          index_space.resize(rank);
+          for (int64 i = 0; i < rank; ++i) {
+            index_space[i] = ShapeUtil::GetDimension(indexed_shape, i) -
+                             ShapeUtil::GetDimension(slice_shape, i);
           }
         }
-        needs_index = use;
         break;
+      }
       case HloOpcode::kReduce:
       case HloOpcode::kReduceWindow:
-        needs_constant = use;
+        needs_constant = true;
         constant_type = GetInitValue(*use->to_apply());
         break;
 
       case HloOpcode::kSelectAndScatter:
-        needs_constant = use;
+        needs_constant = true;
         constant_type = GetInitValue(*use->scatter());
         break;
 
+      case HloOpcode::kSort:
+        no_duplicates = true;
+        break;
+
       default:
         return Unimplemented(
             "Constrained operand generation not implemented for %s.",
             use->ToString().c_str());
     }
   }
-  if (needs_index != nullptr && needs_constant != nullptr) {
-    return Unimplemented(
-        "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds "
-        "constant: %s\n",
-        needs_index->ToString().c_str(), needs_constant->ToString().c_str());
+  int constraint_count = 0;
+  constraint_count += no_duplicates ? 1 : 0;
+  constraint_count += !index_space.empty() ? 1 : 0;
+  constraint_count += needs_constant ? 1 : 0;
+  if (constraint_count > 1) {
+    return Unimplemented("Conflicting operand generation constraints.");
   }
-  if (needs_index != nullptr) {
-    return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
-                                           needs_index->shape(), engine);
-  } else if (needs_constant != nullptr) {
+  if (!index_space.empty()) {
+    return MakeRandomIndex(index_space, engine);
+  } else if (needs_constant) {
     switch (constant_type) {
       case ConstantType::kZero:
         return LiteralUtil::Zero(param.shape().element_type()).CloneToUnique();
@@ -325,10 +363,11 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
       case ConstantType::kUnknown:
         // We want the identity element for the computation, but we don't really
         // know what it is - so any value we generate will be just as wrong.
-        return MakeFakeLiteralInternal(param.shape(), engine);
+        return MakeFakeLiteralInternal(param.shape(), engine,
+                                       /*no_duplicates=*/false);
     }
   } else {
-    return MakeFakeLiteralInternal(param.shape(), engine);
+    return MakeFakeLiteralInternal(param.shape(), engine, no_duplicates);
   }
 }
 
@@ -345,25 +384,36 @@ StatusOr<std::unique_ptr<Literal>> MakeConstrainedArgument(
 
 StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape,
                                                    bool pseudo_random) {
-  auto engine = pseudo_random ? MakeUnique<std::minstd_rand0>() : nullptr;
-  return MakeFakeLiteralInternal(shape, engine.get());
+  auto engine =
+      pseudo_random ? absl::make_unique<std::minstd_rand0>() : nullptr;
+  return MakeFakeLiteralInternal(shape, engine.get(), /*no_duplicates=*/false);
 }
 
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
     HloModule* const module, bool pseudo_random) {
+  auto engine =
+      pseudo_random ? absl::make_unique<std::minstd_rand0>() : nullptr;
+  return MakeFakeArguments(module, engine.get());
+}
+
+StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
+    HloModule* const module, std::minstd_rand0* engine) {
   TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(*module));
   const auto params = module->entry_computation()->parameter_instructions();
-  auto engine = pseudo_random ? MakeUnique<std::minstd_rand0>() : nullptr;
   std::vector<std::unique_ptr<Literal>> arguments(params.size());
   for (int i = 0; i < params.size(); ++i) {
-    TF_ASSIGN_OR_RETURN(arguments[i], MakeConstrainedArgument(
-                                          *dataflow, *params[i], engine.get()));
+    arguments[i] =
+        MakeConstrainedArgument(*dataflow, *params[i], engine).ValueOrDie();
   }
   return std::move(arguments);
 }
 
-Status VerifyHloModule(HloModule* const module, bool allow_mixed_precision) {
-  return HloVerifier(allow_mixed_precision).Run(module).status();
+Status VerifyHloModule(HloModule* const module, bool layout_sensitive,
+                       bool allow_mixed_precision) {
+  return HloVerifier(/*layout_sensitive=*/layout_sensitive,
+                     /*allow_mixed_precision=*/allow_mixed_precision)
+      .Run(module)
+      .status();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index e59f215a9a..277d53d423 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -20,9 +20,9 @@ limitations under the License.
 #include <memory>
 #include <random>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -63,8 +63,17 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape,
 // Generates a vector of arguments containing fake data. The number, shape and
 // layout of the arguments is appropriate for given HLO module.
 //
-// Will handle special cases such as making sure that indices used for dynamic
-// slices are bounded, reduces that call adds use 0 as an init value, etc.
+// A best-effort attempt is made to generate the data in a way which produce
+// stable computation results across platforms. Specifically:
+//
+//  (1) Init values of reductions should be the identity of the reduction
+//  computation.
+//
+//  (2) Indices of dynamic slices and update slices should be in bounds.
+//
+//  (3) Keys of key/value sorts should contain no duplicates.
+//
+// These constraints are best-effort only.
 //
 // If pseudo_random is true, the generated numbers will be generated
 // deterministically in a pseudo random way unless the values are constrated to
@@ -78,10 +87,16 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape,
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
     HloModule* const module, bool pseudo_random = true);
 
+// Overload which accepts a random number generator. This enables generation of
+// different random values with sequential calls to MakeFakeArguments by reusing
+// the same generator.
+StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
+    HloModule* const module, std::minstd_rand0* engine);
+
 // Check that a given module satisfies various constraints before trying to
 // execute it.
-Status VerifyHloModule(HloModule* const module,
-                       bool allow_mixed_precision = false);
+Status VerifyHloModule(HloModule* const module, bool layout_sensitive,
+                       bool allow_mixed_precision);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index a2f0338e25..322c8ef090 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
@@ -72,5 +73,106 @@ XLA_TEST_F(TestUtilsTest, Token) {
   TF_ASSERT_OK(MakeFakeArguments(module.get()).status());
 }
 
+XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicSlices) {
+  auto module = ParseHloString(
+                    R"(HloModule index_space_module
+
+    ENTRY IndexSpace {
+      index_param = s32[3]{0} parameter(0)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
+      dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param), dynamic_slice_sizes={1,2,3}
+      ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param), dynamic_slice_sizes={3,2,2}
+    })")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::unique_ptr<Literal>> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 3);
+  const Literal& index_arg = *args[0];
+
+  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+
+  EXPECT_GE(index_arg.Get<int32>({1}), 0);
+  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+
+  EXPECT_GE(index_arg.Get<int32>({2}), 0);
+  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+}
+
+XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
+  auto module = ParseHloString(
+                    R"(HloModule index_space_module
+
+    ENTRY IndexSpace {
+      index_param = s32[3]{0} parameter(0)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
+      update_param.1 = f32[1,2,3]{0,1,2} parameter(3)
+      update_param.2 = f32[3,2,2]{0,1,2} parameter(4)
+
+      dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param)
+      ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param)
+    })")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::unique_ptr<Literal>> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 5);
+  const Literal& index_arg = *args[0];
+
+  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+
+  EXPECT_GE(index_arg.Get<int32>({1}), 0);
+  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+
+  EXPECT_GE(index_arg.Get<int32>({2}), 0);
+  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+}
+
+XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) {
+  // Inputs which are sort keys in key/value sorts should have no duplicates.
+  auto module = ParseHloString(R"(
+HloModule sort.148.1589
+
+ENTRY %sort.148.1589 (parameter.0: f32[1048576], parameter.1: s32[1048576]) -> (f32[1048576], s32[1048576]) {
+  %parameter.0 = f32[1048576]{0} parameter(0)
+  %parameter.1 = s32[1048576]{0} parameter(1)
+  ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+}
+)")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::unique_ptr<Literal>> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  const Literal& key_arg = *args[0];
+
+  tensorflow::gtl::FlatSet<uint32> key_set;
+  for (const float& value : key_arg.data<float>()) {
+    EXPECT_TRUE(key_set.insert(tensorflow::bit_cast<uint32>(value)).second);
+  }
+}
+
+XLA_TEST_F(TestUtilsTest, NoDuplicatesInt32) {
+  // Inputs which are sort keys in key/value sorts should have no duplicates.
+  auto module = ParseHloString(R"(
+HloModule sort.148.1589
+
+ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (s32[1048576], s32[1048576]) {
+  %parameter.0 = s32[1048576]{0} parameter(0)
+  %parameter.1 = s32[1048576]{0} parameter(1)
+  ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+}
+)")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<std::unique_ptr<Literal>> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  const Literal& key_arg = *args[0];
+
+  tensorflow::gtl::FlatSet<int32> key_set;
+  for (const int32& value : key_arg.data<int32>()) {
+    EXPECT_TRUE(key_set.insert(tensorflow::bit_cast<uint32>(value)).second);
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc
index 2bdbd08309..c7eb9e2dbe 100644
--- a/tensorflow/compiler/xla/tests/token_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc
@@ -15,11 +15,10 @@ limitations under the License.
 
 #include <array>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -67,7 +66,10 @@ XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(42)));
   module->AddEntryComputation(builder.Build());
 
-  Status status = HloVerifier().Run(module.get()).status();
+  Status status =
+      HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false)
+          .Run(module.get())
+          .status();
   ASSERT_IS_NOT_OK(status);
   EXPECT_THAT(
       status.error_message(),
@@ -84,7 +86,10 @@ XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) {
       "param"));
   module->AddEntryComputation(builder.Build());
 
-  Status status = HloVerifier().Run(module.get()).status();
+  Status status =
+      HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false)
+          .Run(module.get())
+          .status();
   ASSERT_IS_NOT_OK(status);
   EXPECT_THAT(
       status.error_message(),
@@ -101,7 +106,10 @@ XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(123)));
   module->AddEntryComputation(builder.Build());
 
-  Status status = HloVerifier().Run(module.get()).status();
+  Status status =
+      HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false)
+          .Run(module.get())
+          .status();
   ASSERT_IS_NOT_OK(status);
   EXPECT_THAT(status.error_message(),
               ::testing::HasSubstr(
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 97bbf80aff..c101cd2d20 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <initializer_list>
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -504,7 +505,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
       LiteralUtil::CreateR2<complex64>({{{111, 222}, {331, 442}},
                                         {{1011, 2022}, {3031, 4042}},
                                         {{10011, 20022}, {30031, 40042}}});
-  auto prod = MakeUnique<Literal>(sum->shape());
+  auto prod = absl::make_unique<Literal>(sum->shape());
   ASSERT_TRUE(prod->Populate<complex64>(
                       [&sum](tensorflow::gtl::ArraySlice<int64> indexes) {
                         return sum->Get<complex64>(indexes) *
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index 20ae68ab74..8f80a9f3e4 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -190,25 +190,6 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR1) {
   SignAbsTestHelper<complex64>();
 }
 
-XLA_TEST_F(UnaryOpTest, UnsignedAbsTestR1) {
-  XlaBuilder builder(TestName());
-  auto arg = ConstantR1<unsigned int>(
-      &builder, {2, 25, 0, 123, std::numeric_limits<unsigned int>::max()});
-  Abs(arg);
-
-  ComputeAndCompareR1<unsigned int>(
-      &builder, {2, 25, 0, 123, std::numeric_limits<unsigned int>::max()}, {});
-}
-
-XLA_TEST_F(UnaryOpTest, UnsignedSignTestR1) {
-  XlaBuilder builder(TestName());
-  auto arg = ConstantR1<unsigned int>(
-      &builder, {2, 25, 0, 123, std::numeric_limits<unsigned int>::max()});
-  Sign(arg);
-
-  ComputeAndCompareR1<unsigned int>(&builder, {1, 1, 0, 1, 1}, {});
-}
-
 XLA_TEST_F(UnaryOpTest, SignAbsTestR2) {
   XlaBuilder builder(TestName());
   auto arg = ConstantR2<float>(&builder, {{1.0, -2.0}, {-3.0, 4.0}});
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 11f3efb1f3..6a7ddd9b55 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -16,6 +16,10 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -29,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -81,8 +84,7 @@ struct ParsedProfileOutputLine {
 Status ParseOneProfileOutputLine(
     const string& line, bool expect_hlo,
     gtl::FlatMap<string, ParsedProfileOutputLine>* parsed_results,
-    tensorflow::gtl::ArraySlice<tensorflow::StringPiece> opcodes_to_ignore =
-        {}) {
+    tensorflow::gtl::ArraySlice<absl::string_view> opcodes_to_ignore = {}) {
   string separator = "[^:]*:: +";
   string match_percentage = R"(\d+\.\d*% +\d+Σ)";
   string match_cycles = R"((\d+) cycles +\( *()" + match_percentage + R"()\))";
@@ -99,7 +101,7 @@ Status ParseOneProfileOutputLine(
 
   string match_opcode =
       expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" : "(\\[total\\])";
-  string regexp_pattern = tensorflow::strings::StrCat(
+  string regexp_pattern = absl::StrCat(
       " +", match_cycles, separator, match_usecs, separator, match_flops,
       separator, match_trops, separator, match_bytes_per_sec, separator,
       match_bytes_per_cycle, separator, match_opcode);
@@ -116,7 +118,7 @@ Status ParseOneProfileOutputLine(
         ", Regexp: ", regexp_pattern);
   }
 
-  if (!c_linear_search(opcodes_to_ignore, parsed_line.opcode)) {
+  if (!absl::c_linear_search(opcodes_to_ignore, parsed_line.opcode)) {
     InsertOrDie(parsed_results, parsed_line.opcode, parsed_line);
   }
 
@@ -204,7 +206,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
                          rhs_shape);
 
   std::vector<string> profile_output_lines =
-      tensorflow::str_util::Split(profile_output, '\n');
+      absl::StrSplit(profile_output, '\n');
 
   gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
 
@@ -291,22 +293,20 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) {
                          matrix_shape);
 
   std::vector<string> profile_output_lines =
-      tensorflow::str_util::Split(profile_output, '\n');
+      absl::StrSplit(profile_output, '\n');
 
   auto while_body_profile_start =
-      c_find_if(profile_output_lines, [](tensorflow::StringPiece s) {
-        return tensorflow::str_util::StartsWith(s,
-                                                "Execution profile for body");
+      absl::c_find_if(profile_output_lines, [](absl::string_view s) {
+        return absl::StartsWith(s, "Execution profile for body");
       });
 
   ASSERT_NE(while_body_profile_start, profile_output_lines.cend());
 
-  auto while_body_profile_end =
-      std::find_if(while_body_profile_start, profile_output_lines.end(),
-                   [](tensorflow::StringPiece s) {
-                     return tensorflow::str_util::StartsWith(
-                         s, "********** microseconds report **********");
-                   });
+  auto while_body_profile_end = std::find_if(
+      while_body_profile_start, profile_output_lines.end(),
+      [](absl::string_view s) {
+        return absl::StartsWith(s, "********** microseconds report **********");
+      });
 
   // We emit a blank line before the "********** microseconds report **********"
   // line.
diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
index a075195618..15603619b6 100644
--- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
+++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -32,16 +32,14 @@ GTEST_API_ int main(int argc, char** argv) {
   // If the --benchmarks flag is passed in then only run the benchmarks, not the
   // tests.
   for (int i = 1; i < argc; i++) {
-    tensorflow::StringPiece arg(argv[i]);
-    if (arg == "--benchmarks" ||
-        tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
+    absl::string_view arg(argv[i]);
+    if (arg == "--benchmarks" || absl::StartsWith(arg, "--benchmarks=")) {
       const char* pattern = nullptr;
-      if (tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
+      if (absl::StartsWith(arg, "--benchmarks=")) {
         pattern = argv[i] + strlen("--benchmarks=");
       } else {
         // Handle flag of the form '--benchmarks foo' (no '=').
-        if (i + 1 >= argc ||
-            tensorflow::str_util::StartsWith(argv[i + 1], "--")) {
+        if (i + 1 >= argc || absl::StartsWith(argv[i + 1], "--")) {
           LOG(ERROR) << "--benchmarks flag requires an argument.";
           return 2;
         }
diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc
index 897123d760..9835e3d803 100644
--- a/tensorflow/compiler/xla/text_literal_reader.cc
+++ b/tensorflow/compiler/xla/text_literal_reader.cc
@@ -20,25 +20,28 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/strings/strip.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
 StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadPath(
-    tensorflow::StringPiece path) {
-  CHECK(!tensorflow::str_util::EndsWith(path, ".gz"))
+    absl::string_view path) {
+  CHECK(!absl::EndsWith(path, ".gz"))
       << "TextLiteralReader no longer supports reading .gz files";
   std::unique_ptr<tensorflow::RandomAccessFile> file;
   Status s =
@@ -54,33 +57,6 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadPath(
 TextLiteralReader::TextLiteralReader(tensorflow::RandomAccessFile* file)
     : file_(file) {}
 
-namespace {
-// This is an optimized version of tensorflow::str_util::Split which uses
-// StringPiece for the delimited strings and uses an out parameter for the
-// result to avoid vector creation/destruction.
-void SplitByDelimToStringPieces(tensorflow::StringPiece text, char delim,
-                                std::vector<tensorflow::StringPiece>* result) {
-  result->clear();
-
-  if (text.empty()) {
-    return;
-  }
-
-  // The following loop is a little strange: its bound is text.size() + 1
-  // instead of the more typical text.size().
-  // The final iteration of the loop (when i is equal to text.size()) handles
-  // the trailing token.
-  size_t token_start = 0;
-  for (size_t i = 0; i < text.size() + 1; i++) {
-    if (i == text.size() || text[i] == delim) {
-      tensorflow::StringPiece token(text.data() + token_start, i - token_start);
-      result->push_back(token);
-      token_start = i + 1;
-    }
-  }
-}
-}  // namespace
-
 StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
   tensorflow::io::RandomAccessInputStream stream(file_.get());
   tensorflow::io::BufferedInputStream buf(&stream, 65536);
@@ -90,11 +66,7 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
     return s;
   }
 
-  tensorflow::StringPiece sp(shape_string);
-  if (tensorflow::str_util::RemoveWhitespaceContext(&sp) > 0) {
-    string tmp = std::string(sp);
-    shape_string = tmp;
-  }
+  absl::StripAsciiWhitespace(&shape_string);
   TF_ASSIGN_OR_RETURN(Shape shape, ShapeUtil::ParseShapeString(shape_string));
   if (shape.element_type() != F32) {
     return Unimplemented(
@@ -102,38 +74,36 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
         ShapeUtil::HumanString(shape).c_str());
   }
 
-  auto result = MakeUnique<Literal>(shape);
+  auto result = absl::make_unique<Literal>(shape);
   const float fill = std::numeric_limits<float>::quiet_NaN();
   result->PopulateWithValue<float>(fill);
-  std::vector<tensorflow::StringPiece> pieces;
-  std::vector<tensorflow::StringPiece> coordinates;
+  std::vector<absl::string_view> pieces;
+  std::vector<absl::string_view> coordinates;
   std::vector<int64> coordinate_values;
   string line;
   while (buf.ReadLine(&line).ok()) {
-    SplitByDelimToStringPieces(line, ':', &pieces);
-    tensorflow::StringPiece coordinates_string = pieces[0];
-    tensorflow::StringPiece value_string = pieces[1];
-    tensorflow::str_util::RemoveWhitespaceContext(&coordinates_string);
-    tensorflow::str_util::RemoveWhitespaceContext(&value_string);
-    if (!tensorflow::str_util::ConsumePrefix(&coordinates_string, "(")) {
+    pieces = absl::StrSplit(line, ':');
+    absl::string_view coordinates_string =
+        absl::StripAsciiWhitespace(pieces[0]);
+    absl::string_view value_string = absl::StripAsciiWhitespace(pieces[1]);
+    if (!absl::ConsumePrefix(&coordinates_string, "(")) {
       return InvalidArgument(
           "expected '(' at the beginning of coordinates: \"%s\"", line.c_str());
     }
-    if (!tensorflow::str_util::ConsumeSuffix(&coordinates_string, ")")) {
+    if (!absl::ConsumeSuffix(&coordinates_string, ")")) {
       return InvalidArgument("expected ')' at the end of coordinates: \"%s\"",
                              line.c_str());
     }
     float value;
-    if (!tensorflow::strings::safe_strtof(std::string(value_string).c_str(),
-                                          &value)) {
+    if (!absl::SimpleAtof(absl::string_view(value_string), &value)) {
       return InvalidArgument("could not parse value as float: \"%s\"",
-                             std::string(value_string).c_str());
+                             string(value_string).c_str());
     }
-    SplitByDelimToStringPieces(coordinates_string, ',', &coordinates);
+    coordinates = absl::StrSplit(coordinates_string, ',');
     coordinate_values.clear();
-    for (tensorflow::StringPiece piece : coordinates) {
+    for (absl::string_view piece : coordinates) {
       int64 coordinate_value;
-      if (!tensorflow::strings::safe_strto64(piece, &coordinate_value)) {
+      if (!absl::SimpleAtoi(piece, &coordinate_value)) {
         return InvalidArgument(
             "could not parse coordinate member as int64: \"%s\"",
             std::string(piece).c_str());
diff --git a/tensorflow/compiler/xla/text_literal_reader.h b/tensorflow/compiler/xla/text_literal_reader.h
index 708e8c80d8..b265640802 100644
--- a/tensorflow/compiler/xla/text_literal_reader.h
+++ b/tensorflow/compiler/xla/text_literal_reader.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 
@@ -41,8 +41,7 @@ class TextLiteralReader {
  public:
   // See class comment -- reads a file in its entirety (there must be only one
   // literal in the text file path provided).
-  static StatusOr<std::unique_ptr<Literal>> ReadPath(
-      tensorflow::StringPiece path);
+  static StatusOr<std::unique_ptr<Literal>> ReadPath(absl::string_view path);
 
  private:
   // Ownership of file is transferred.
diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc
index 24e0784741..00147015a6 100644
--- a/tensorflow/compiler/xla/text_literal_writer.cc
+++ b/tensorflow/compiler/xla/text_literal_writer.cc
@@ -17,23 +17,23 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-/* static */ Status TextLiteralWriter::WriteToPath(
-    const Literal& literal, tensorflow::StringPiece path) {
+/* static */ Status TextLiteralWriter::WriteToPath(const Literal& literal,
+                                                   absl::string_view path) {
   std::unique_ptr<tensorflow::WritableFile> f;
-  auto s = tensorflow::Env::Default()->NewWritableFile(std::string(path), &f);
+  auto s = tensorflow::Env::Default()->NewWritableFile(string(path), &f);
   if (!s.ok()) {
     return s;
   }
@@ -51,11 +51,10 @@ namespace xla {
         if (!status.ok()) {
           return;
         }
-        string coordinates = tensorflow::strings::StrCat(
-            "(", tensorflow::str_util::Join(indices, ", "), ")");
+        string coordinates =
+            absl::StrCat("(", absl::StrJoin(indices, ", "), ")");
 
-        status = f_ptr->Append(
-            tensorflow::strings::StrCat(coordinates, ": ", value, "\n"));
+        status = f_ptr->Append(absl::StrCat(coordinates, ": ", value, "\n"));
       });
   auto ignored = f->Close();
   return status;
diff --git a/tensorflow/compiler/xla/text_literal_writer.h b/tensorflow/compiler/xla/text_literal_writer.h
index 159ac1b7e1..34de8572d6 100644
--- a/tensorflow/compiler/xla/text_literal_writer.h
+++ b/tensorflow/compiler/xla/text_literal_writer.h
@@ -16,11 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_TEXT_LITERAL_WRITER_H_
 #define TENSORFLOW_COMPILER_XLA_TEXT_LITERAL_WRITER_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace xla {
@@ -37,8 +37,7 @@ namespace xla {
 // This should be readable by xla::TextLiteralReader.
 class TextLiteralWriter {
  public:
-  static Status WriteToPath(const Literal& literal,
-                            tensorflow::StringPiece path);
+  static Status WriteToPath(const Literal& literal, absl::string_view path);
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(TextLiteralWriter);
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 40d28a57bf..1e45588148 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -24,6 +24,7 @@ tf_cc_binary(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -191,6 +192,7 @@ tf_cc_binary(
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:interpreter_plugin",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index f0af0580c1..7aedd1da98 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -30,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
@@ -44,10 +44,9 @@ class OperationDumper : public DfsHloVisitorWithDefault {
   explicit OperationDumper(const string& path) : path_(path) {}
 
   Status DefaultAction(HloInstruction* hlo) override {
-    string params = tensorflow::str_util::Join(
+    string params = absl::StrJoin(
         hlo->operands(), ", ", [](string* out, const HloInstruction* operand) {
-          tensorflow::strings::StrAppend(
-              out, ShapeUtil::HumanString(operand->shape()));
+          absl::StrAppend(out, ShapeUtil::HumanString(operand->shape()));
         });
     // Spit `op_name(params...) -> result_type :: path` to stdout.
     std::cout << tensorflow::strings::Printf(
diff --git a/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc b/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc
index eb7bff053b..75b63c3b84 100644
--- a/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc
+++ b/tensorflow/compiler/xla/tools/hex_floats_to_packed_literal.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
 #include "tensorflow/core/platform/env.h"
@@ -67,7 +67,7 @@ int main(int argc, char** argv) {
     floats.push_back(value);
   }
 
-  tensorflow::StringPiece content(
+  tensorflow::StringPiece content(  // non-absl ok
       tensorflow::bit_cast<const char*>(floats.data()),
       floats.size() * sizeof(float));
   TF_CHECK_OK(tensorflow::WriteStringToFile(tensorflow::Env::Default(),
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index b4774233e5..311a1bee8d 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -160,7 +160,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
   // concurrent infeed occur via the fake_infeed_shape, or when
   // --generate_fake_infeed is passed and there exists an infeed operation in
   // the HloSnapshot.
-  tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
+  absl::optional<tensorflow::thread::ThreadPool> pool;
   std::unique_ptr<Literal> data;
   if (provide_infeed) {
     data = std::move(MakeFakeLiteral(infeed_shape)).ValueOrDie();
@@ -196,7 +196,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
   StreamExecutorMemoryAllocator allocator(
       client->platform(),
       {client->platform()->ExecutorForDevice(0).ValueOrDie()});
-  tensorflow::gtl::optional<ScopedShapedBuffer> result;
+  absl::optional<ScopedShapedBuffer> result;
   for (int i = 0; i < opts.num_runs; ++i) {
     // If xla_hlo_profile is enabled, print a noisy message before the last run,
     // making it easier to separate this profile from the others in the logspam.
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index e43498e381..85f05b7b8d 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -18,11 +18,13 @@ limitations under the License.
 #include <stdarg.h>
 #include <numeric>
 
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -54,16 +56,16 @@ ScopedLoggingTimer::~ScopedLoggingTimer() {
   }
 }
 
-Status AddStatus(Status prior, tensorflow::StringPiece context) {
+Status AddStatus(Status prior, absl::string_view context) {
   CHECK(!prior.ok());
-  return Status{prior.code(), tensorflow::strings::StrCat(
-                                  context, ": ", prior.error_message())};
+  return Status{prior.code(),
+                absl::StrCat(context, ": ", prior.error_message())};
 }
 
-Status AppendStatus(Status prior, tensorflow::StringPiece context) {
+Status AppendStatus(Status prior, absl::string_view context) {
   CHECK(!prior.ok());
-  return Status{prior.code(), tensorflow::strings::StrCat(prior.error_message(),
-                                                          ": ", context)};
+  return Status{prior.code(),
+                absl::StrCat(prior.error_message(), ": ", context)};
 }
 
 // Implementation note: we can't common these out (without using macros) because
@@ -146,16 +148,13 @@ Status Unavailable(const char* format, ...) {
   return WithLogBacktrace(tensorflow::errors::Unavailable(message));
 }
 
-string Reindent(tensorflow::StringPiece original,
-                const tensorflow::StringPiece indentation) {
-  std::vector<string> pieces = tensorflow::str_util::Split(
-      tensorflow::StringPiece(original.data(), original.size()), '\n');
-  return tensorflow::str_util::Join(
-      pieces, "\n", [indentation](string* out, string s) {
-        tensorflow::StringPiece piece(s);
-        tensorflow::str_util::RemoveWhitespaceContext(&piece);
-        tensorflow::strings::StrAppend(out, indentation, piece);
-      });
+string Reindent(absl::string_view original,
+                const absl::string_view indentation) {
+  std::vector<string> pieces =
+      absl::StrSplit(absl::string_view(original.data(), original.size()), '\n');
+  return absl::StrJoin(pieces, "\n", [indentation](string* out, string s) {
+    absl::StrAppend(out, indentation, absl::StripAsciiWhitespace(s));
+  });
 }
 
 bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank) {
@@ -234,20 +233,20 @@ bool HasInteriorPadding(const PaddingConfig& config) {
 
 namespace {
 string HumanReadableNumOps(double flops, double nanoseconds,
-                           tensorflow::StringPiece op_prefix) {
+                           absl::string_view op_prefix) {
   if (nanoseconds == 0) {
-    return tensorflow::strings::StrCat("NaN ", op_prefix, "OP/s");
+    return absl::StrCat("NaN ", op_prefix, "OP/s");
   }
   double nano_flops = flops / nanoseconds;
   string throughput = tensorflow::strings::HumanReadableNum(
       static_cast<int64>(nano_flops * 1e9));
-  tensorflow::StringPiece sp(throughput);
+  absl::string_view sp(throughput);
   // Use the more common "G(FLOPS)", rather than "B(FLOPS)"
-  if (tensorflow::str_util::EndsWith(sp, "B") ||  // Ends in 'B', ignoring case
-      tensorflow::str_util::EndsWith(sp, "b")) {
+  if (absl::EndsWith(sp, "B") ||  // Ends in 'B', ignoring case
+      absl::EndsWith(sp, "b")) {
     *throughput.rbegin() = 'G';
   }
-  throughput += tensorflow::strings::StrCat(op_prefix, "OP/s");
+  throughput += absl::StrCat(op_prefix, "OP/s");
   return throughput;
 }
 }  // namespace
@@ -260,8 +259,7 @@ string HumanReadableNumTranscendentalOps(double trops, double nanoseconds) {
   return HumanReadableNumOps(trops, nanoseconds, "TR");
 }
 
-void LogLines(int sev, tensorflow::StringPiece text, const char* fname,
-              int lineno) {
+void LogLines(int sev, absl::string_view text, const char* fname, int lineno) {
   const int orig_sev = sev;
   if (sev == tensorflow::FATAL) {
     sev = tensorflow::ERROR;
@@ -275,7 +273,7 @@ void LogLines(int sev, tensorflow::StringPiece text, const char* fname,
   size_t cur = 0;
   while (cur < text.size()) {
     size_t eol = text.find('\n', cur);
-    if (eol == tensorflow::StringPiece::npos) {
+    if (eol == absl::string_view::npos) {
       eol = text.size();
     }
     auto msg = text.substr(cur, eol - cur);
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 5ae099a462..671ef17f36 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -24,17 +24,18 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -54,7 +55,7 @@ Status WithLogBacktrace(const Status& status);
 // the InlinedVector will just behave like an std::vector<> and allocate the
 // memory to store its values.
 static constexpr int kInlineRank = 8;
-using DimensionVector = tensorflow::gtl::InlinedVector<int64, kInlineRank>;
+using DimensionVector = absl::InlinedVector<int64, kInlineRank>;
 
 // RAII timer that logs with a given label the wall clock time duration in human
 // readable form. This differs from base's ElapsedTimer primarily in that it
@@ -201,8 +202,8 @@ void StridedCopy(tensorflow::gtl::MutableArraySlice<D> dest, int64 dest_base,
 // Adds some context information to the error message in a
 // Status.  This is useful as Statuses are
 // propagated upwards.
-Status AddStatus(Status prior, tensorflow::StringPiece context);
-Status AppendStatus(Status prior, tensorflow::StringPiece context);
+Status AddStatus(Status prior, absl::string_view context);
+Status AppendStatus(Status prior, absl::string_view context);
 
 // Status error shorthands -- printfs the arguments to be
 // used as an error message and returns a status in the canonical
@@ -221,26 +222,26 @@ Status InvalidArgumentV(const char* format, va_list args);
 
 template <typename... Args>
 Status InvalidArgumentStrCat(Args&&... concat) {
-  return InvalidArgument(
-      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+  return InvalidArgument("%s",
+                         absl::StrCat(std::forward<Args>(concat)...).c_str());
 }
 
 template <typename... Args>
 Status UnimplementedStrCat(Args&&... concat) {
-  return Unimplemented(
-      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+  return Unimplemented("%s",
+                       absl::StrCat(std::forward<Args>(concat)...).c_str());
 }
 
 template <typename... Args>
 Status InternalErrorStrCat(Args&&... concat) {
-  return InternalError(
-      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+  return InternalError("%s",
+                       absl::StrCat(std::forward<Args>(concat)...).c_str());
 }
 
 template <typename... Args>
 Status ResourceExhaustedStrCat(Args&&... concat) {
-  return ResourceExhausted(
-      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+  return ResourceExhausted("%s",
+                           absl::StrCat(std::forward<Args>(concat)...).c_str());
 }
 
 // Splits the lines of the original, replaces leading whitespace with the prefix
@@ -249,8 +250,7 @@ Status ResourceExhaustedStrCat(Args&&... concat) {
 //
 // Note: even different amounts of leading whitespace on different lines will be
 // uniformly replaced with "indentation".
-string Reindent(tensorflow::StringPiece original,
-                tensorflow::StringPiece indentation);
+string Reindent(absl::string_view original, absl::string_view indentation);
 
 // Checks whether permutation is a permutation of the [0, rank) integer range.
 bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
@@ -312,7 +312,7 @@ string CommaSeparatedString(const Container& c, const char* prefix = "",
   string comma_separated = prefix;
   const char* separator = "";
   for (const auto& entry : c) {
-    tensorflow::strings::StrAppend(&comma_separated, separator, entry);
+    absl::StrAppend(&comma_separated, separator, entry);
     separator = ", ";
   }
   comma_separated += suffix;
@@ -394,8 +394,7 @@ string HumanReadableNumTranscendentalOps(double trops, double nanoseconds);
 
 // Split the text into multiple lines and log each line with the given
 // severity, filename, and line number.
-void LogLines(int sev, tensorflow::StringPiece text, const char* fname,
-              int lineno);
+void LogLines(int sev, absl::string_view text, const char* fname, int lineno);
 
 template <typename T>
 inline bool IsPowerOfTwo(T x) {
@@ -434,122 +433,15 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 // Removes illegal characters from filenames.
 string SanitizeFileName(string file_name);
 
-template <typename Container, typename Predicate>
-bool c_all_of(const Container& container, Predicate&& predicate) {
-  return std::all_of(std::begin(container), std::end(container),
-                     std::forward<Predicate>(predicate));
-}
-
-template <typename Container, typename Predicate>
-bool c_any_of(const Container& container, Predicate&& predicate) {
-  return std::any_of(std::begin(container), std::end(container),
-                     std::forward<Predicate>(predicate));
-}
-
-template <typename InputContainer, typename OutputIterator,
-          typename UnaryOperation>
-OutputIterator c_transform(const InputContainer& input_container,
-                           OutputIterator output_iterator,
-                           UnaryOperation&& unary_op) {
-  return std::transform(std::begin(input_container), std::end(input_container),
-                        output_iterator,
-                        std::forward<UnaryOperation>(unary_op));
-}
-
-template <class InputContainer, class OutputIterator, class UnaryPredicate>
-OutputIterator c_copy_if(const InputContainer& input_container,
-                         OutputIterator output_iterator,
-                         UnaryPredicate&& predicate) {
-  return std::copy_if(std::begin(input_container), std::end(input_container),
-                      output_iterator, std::forward<UnaryPredicate>(predicate));
-}
-
-template <class InputContainer, class OutputIterator>
-OutputIterator c_copy(const InputContainer& input_container,
-                      OutputIterator output_iterator) {
-  return std::copy(std::begin(input_container), std::end(input_container),
-                   output_iterator);
-}
-
-template <class InputContainer>
-void c_sort(InputContainer& input_container) {
-  std::sort(std::begin(input_container), std::end(input_container));
-}
-
-template <class InputContainer, class Comparator>
-void c_sort(InputContainer& input_container, Comparator&& comparator) {
-  std::sort(std::begin(input_container), std::end(input_container),
-            std::forward<Comparator>(comparator));
-}
-
-template <typename Sequence, typename T>
-bool c_binary_search(const Sequence& sequence, T&& value) {
-  return std::binary_search(std::begin(sequence), std::end(sequence),
-                            std::forward<T>(value));
-}
-
-template <typename C>
-bool c_is_sorted(const C& c) {
-  return std::is_sorted(std::begin(c), std::end(c));
-}
-
-template <typename C, typename Compare>
-bool c_is_sorted(const C& c, Compare&& comp) {
-  return std::is_sorted(std::begin(c), std::end(c),
-                        std::forward<Compare>(comp));
-}
-
-template <typename C>
-auto c_adjacent_find(C& c) -> decltype(std::begin(c)) {
-  return std::adjacent_find(std::begin(c), std::end(c));
-}
-
-template <typename C, typename Pred>
-auto c_find_if(C& c, Pred&& pred) -> decltype(std::begin(c)) {
-  return std::find_if(std::begin(c), std::end(c), std::forward<Pred>(pred));
-}
-
-template <typename C, typename Value>
-auto c_find(C& c, Value&& value) -> decltype(std::begin(c)) {
-  return std::find(std::begin(c), std::end(c), std::forward<Value>(value));
-}
-
-template <typename Sequence>
-void c_reverse(Sequence& sequence) {
-  std::reverse(std::begin(sequence), std::end(sequence));
-}
-
-template <typename Sequence, typename T, typename BinaryOp>
-typename std::decay<T>::type c_accumulate(const Sequence& sequence, T&& init,
-                                          BinaryOp&& binary_op) {
-  return std::accumulate(std::begin(sequence), std::end(sequence),
-                         std::forward<T>(init),
-                         std::forward<BinaryOp>(binary_op));
-}
-
-template <typename C, typename Pred>
-typename std::iterator_traits<
-    decltype(std::begin(std::declval<C>()))>::difference_type
-c_count_if(const C& c, Pred&& pred) {
-  return std::count_if(std::begin(c), std::end(c), std::forward<Pred>(pred));
-}
-
-// Determines whether `value` is present in `c`.
-template <typename C, typename T>
-bool c_linear_search(const C& c, T&& value) {
-  auto last = std::end(c);
-  return std::find(std::begin(c), last, std::forward<T>(value)) != last;
-}
-
 template <typename C, typename Value>
 int64 FindIndex(const C& c, Value&& value) {
-  auto it = c_find(c, std::forward<Value>(value));
+  auto it = absl::c_find(c, std::forward<Value>(value));
   return std::distance(c.begin(), it);
 }
 
 template <typename T>
 bool ArrayContains(tensorflow::gtl::ArraySlice<T> c, const T& value) {
-  return c_find(c, value) != c.end();
+  return absl::c_find(c, value) != c.end();
 }
 
 template <typename C, typename Value>
@@ -567,9 +459,9 @@ std::vector<T> ArraySliceToVector(tensorflow::gtl::ArraySlice<T> slice) {
   return std::vector<T>(slice.begin(), slice.end());
 }
 
-template <typename T, int N>
+template <typename T, size_t N>
 std::vector<T> InlinedVectorToVector(
-    const tensorflow::gtl::InlinedVector<T, N>& inlined_vector) {
+    const absl::InlinedVector<T, N>& inlined_vector) {
   return std::vector<T>(inlined_vector.begin(), inlined_vector.end());
 }
 
@@ -584,8 +476,8 @@ bool IsInt32(T x) {
 
 template <typename T>
 Status EraseElementFromVector(std::vector<T>* container, const T& value) {
-  // c_find returns a const_iterator which does not seem to work on gcc 4.8.4,
-  // and this breaks the ubuntu/xla_gpu build bot.
+  // absl::c_find returns a const_iterator which does not seem to work on
+  // gcc 4.8.4, and this breaks the ubuntu/xla_gpu build bot.
   auto it = std::find(container->begin(), container->end(), value);
   TF_RET_CHECK(it != container->end());
   container->erase(it);
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index f11123ca24..44fb1bdc38 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -17,10 +17,9 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
@@ -49,8 +48,8 @@ PaddingConfig MakeSymmetricPadding(tensorflow::gtl::ArraySlice<int64> sizes) {
 }
 
 /* static */ string ToString(const WindowDimension& dim) {
-  using tensorflow::strings::StrAppend;
-  using tensorflow::strings::StrCat;
+  using absl::StrAppend;
+  using absl::StrCat;
   string str = StrCat("(size=", dim.size());
   if (dim.stride() != 1) {
     StrAppend(&str, ",stride=", dim.stride());
@@ -75,8 +74,8 @@ PaddingConfig MakeSymmetricPadding(tensorflow::gtl::ArraySlice<int64> sizes) {
 }
 
 string ToString(const Window& window) {
-  using tensorflow::strings::StrAppend;
-  using tensorflow::strings::StrCat;
+  using absl::StrAppend;
+  using absl::StrCat;
 
   string str;
   const auto add_field =
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 3b72eb17c6..b53f89d63b 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -195,8 +195,13 @@ message DebugOptions {
   bool xla_cpu_enable_fast_math = 99;
   bool xla_gpu_enable_fast_math = 100;
 
-  // Extra options to pass to the compilation backend; specific interpretation
-  // of these values is left to the backend.
+  // Crashes the program when any kind of verification fails, instead of just
+  // logging the failures. One example is cross checking of convolution results
+  // among different algorithms.
+  bool xla_gpu_crash_on_verification_failures = 101;
+
+  // Extra options to pass to the compilation backend (e.g. LLVM); specific
+  // interpretation of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
 }
 
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 4c35e93d38..9451e0c315 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -424,25 +424,25 @@ message GatherDimensionNumbers {
   // "Window indices" is a term for a set of indices that index into the
   // interior of a dynamic-slice from the input tensor, the starting indices for
   // which were computed from output_gather_dims (see the operation semantic for
-  // how this is defined) and the gather_indices tensor.
+  // how this is defined) and the start_indices tensor.
   //
   // The window indices for a specific output index Out is computed as:
   //
   //  i = 0
   //  for (k : [0, input_tensor_shape.rank))
   //    window_indices[k] =
-  //      if k in elided_window_dims
+  //      if k in collapsed_slice_dims
   //      then 0
-  //      else Out[output_window_dims[i++]]
-  repeated int64 output_window_dims = 1;
-  repeated int64 elided_window_dims = 2;
+  //      else Out[offset_dims[i++]]
+  repeated int64 offset_dims = 1;
+  repeated int64 collapsed_slice_dims = 2;
 
-  // This is interpreted as a map from i to gather_dims_to_operand_dims[i]. It
-  // transforms the gather index looked up from the gather_indices tensor into
+  // This is interpreted as a map from i to start_index_map[i]. It
+  // transforms the gather index looked up from the start_indices tensor into
   // the starting index in the input space.
-  repeated int64 gather_dims_to_operand_dims = 3;
+  repeated int64 start_index_map = 3;
 
-  // The dimension in the gather_indices input that contains the starting
+  // The dimension in the start_indices input that contains the starting
   // indices.
   int64 index_vector_dim = 4;
 }
@@ -569,3 +569,18 @@ message ReplicaGroup {
   // ids matters in some op (e.g., all-to-all).
   repeated int64 replica_ids = 1;
 }
+
+// Used to indicate the precision configuration. It has backend specific
+// meaning.
+message PrecisionConfigProto {
+  enum Precision {
+    DEFAULT = 0;
+    HIGH = 1;
+    HIGHEST = 2;
+
+    // Next: 3
+  }
+  repeated Precision operand_precision = 1;
+
+  // Next: 2
+}
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 23bb783e22..66983801bf 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -20,7 +20,13 @@ py_library(
     ),
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [
+    deps = if_not_windows([
+        # TODO(aaroey): tensorrt dependency has to appear before tflite so the
+        # build can resolve its flatbuffers symbols within the tensorrt library.
+        # This is an issue with the tensorrt static library and will be fixed by
+        # the next tensorrt release, so fix the order here after that.
+        "//tensorflow/contrib/tensorrt:init_py",  # doesn't compile on windows
+    ]) + [
         "//tensorflow/contrib/all_reduce",
         "//tensorflow/contrib/batching:batch_py",
         "//tensorflow/contrib/bayesflow:bayesflow_py",
@@ -55,7 +61,6 @@ py_library(
         "//tensorflow/contrib/integrate:integrate_py",
         "//tensorflow/contrib/keras",
         "//tensorflow/contrib/kernel_methods",
-        "//tensorflow/contrib/kfac",
         "//tensorflow/contrib/labeled_tensor",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
@@ -64,6 +69,7 @@ py_library(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
+        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
@@ -130,12 +136,6 @@ py_library(
         "//tensorflow/contrib/bigtable",  # depends on bigtable
         "//tensorflow/contrib/cloud:cloud_py",  # doesn't compile on Windows
         "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
-        # TODO(aaroey): tensorrt dependency has to appear before tflite so the
-        # build can resolve its flatbuffers symbols within the tensorrt library.
-        # This is an issue with the tensorrt static library and will be fixed by
-        # the next tensorrt release, so fix the order here after that.
-        "//tensorflow/contrib/tensorrt:init_py",  # doesn't compile on windows
-        "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
     ]),
 )
 
@@ -181,6 +181,7 @@ cc_library(
         "//tensorflow/contrib/boosted_trees:boosted_trees_ops_op_lib",
         "//tensorflow/contrib/coder:all_ops",
         "//tensorflow/contrib/data:dataset_ops_op_lib",
+        "//tensorflow/contrib/data:indexed_dataset_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/hadoop:dataset_ops_op_lib",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index e18ea8df4d..5f477a79a3 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -51,7 +51,6 @@ from tensorflow.contrib import input_pipeline
 from tensorflow.contrib import integrate
 from tensorflow.contrib import keras
 from tensorflow.contrib import kernel_methods
-from tensorflow.contrib import kfac
 from tensorflow.contrib import labeled_tensor
 from tensorflow.contrib import layers
 from tensorflow.contrib import learn
@@ -94,8 +93,7 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-if os.name != "nt":
-  from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.recurrent.python import recurrent_api as recurrent
diff --git a/tensorflow/contrib/autograph/converters/builtin_functions_test.py b/tensorflow/contrib/autograph/converters/builtin_functions_test.py
index d5c3e2c250..d0a0cbbeb6 100644
--- a/tensorflow/contrib/autograph/converters/builtin_functions_test.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions_test.py
@@ -36,7 +36,7 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
 
     with self.converted(test_fn, builtin_functions, {'len': len},
                         array_ops.shape) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         ops = result.test_fn(constant_op.constant([0, 0, 0]))
         self.assertEqual(sess.run(ops), 3)
 
@@ -49,7 +49,7 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
       return print(a)
 
     with self.converted(test_fn, builtin_functions, {'print': print}) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.assertPrints('a\n'):
           sess.run(result.test_fn('a'))
 
@@ -62,7 +62,7 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
       return print(a, b, c)
 
     with self.converted(test_fn, builtin_functions, {'print': print}) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.assertPrints('a 1 [2, 3]\n'):
           sess.run(
               result.test_fn(
diff --git a/tensorflow/contrib/autograph/converters/call_trees_test.py b/tensorflow/contrib/autograph/converters/call_trees_test.py
index 8cdba659ee..ca4d1f2932 100644
--- a/tensorflow/contrib/autograph/converters/call_trees_test.py
+++ b/tensorflow/contrib/autograph/converters/call_trees_test.py
@@ -91,7 +91,7 @@ class CallTreesTest(converter_testing.TestCase):
       setattr(a, 'foo', 'bar')
 
     with self.converted(test_fn, call_trees, {'setattr': setattr}) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
 
         class Dummy(object):
           pass
@@ -110,7 +110,7 @@ class CallTreesTest(converter_testing.TestCase):
 
     with self.converted(test_fn, call_trees, {'np': np},
                         dtypes.int64) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertTrue(isinstance(result.test_fn(), ops.Tensor))
         self.assertIn(sess.run(result.test_fn()), (0, 1, 2))
 
@@ -129,7 +129,7 @@ class CallTreesTest(converter_testing.TestCase):
     node = call_trees.transform(node, ctx)
 
     with self.compiled(node, ns) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         result_tensor = result.test_fn(constant_op.constant(1))
         self.assertEquals(sess.run(result_tensor), 3)
 
diff --git a/tensorflow/contrib/autograph/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py
index 5a5a6ad63a..8d314250a0 100644
--- a/tensorflow/contrib/autograph/converters/control_flow.py
+++ b/tensorflow/contrib/autograph/converters/control_flow.py
@@ -95,6 +95,18 @@ class ControlFlowTransformer(converter.Base):
       return 'no variables'
     return ', '.join(map(str, symbol_set))
 
+  def _validate_no_live_vars_created(self, node):
+    body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
+    live_vars_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
+    live_vars_created_in_body = live_vars_out & body_scope.created
+    if live_vars_created_in_body:
+      raise ValueError(
+          'The following variables are created inside the loop and used later:'
+          '\n%s\n'
+          'Variables must be declared outside loops because loops may not'
+          ' necessarily execute.' % self._fmt_symbol_list(
+              live_vars_created_in_body))
+
   def visit_If(self, node):
     node = self.generic_visit(node)
 
@@ -197,6 +209,8 @@ class ControlFlowTransformer(converter.Base):
   def visit_While(self, node):
     self.generic_visit(node)
 
+    self._validate_no_live_vars_created(node)
+
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
     body_closure = body_scope.modified - body_scope.created
     all_referenced = body_scope.referenced
@@ -262,6 +276,8 @@ class ControlFlowTransformer(converter.Base):
   def visit_For(self, node):
     self.generic_visit(node)
 
+    self._validate_no_live_vars_created(node)
+
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
     body_closure = body_scope.modified - body_scope.created
     all_referenced = body_scope.referenced
@@ -294,7 +310,9 @@ class ControlFlowTransformer(converter.Base):
     template = """
       def extra_test_name(state_ssf):
         return extra_test_expr
-      def body_name(iterate, state_ssf):
+      def body_name(loop_vars, state_ssf):
+        # Workaround for PEP-3113
+        iterate = loop_vars
         body
         return state_ssf,
       state_ast_tuple = ag__.for_stmt(
diff --git a/tensorflow/contrib/autograph/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py
index ade3501426..2a6f3cb395 100644
--- a/tensorflow/contrib/autograph/converters/control_flow_test.py
+++ b/tensorflow/contrib/autograph/converters/control_flow_test.py
@@ -33,7 +33,7 @@ class ControlFlowTest(converter_testing.TestCase):
       inputs = (inputs,)
     with self.converted(test_fn, control_flow, {},
                         constant_op.constant) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertEqual(sess.run(result.test_fn(*inputs)), expected)
 
   def test_while_basic(self):
@@ -57,6 +57,17 @@ class ControlFlowTest(converter_testing.TestCase):
 
     self.assertTransformedResult(test_fn, constant_op.constant(5), 0)
 
+  def test_while_variable_defined_in_body(self):
+    def bad_while_loop(n):
+      while n > 0:
+        n -= 1
+        s = n
+      return s
+
+    node, ctx = self.prepare(bad_while_loop, {})
+    with self.assertRaises(transformer.AutographParseError):
+      control_flow.transform(node, ctx)
+
   def test_if_basic(self):
 
     def test_fn(n):
@@ -89,7 +100,7 @@ class ControlFlowTest(converter_testing.TestCase):
       return obj
 
     with self.converted(test_fn, control_flow, {}) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         res_obj = result.test_fn(constant_op.constant(1), TestClass(0, 0))
         self.assertEqual(sess.run((res_obj.a, res_obj.b)), (-1, 0))
         res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
@@ -196,6 +207,23 @@ class ControlFlowTest(converter_testing.TestCase):
       self.assertEqual(result.test_fn(5), 10)
       self.assertEqual(eval_count[0], 1)
 
+  def test_for_variable_defined_in_body(self):
+    def bad_for_loop(n):
+      for i in range(n):
+        s = i
+      return s
+
+    node, ctx = self.prepare(bad_for_loop, {})
+    with self.assertRaises(transformer.AutographParseError):
+      control_flow.transform(node, ctx)
+
+  def test_for_tuple_unpacking(self):
+    def test_fn(x_list):
+      z = tf.constant(0)  # pylint:disable=undefined-variable
+      for i, x in enumerate(x_list):
+        z = z + x + i
+      return z
 
+    self.assertTransformedResult(test_fn, [3, 3], 7)
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py
index 996e99ee61..c5e2dcf75e 100644
--- a/tensorflow/contrib/autograph/converters/lists_test.py
+++ b/tensorflow/contrib/autograph/converters/lists_test.py
@@ -65,7 +65,7 @@ class ListTest(converter_testing.TestCase):
 
     ns = {'special_functions': special_functions}
     with self.converted(test_fn, lists, ns) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
         self.assertAllEqual(sess.run(r), [1, 2, 3])
@@ -88,7 +88,7 @@ class ListTest(converter_testing.TestCase):
     node = lists.transform(node, ctx)
 
     with self.compiled(node, ns, dtypes.int32) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         ts, tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
         self.assertAllEqual(sess.run(r), [1, 2])
@@ -122,7 +122,7 @@ class ListTest(converter_testing.TestCase):
     node = lists.transform(node, ctx)
 
     with self.compiled(node, {}, array_ops.stack, dtypes.int32) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertAllEqual(sess.run(result.test_fn()), [1, 2, 3])
 
   # TODO(mdan): Add a test with tf.stack with axis kwarg.
diff --git a/tensorflow/contrib/autograph/converters/logical_expressions_test.py b/tensorflow/contrib/autograph/converters/logical_expressions_test.py
index ca07de5e8a..8f9eee7081 100644
--- a/tensorflow/contrib/autograph/converters/logical_expressions_test.py
+++ b/tensorflow/contrib/autograph/converters/logical_expressions_test.py
@@ -33,7 +33,7 @@ class GradientsFunctionTest(converter_testing.TestCase):
 
     with self.converted(test_fn, logical_expressions, {},
                         math_ops.equal) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertTrue(sess.run(result.test_fn(1, 1)))
         self.assertFalse(sess.run(result.test_fn(1, 2)))
 
@@ -44,7 +44,7 @@ class GradientsFunctionTest(converter_testing.TestCase):
 
     with self.converted(test_fn, logical_expressions, {}, math_ops.logical_or,
                         math_ops.logical_and) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertTrue(sess.run(result.test_fn(True, False, True)))
 
 
diff --git a/tensorflow/contrib/autograph/converters/side_effect_guards_test.py b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
index bee512abbc..5fe5114d4b 100644
--- a/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
@@ -46,7 +46,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, state_ops.assign) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
         sess.run(v.initializer)
         sess.run(result.test_fn(v))
@@ -67,7 +67,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, state_ops.assign) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
         sess.run(v.initializer)
         sess.run(result.test_fn(v))
@@ -87,7 +87,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, control_flow_ops.Assert) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                      'expected in throw'):
           sess.run(result.test_fn(constant_op.constant(-1)))
@@ -107,7 +107,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, state_ops.assign_add) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
         sess.run(v.initializer)
         sess.run(result.test_fn(v))
@@ -128,7 +128,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     self.assertEqual(len(node.body[0].body), 1)
 
     with self.compiled(node, {}, state_ops.assign, ops.name_scope) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
         sess.run(v.initializer)
         sess.run(result.test_fn(v))
@@ -151,7 +151,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
 
     with self.compiled(node, {}, state_ops.assign,
                        state_ops.assign_add) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
         sess.run(v.initializer)
         sess.run(result.test_fn(v))
diff --git a/tensorflow/contrib/autograph/converters/slices_test.py b/tensorflow/contrib/autograph/converters/slices_test.py
index c822d53a4a..d74b2e025e 100644
--- a/tensorflow/contrib/autograph/converters/slices_test.py
+++ b/tensorflow/contrib/autograph/converters/slices_test.py
@@ -45,7 +45,7 @@ class SliceTest(converter_testing.TestCase):
     node = slices.transform(node, ctx)
 
     with self.compiled(node, {}, dtypes.int32) as result:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         tl = list_ops.tensor_list_from_tensor(
             [1, 2], element_shape=constant_op.constant([], dtype=dtypes.int32))
         y = result.test_fn(tl)
diff --git a/tensorflow/contrib/autograph/examples/integration_tests/errors_test.py b/tensorflow/contrib/autograph/examples/integration_tests/errors_test.py
index f4b9159942..04a968be10 100644
--- a/tensorflow/contrib/autograph/examples/integration_tests/errors_test.py
+++ b/tensorflow/contrib/autograph/examples/integration_tests/errors_test.py
@@ -97,7 +97,7 @@ class ErrorsTest(tf.test.TestCase):
     compiled_fn = ag.to_graph(test_fn)
 
     with self.assertRaises(ag.TfRuntimeError) as error:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         x = compiled_fn(tf.constant([4, 8]))
         with ag.improved_errors(compiled_fn):
           sess.run(x)
@@ -144,7 +144,7 @@ class ErrorsTest(tf.test.TestCase):
     # frame with "g" as the function name but because we don't yet add
     # try/except blocks to inner functions the name is "tf__g".
     with self.assertRaises(ag.TfRuntimeError) as error:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         x = compiled_fn(tf.constant([4, 8]))
         with ag.improved_errors(compiled_fn):
           sess.run(x)
diff --git a/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py b/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py
index 680b6dbaf0..904246afb7 100644
--- a/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py
+++ b/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py
@@ -33,7 +33,7 @@ class ListLiteralsTest(tf.test.TestCase):
     converted = ag.to_graph(list_used_as_tuple)
     result = converted()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual(sess.run(result), [1, 2, 3])
 
 
diff --git a/tensorflow/contrib/autograph/operators/control_flow_test.py b/tensorflow/contrib/autograph/operators/control_flow_test.py
index b14d7edba3..677b7f8f62 100644
--- a/tensorflow/contrib/autograph/operators/control_flow_test.py
+++ b/tensorflow/contrib/autograph/operators/control_flow_test.py
@@ -34,7 +34,7 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual((10,), sess.run(s))
 
   def test_python(self):
@@ -52,7 +52,7 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual((10,), sess.run(s))
 
 
@@ -65,7 +65,7 @@ class WhileLoopTest(test.TestCase):
         body=lambda i, s: (i + 1, s + i,),
         init_state=(0, 0),
         extra_deps=(n,))
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual((5, 10), sess.run(results))
 
   def test_python(self):
@@ -86,7 +86,8 @@ class IfStmtTest(test.TestCase):
           cond=cond,
           body=lambda: 1,
           orelse=lambda: -1)
-    with self.test_session() as sess:
+
+    with self.cached_session() as sess:
       self.assertEqual(1, sess.run(test_if_stmt(constant_op.constant(True))))
       self.assertEqual(-1, sess.run(test_if_stmt(constant_op.constant(False))))
 
diff --git a/tensorflow/contrib/autograph/operators/data_structures_test.py b/tensorflow/contrib/autograph/operators/data_structures_test.py
index 7ea11a839b..4b1e835d44 100644
--- a/tensorflow/contrib/autograph/operators/data_structures_test.py
+++ b/tensorflow/contrib/autograph/operators/data_structures_test.py
@@ -42,7 +42,7 @@ class ListTest(test.TestCase):
   def test_tf_tensor_list_new(self):
     l = data_structures.tf_tensor_list_new([3, 4, 5])
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual(sess.run(t), [3, 4, 5])
 
   def test_tf_tensor_list_new_illegal_input(self):
@@ -63,7 +63,7 @@ class ListTest(test.TestCase):
   def test_tf_tensor_array_new(self):
     l = data_structures.tf_tensor_array_new([3, 4, 5])
     t = l.stack()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual(sess.run(t), [3, 4, 5])
 
   def test_tf_tensor_array_new_illegal_input(self):
@@ -88,14 +88,14 @@ class ListTest(test.TestCase):
     l = data_structures.list_append(l, x)
 
     t = list_ops.tensor_list_stack(l, element_dtype=x.dtype)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual(sess.run(t), [[1, 2, 3]])
 
   def test_append_tensorarray(self):
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
     l1 = data_structures.list_append(l, 1)
     l2 = data_structures.list_append(l1, 2)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual(sess.run(l1.stack()), [1])
       self.assertAllEqual(sess.run(l2.stack()), [1, 2])
 
@@ -116,7 +116,7 @@ class ListTest(test.TestCase):
     with self.assertRaises(NotImplementedError):
       data_structures.list_pop(l, 0, opts)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       l, x = data_structures.list_pop(l, None, opts)
       self.assertAllEqual(sess.run(x), [3, 4])
 
@@ -137,7 +137,7 @@ class ListTest(test.TestCase):
     opts = data_structures.ListStackOpts(
         element_dtype=initial_list.dtype, original_call=None)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       t = data_structures.list_stack(l, opts)
       self.assertAllEqual(sess.run(t), sess.run(initial_list))
 
diff --git a/tensorflow/contrib/autograph/operators/slices_test.py b/tensorflow/contrib/autograph/operators/slices_test.py
index d4aacb9d20..56aafe07c8 100644
--- a/tensorflow/contrib/autograph/operators/slices_test.py
+++ b/tensorflow/contrib/autograph/operators/slices_test.py
@@ -32,7 +32,7 @@ class SlicesTest(test.TestCase):
     l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape)
     l = slices.set_item(l, 0, [5, 6])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
       self.assertAllEqual(sess.run(t), [[5, 6], [3, 4]])
 
@@ -43,7 +43,7 @@ class SlicesTest(test.TestCase):
     t = slices.get_item(
         l, 1, slices.GetItemOpts(element_dtype=initial_list.dtype))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual(sess.run(t), [3, 4])
 
 
diff --git a/tensorflow/contrib/autograph/pyct/testing/BUILD b/tensorflow/contrib/autograph/pyct/testing/BUILD
index 9ef1ac9663..29a92444bb 100644
--- a/tensorflow/contrib/autograph/pyct/testing/BUILD
+++ b/tensorflow/contrib/autograph/pyct/testing/BUILD
@@ -34,8 +34,10 @@ py_test(
     srcs = ["codegen_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "manual",
         "no_windows",
         "nomsan",
+        "notap",
     ],
     deps = [
         ":testing",
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 68ead2f760..9afe3df585 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """Monte Carlo integration and helpers.
 
-See the @{$python/contrib.bayesflow.monte_carlo} guide.
-
 @@expectation
 @@expectation_importance_sampler
 @@expectation_importance_sampler_logspace
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index 8eac1243ef..f03eab510c 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -445,6 +445,7 @@ tf_kernel_library(
         "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
         "//tensorflow/contrib/boosted_trees/proto:quantiles_proto_cc",
         "//tensorflow/contrib/boosted_trees/proto:split_info_proto_cc",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
         "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
         "//tensorflow/contrib/boosted_trees/resources:quantile_stream_resource",
         "//tensorflow/core:framework_headers_lib",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index 68d710d713..c155128c0e 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -16,7 +16,10 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import tempfile
+import numpy as np
+
 from tensorflow.contrib.boosted_trees.estimator_batch import estimator
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column
@@ -26,6 +29,7 @@ from tensorflow.python.feature_column import feature_column_lib as core_feature_
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
@@ -473,6 +477,63 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     classifier.evaluate(input_fn=_multiclass_train_input_fn, steps=1)
     classifier.predict(input_fn=_eval_input_fn)
 
+  def testWeightedCategoricalColumn(self):
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    feature_columns = [
+        core_feature_column.weighted_categorical_column(
+            categorical_column=core_feature_column.
+            categorical_column_with_vocabulary_list(
+                key="word", vocabulary_list=["the", "cat", "dog"]),
+            weight_feature_key="weight")
+    ]
+
+    labels = np.array([[1], [1], [0], [0.]], dtype=np.float32)
+
+    def _make_input_fn():
+
+      def _input_fn():
+        features_dict = {}
+        # Sparse tensor representing
+        # example 0: "cat","the"
+        # examaple 1: "dog"
+        # example 2: -
+        # example 3: "the"
+        # Weights for the words are 5 - cat, 6- dog and 1 -the.
+        features_dict["word"] = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
+            values=constant_op.constant(
+                ["the", "cat", "dog", "the"], dtype=dtypes.string),
+            dense_shape=[4, 3])
+        features_dict["weight"] = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
+            values=[1., 5., 6., 1.],
+            dense_shape=[4, 3])
+        return features_dict, labels
+
+      return _input_fn
+
+    est = estimator.CoreGradientBoostedDecisionTreeEstimator(
+        head=head_fn,
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=feature_columns)
+
+    input_fn = _make_input_fn()
+    est.train(input_fn=input_fn, steps=100)
+    est.evaluate(input_fn=input_fn, steps=1)
+    est.predict(input_fn=input_fn)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 401bec84a2..3a48635319 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -34,7 +34,9 @@
 
 namespace tensorflow {
 
+using boosted_trees::learner::LearnerConfig;
 using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
+using boosted_trees::learner::ObliviousSplitInfo;
 using boosted_trees::learner::SplitInfo;
 using boosted_trees::learner::stochastic::GradientStats;
 using boosted_trees::learner::stochastic::NodeStats;
@@ -158,6 +160,11 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
     const Tensor* hessians_t;
     OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
 
+    const Tensor* weak_learner_type_t;
+    OP_REQUIRES_OK(context,
+                   context->input("weak_learner_type", &weak_learner_type_t));
+    const int32 weak_learner_type = weak_learner_type_t->scalar<int32>()();
+
     // Find the number of unique partitions before we allocate the output.
     std::vector<int32> partition_boundaries;
     partition_boundaries.push_back(0);
@@ -188,20 +195,59 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
     tensorflow::TTypes<int32>::Vec output_partition_ids =
         output_partition_ids_t->vec<int32>();
 
-    Tensor* gains_t = nullptr;
-    OP_REQUIRES_OK(
-        context, context->allocate_output("gains", TensorShape({num_elements}),
-                                          &gains_t));
+    // For a normal tree, we output a split per partition. For an oblivious
+    // tree, we output one split for all partitions of the layer
+    int32 size_output = num_elements;
+    if (weak_learner_type == LearnerConfig::OBLIVIOUS_DECISION_TREE &&
+        num_elements > 0) {
+      size_output = 1;
+    }
 
+    Tensor* gains_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "gains", TensorShape({size_output}), &gains_t));
     tensorflow::TTypes<float>::Vec gains = gains_t->vec<float>();
 
     Tensor* output_splits_t = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                "split_infos", TensorShape({num_elements}),
-                                &output_splits_t));
+    OP_REQUIRES_OK(context, context->allocate_output("split_infos",
+                                                     TensorShape({size_output}),
+                                                     &output_splits_t));
     tensorflow::TTypes<string>::Vec output_splits =
         output_splits_t->vec<string>();
+
+    if (num_elements == 0) {
+      return;
+    }
     SplitBuilderState state(context);
+    switch (weak_learner_type) {
+      case LearnerConfig::NORMAL_DECISION_TREE: {
+        ComputeNormalDecisionTree(
+            &state, normalizer_ratio, num_elements, partition_boundaries,
+            bucket_boundaries, partition_ids, bucket_ids, gradients_t,
+            hessians_t, &output_partition_ids, &gains, &output_splits);
+        break;
+      }
+      case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
+        ComputeObliviousDecisionTree(
+            &state, normalizer_ratio, num_elements, partition_boundaries,
+            bucket_boundaries, partition_ids, bucket_ids, gradients_t,
+            hessians_t, &output_partition_ids, &gains, &output_splits);
+        break;
+      }
+    }
+  }
+
+ private:
+  void ComputeNormalDecisionTree(
+      SplitBuilderState* state, const float normalizer_ratio,
+      const int num_elements, const std::vector<int32>& partition_boundaries,
+      const tensorflow::TTypes<float>::ConstVec& bucket_boundaries,
+      const tensorflow::TTypes<int32>::ConstVec& partition_ids,
+      const tensorflow::TTypes<int64>::ConstMatrix& bucket_ids,
+      const Tensor* gradients_t, const Tensor* hessians_t,
+      tensorflow::TTypes<int32>::Vec* output_partition_ids,
+      tensorflow::TTypes<float>::Vec* gains,
+      tensorflow::TTypes<string>::Vec* output_splits) {
     for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
       float best_gain = std::numeric_limits<float>::lowest();
       int start_index = partition_boundaries[root_idx];
@@ -213,7 +259,7 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
             GradientStats(*gradients_t, *hessians_t, bucket_idx);
       }
       root_gradient_stats *= normalizer_ratio;
-      NodeStats root_stats = state.ComputeNodeStats(root_gradient_stats);
+      NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats);
       int32 best_bucket_idx = 0;
       NodeStats best_right_node_stats(0);
       NodeStats best_left_node_stats(0);
@@ -223,10 +269,10 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
         GradientStats g(*gradients_t, *hessians_t, bucket_idx);
         g *= normalizer_ratio;
         left_gradient_stats += g;
-        NodeStats left_stats = state.ComputeNodeStats(left_gradient_stats);
+        NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats);
         GradientStats right_gradient_stats =
             root_gradient_stats - left_gradient_stats;
-        NodeStats right_stats = state.ComputeNodeStats(right_gradient_stats);
+        NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats);
         if (left_stats.gain + right_stats.gain > best_gain) {
           best_gain = left_stats.gain + right_stats.gain;
           best_left_node_stats = left_stats;
@@ -237,20 +283,126 @@ class BuildDenseInequalitySplitsOp : public OpKernel {
       SplitInfo split_info;
       auto* dense_split =
           split_info.mutable_split_node()->mutable_dense_float_binary_split();
-      dense_split->set_feature_column(state.feature_column_group_id());
+      dense_split->set_feature_column(state->feature_column_group_id());
       dense_split->set_threshold(
           bucket_boundaries(bucket_ids(best_bucket_idx, 0)));
 
       auto* left_child = split_info.mutable_left_child();
       auto* right_child = split_info.mutable_right_child();
 
-      state.FillLeaf(best_left_node_stats, left_child);
-      state.FillLeaf(best_right_node_stats, right_child);
-      split_info.SerializeToString(&output_splits(root_idx));
-      gains(root_idx) =
-          best_gain - root_stats.gain - state.tree_complexity_regularization();
-      output_partition_ids(root_idx) = partition_ids(start_index);
+      state->FillLeaf(best_left_node_stats, left_child);
+      state->FillLeaf(best_right_node_stats, right_child);
+      split_info.SerializeToString(&(*output_splits)(root_idx));
+      (*gains)(root_idx) =
+          best_gain - root_stats.gain - state->tree_complexity_regularization();
+      (*output_partition_ids)(root_idx) = partition_ids(start_index);
+    }
+  }
+  void ComputeObliviousDecisionTree(
+      SplitBuilderState* state, const float normalizer_ratio,
+      const int num_elements, const std::vector<int32>& partition_boundaries,
+      const tensorflow::TTypes<float>::ConstVec& bucket_boundaries,
+      const tensorflow::TTypes<int32>::ConstVec& partition_ids,
+      const tensorflow::TTypes<int64>::ConstMatrix& bucket_ids,
+      const Tensor* gradients_t, const Tensor* hessians_t,
+      tensorflow::TTypes<int32>::Vec* output_partition_ids,
+      tensorflow::TTypes<float>::Vec* gains,
+      tensorflow::TTypes<string>::Vec* output_splits) {
+    // Holds the root stats per each node to be split.
+    std::vector<GradientStats> current_layer_stats;
+    current_layer_stats.reserve(num_elements);
+    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+      const int start_index = partition_boundaries[root_idx];
+      const int end_index = partition_boundaries[root_idx + 1];
+      GradientStats root_gradient_stats;
+      for (int64 bucket_idx = start_index; bucket_idx < end_index;
+           ++bucket_idx) {
+        root_gradient_stats +=
+            GradientStats(*gradients_t, *hessians_t, bucket_idx);
+      }
+      root_gradient_stats *= normalizer_ratio;
+      current_layer_stats.push_back(root_gradient_stats);
+    }
+
+    float best_gain = std::numeric_limits<float>::lowest();
+    int64 best_bucket_idx = 0;
+    std::vector<NodeStats> best_right_node_stats(num_elements, NodeStats(0));
+    std::vector<NodeStats> best_left_node_stats(num_elements, NodeStats(0));
+    std::vector<NodeStats> current_left_node_stats(num_elements, NodeStats(0));
+    std::vector<NodeStats> current_right_node_stats(num_elements, NodeStats(0));
+    int64 current_bucket_id = 0;
+    int64 last_bucket_id = -1;
+    // Indexes offsets for each of the partitions that can be used to access
+    // gradients of a partition for a current bucket we consider.
+    std::vector<int> current_layer_offsets(num_elements, 0);
+    std::vector<GradientStats> left_gradient_stats(num_elements);
+    // The idea is to try every bucket id in increasing order. In each iteration
+    // we calculate the gain of the layer using the current bucket id as split
+    // value, and we also obtain the following bucket id to try.
+    while (current_bucket_id > last_bucket_id) {
+      last_bucket_id = current_bucket_id;
+      int64 next_bucket_id = -1;
+      for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+        int idx =
+            current_layer_offsets[root_idx] + partition_boundaries[root_idx];
+        const int end_index = partition_boundaries[root_idx + 1];
+        if (idx < end_index && bucket_ids(idx, 0) == current_bucket_id) {
+          GradientStats g(*gradients_t, *hessians_t, idx);
+          g *= normalizer_ratio;
+          left_gradient_stats[root_idx] += g;
+          current_layer_offsets[root_idx]++;
+          idx++;
+        }
+        if (idx < end_index &&
+            (bucket_ids(idx, 0) < next_bucket_id || next_bucket_id == -1)) {
+          next_bucket_id = bucket_ids(idx, 0);
+        }
+      }
+      float gain_of_split = 0.0;
+      for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+        GradientStats right_gradient_stats =
+            current_layer_stats[root_idx] - left_gradient_stats[root_idx];
+        NodeStats left_stat =
+            state->ComputeNodeStats(left_gradient_stats[root_idx]);
+        NodeStats right_stat = state->ComputeNodeStats(right_gradient_stats);
+        gain_of_split += left_stat.gain + right_stat.gain;
+        current_left_node_stats[root_idx] = left_stat;
+        current_right_node_stats[root_idx] = right_stat;
+      }
+      if (gain_of_split > best_gain) {
+        best_gain = gain_of_split;
+        best_left_node_stats = current_left_node_stats;
+        best_right_node_stats = current_right_node_stats;
+      }
+      current_bucket_id = next_bucket_id;
+    }
+
+    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+      best_gain -= state->ComputeNodeStats(current_layer_stats[root_idx]).gain;
+    }
+    best_gain -= num_elements * state->tree_complexity_regularization();
+
+    ObliviousSplitInfo oblivious_split_info;
+    auto* oblivious_dense_split =
+        oblivious_split_info.mutable_split_node()
+            ->mutable_oblivious_dense_float_binary_split();
+    oblivious_dense_split->set_feature_column(state->feature_column_group_id());
+    oblivious_dense_split->set_threshold(
+        bucket_boundaries(bucket_ids(best_bucket_idx, 0)));
+    (*gains)(0) = best_gain;
+
+    for (int root_idx = 0; root_idx < num_elements; root_idx++) {
+      auto* left_child = oblivious_split_info.add_children();
+      auto* right_child = oblivious_split_info.add_children();
+
+      state->FillLeaf(best_left_node_stats[root_idx], left_child);
+      state->FillLeaf(best_right_node_stats[root_idx], right_child);
+
+      const int start_index = partition_boundaries[root_idx];
+      (*output_partition_ids)(root_idx) = partition_ids(start_index);
+      oblivious_split_info.add_children_parent_id(partition_ids(start_index));
     }
+    oblivious_split_info.SerializeToString(&(*output_splits)(0));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("BuildDenseInequalitySplits").Device(DEVICE_CPU),
diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
index 6d9a6ee5a0..ab2853352a 100644
--- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
@@ -12,9 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
+#include <vector>
+
 #include "tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h"
 #include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
 #include "tensorflow/contrib/boosted_trees/proto/split_info.pb.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
 #include "tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -26,6 +29,7 @@ namespace boosted_trees {
 
 namespace {
 
+using boosted_trees::learner::LearnerConfig;
 using boosted_trees::learner::LearningRateConfig;
 using boosted_trees::trees::Leaf;
 using boosted_trees::trees::TreeNode;
@@ -42,6 +46,9 @@ struct SplitCandidate {
 
   // Split info.
   learner::SplitInfo split_info;
+
+  // Oblivious split info.
+  learner::ObliviousSplitInfo oblivious_split_info;
 };
 
 // Checks that the leaf is not empty.
@@ -343,7 +350,12 @@ class GrowTreeEnsembleOp : public OpKernel {
     OP_REQUIRES_OK(context, context->input("learning_rate", &learning_rate_t));
     float learning_rate = learning_rate_t->scalar<float>()();
 
-    // Read seed that was used for dropout.
+    // Read the weak learner type to use.
+    const Tensor* weak_learner_type_t;
+    OP_REQUIRES_OK(context,
+                   context->input("weak_learner_type", &weak_learner_type_t));
+    const int32 weak_learner_type = weak_learner_type_t->scalar<int32>()();
+
     const Tensor* seed_t;
     OP_REQUIRES_OK(context, context->input("dropout_seed", &seed_t));
     // Cast seed to uint64.
@@ -363,9 +375,18 @@ class GrowTreeEnsembleOp : public OpKernel {
 
     // Find best splits for each active partition.
     std::map<int32, SplitCandidate> best_splits;
-    FindBestSplitsPerPartition(context, partition_ids_list, gains_list,
-                               splits_list, &best_splits);
-
+    switch (weak_learner_type) {
+      case LearnerConfig::NORMAL_DECISION_TREE: {
+        FindBestSplitsPerPartitionNormal(context, partition_ids_list,
+                                         gains_list, splits_list, &best_splits);
+        break;
+      }
+      case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
+        FindBestSplitsPerPartitionOblivious(context, gains_list, splits_list,
+                                            &best_splits);
+        break;
+      }
+    }
     // No-op if no new splits can be considered.
     if (best_splits.empty()) {
       LOG(WARNING) << "Not growing tree ensemble as no good splits were found.";
@@ -377,25 +398,34 @@ class GrowTreeEnsembleOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->input("max_tree_depth", &max_tree_depth_t));
     const int32 max_tree_depth = max_tree_depth_t->scalar<int32>()();
-
     // Update and retrieve the growable tree.
     // If the tree is fully built and dropout was applied, it also adjusts the
     // weights of dropped and the last tree.
     boosted_trees::trees::DecisionTreeConfig* const tree_config =
         UpdateAndRetrieveGrowableTree(ensemble_resource, learning_rate,
-                                      dropout_seed, max_tree_depth);
-
+                                      dropout_seed, max_tree_depth,
+                                      weak_learner_type);
     // Split tree nodes.
-    for (auto& split_entry : best_splits) {
-      SplitTreeNode(split_entry.first, &split_entry.second, tree_config,
-                    ensemble_resource);
+    switch (weak_learner_type) {
+      case LearnerConfig::NORMAL_DECISION_TREE: {
+        for (auto& split_entry : best_splits) {
+          SplitTreeNode(split_entry.first, &split_entry.second, tree_config,
+                        ensemble_resource);
+        }
+        break;
+      }
+      case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
+        SplitTreeLayer(&best_splits[0], tree_config, ensemble_resource);
+      }
     }
-
     // Post-prune finalized tree if needed.
     if (learner_config_.pruning_mode() ==
             boosted_trees::learner::LearnerConfig::POST_PRUNE &&
         ensemble_resource->LastTreeMetadata()->is_finalized()) {
       VLOG(2) << "Post-pruning finalized tree.";
+      if (weak_learner_type == LearnerConfig::OBLIVIOUS_DECISION_TREE) {
+        LOG(FATAL) << "Post-prunning is not implemented for Oblivious trees.";
+      }
       PruneTree(tree_config);
 
       // If after post-pruning the whole tree has no gain, remove the tree
@@ -409,10 +439,9 @@ class GrowTreeEnsembleOp : public OpKernel {
  private:
   // Helper method which effectively does a reduce over all split candidates
   // and finds the best split for each partition.
-  void FindBestSplitsPerPartition(
-      OpKernelContext* const context,
-      const OpInputList& partition_ids_list, const OpInputList& gains_list,
-      const OpInputList& splits_list,
+  void FindBestSplitsPerPartitionNormal(
+      OpKernelContext* const context, const OpInputList& partition_ids_list,
+      const OpInputList& gains_list, const OpInputList& splits_list,
       std::map<int32, SplitCandidate>* best_splits) {
     // Find best split per partition going through every feature candidate.
     // TODO(salehay): Is this worth parallelizing?
@@ -446,6 +475,90 @@ class GrowTreeEnsembleOp : public OpKernel {
     }
   }
 
+  void FindBestSplitsPerPartitionOblivious(
+      OpKernelContext* const context, const OpInputList& gains_list,
+      const OpInputList& splits_list,
+      std::map<int32, SplitCandidate>* best_splits) {
+    // Find best split per partition going through every feature candidate.
+    for (int64 handler_id = 0; handler_id < num_handlers_; ++handler_id) {
+      const auto& gains = gains_list[handler_id].vec<float>();
+      const auto& splits = splits_list[handler_id].vec<string>();
+      OP_REQUIRES(context, gains.size() == 1,
+                  errors::InvalidArgument(
+                      "Gains size must be one for oblivious weak learner: ",
+                      gains.size(), " != ", 1));
+      OP_REQUIRES(context, splits.size() == 1,
+                  errors::InvalidArgument(
+                      "Splits size must be one for oblivious weak learner: ",
+                      splits.size(), " != ", 1));
+      // Get current split candidate.
+      const auto& gain = gains(0);
+      const auto& serialized_split = splits(0);
+      SplitCandidate split;
+      split.handler_id = handler_id;
+      split.gain = gain;
+      OP_REQUIRES(
+          context, split.oblivious_split_info.ParseFromString(serialized_split),
+          errors::InvalidArgument("Unable to parse oblivious split info."));
+
+      auto split_info = split.oblivious_split_info;
+      CHECK(split_info.children_size() % 2 == 0)
+          << "The oblivious split should generate an even number of children: "
+          << split_info.children_size();
+
+      // If every node is pure, then we shouldn't split.
+      bool only_pure_nodes = true;
+      for (int idx = 0; idx < split_info.children_size(); idx += 2) {
+        if (IsLeafWellFormed(*split_info.mutable_children(idx)) &&
+            IsLeafWellFormed(*split_info.mutable_children(idx + 1))) {
+          only_pure_nodes = false;
+          break;
+        }
+      }
+      if (only_pure_nodes) {
+        VLOG(1) << "The oblivious split does not actually split anything.";
+        continue;
+      }
+
+      // Don't consider negative splits if we're pre-pruning the tree.
+      if (learner_config_.pruning_mode() == learner::LearnerConfig::PRE_PRUNE &&
+          gain < 0) {
+        continue;
+      }
+
+      // Take the split if we don't have a candidate yet.
+      auto best_split_it = best_splits->find(0);
+      if (best_split_it == best_splits->end()) {
+        best_splits->insert(std::make_pair(0, std::move(split)));
+        continue;
+      }
+
+      // Determine if we should update best split.
+      SplitCandidate& best_split = best_split_it->second;
+      trees::TreeNode current_node = split_info.split_node();
+      trees::TreeNode best_node = best_split.oblivious_split_info.split_node();
+      if (TF_PREDICT_FALSE(gain == best_split.gain)) {
+        // Tie break on node case preferring simpler tree node types.
+        VLOG(2) << "Attempting to tie break with smaller node case. "
+                << "(current split: " << current_node.node_case()
+                << ", best split: " << best_node.node_case() << ")";
+        if (current_node.node_case() < best_node.node_case()) {
+          best_split = std::move(split);
+        } else if (current_node.node_case() == best_node.node_case()) {
+          // Tie break on handler Id.
+          VLOG(2) << "Tie breaking with higher handler Id. "
+                  << "(current split: " << handler_id
+                  << ", best split: " << best_split.handler_id << ")";
+          if (handler_id > best_split.handler_id) {
+            best_split = std::move(split);
+          }
+        }
+      } else if (gain > best_split.gain) {
+        best_split = std::move(split);
+      }
+    }
+  }
+
   void UpdateTreeWeightsIfDropout(
       boosted_trees::models::DecisionTreeEnsembleResource* const
           ensemble_resource,
@@ -501,7 +614,7 @@ class GrowTreeEnsembleOp : public OpKernel {
       boosted_trees::models::DecisionTreeEnsembleResource* const
           ensemble_resource,
       const float learning_rate, const uint64 dropout_seed,
-      const int32 max_tree_depth) {
+      const int32 max_tree_depth, const int32 weak_learner_type) {
     const auto num_trees = ensemble_resource->num_trees();
     if (num_trees <= 0 ||
         ensemble_resource->LastTreeMetadata()->is_finalized()) {
@@ -647,6 +760,71 @@ class GrowTreeEnsembleOp : public OpKernel {
     }
   }
 
+  void SplitTreeLayer(
+      SplitCandidate* split,
+      boosted_trees::trees::DecisionTreeConfig* tree_config,
+      boosted_trees::models::DecisionTreeEnsembleResource* ensemble_resource) {
+    int depth = 0;
+    while (depth < tree_config->nodes_size() &&
+           tree_config->nodes(depth).node_case() != TreeNode::kLeaf) {
+      depth++;
+    }
+    CHECK(tree_config->nodes_size() > 0)
+        << "A tree must have at least one dummy leaf.";
+    // The number of new children.
+    int num_children = 1 << (depth + 1);
+    auto split_info = split->oblivious_split_info;
+    CHECK(num_children >= split_info.children_size())
+        << "Too many new children, expected <= " << num_children << " and got "
+        << split_info.children_size();
+    std::vector<trees::Leaf> new_leaves;
+    new_leaves.reserve(num_children);
+    int next_id = 0;
+    for (int idx = 0; idx < num_children / 2; idx++) {
+      trees::Leaf old_leaf =
+          *tree_config->mutable_nodes(depth + idx)->mutable_leaf();
+      // Check if a split was made for this leaf.
+      if (next_id < split_info.children_parent_id_size() &&
+          depth + idx == split_info.children_parent_id(next_id)) {
+        // Add left leaf.
+        new_leaves.push_back(*MergeLeafWeights(
+            old_leaf, split_info.mutable_children(2 * next_id)));
+        // Add right leaf.
+        new_leaves.push_back(*MergeLeafWeights(
+            old_leaf, split_info.mutable_children(2 * next_id + 1)));
+        next_id++;
+      } else {
+        // If there is no split for this leaf, just duplicate it.
+        new_leaves.push_back(old_leaf);
+        new_leaves.push_back(old_leaf);
+      }
+    }
+    CHECK(next_id == split_info.children_parent_id_size());
+    TreeNodeMetadata* split_metadata =
+        split_info.mutable_split_node()->mutable_node_metadata();
+    split_metadata->set_gain(split->gain);
+
+    TreeNode new_split = *split_info.mutable_split_node();
+    // Move old children to metadata.
+    for (int idx = depth; idx < tree_config->nodes_size(); idx++) {
+      *new_split.mutable_node_metadata()->add_original_oblivious_leaves() =
+          *tree_config->mutable_nodes(idx)->mutable_leaf();
+    }
+    // Add the new split to the tree_config in place before the children start.
+    *tree_config->mutable_nodes(depth) = new_split;
+    // Add the new children
+    int nodes_size = tree_config->nodes_size();
+    for (int idx = 0; idx < num_children; idx++) {
+      if (idx + depth + 1 < nodes_size) {
+        // Update leaves that were already there.
+        *tree_config->mutable_nodes(idx + depth + 1)->mutable_leaf() =
+            new_leaves[idx];
+      } else {
+        // Add new leaves.
+        *tree_config->add_nodes()->mutable_leaf() = new_leaves[idx];
+      }
+    }
+  }
   void PruneTree(boosted_trees::trees::DecisionTreeConfig* tree_config) {
     // No-op if tree is empty.
     if (tree_config->nodes_size() <= 0) {
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 2559fe9913..f45010ec26 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -64,6 +64,7 @@ from __future__ import print_function
 import re
 
 from tensorflow.contrib.boosted_trees.lib.learner.batch import base_split_handler
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.boosted_trees.python.ops import gen_quantile_ops
 from tensorflow.contrib.boosted_trees.python.ops import gen_stats_accumulator_ops
 from tensorflow.contrib.boosted_trees.python.ops import quantile_ops
@@ -171,6 +172,7 @@ class DenseSplitHandler(InequalitySplitHandler):
                multiclass_strategy,
                init_stamp_token=0,
                loss_uses_sum_reduction=False,
+               weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE,
                name=None):
     """Initialize the internal state for this split handler.
 
@@ -192,6 +194,7 @@ class DenseSplitHandler(InequalitySplitHandler):
          stamped objects.
       loss_uses_sum_reduction: A scalar boolean tensor that specifies whether
           SUM or MEAN reduction was used for the loss.
+      weak_learner_type: Specifies the type of weak learner to use.
       name: An optional handler name.
     """
     super(DenseSplitHandler, self).__init__(
@@ -209,6 +212,7 @@ class DenseSplitHandler(InequalitySplitHandler):
         multiclass_strategy=multiclass_strategy,
         loss_uses_sum_reduction=loss_uses_sum_reduction)
     self._dense_float_column = dense_float_column
+    self._weak_learner_type = weak_learner_type
     # Register dense_make_stats_update function as an Op to the graph.
     g = ops.get_default_graph()
     dense_make_stats_update.add_to_graph(g)
@@ -269,16 +273,17 @@ class DenseSplitHandler(InequalitySplitHandler):
                 next_stamp_token, self._multiclass_strategy, class_id,
                 self._feature_column_group_id, self._l1_regularization,
                 self._l2_regularization, self._tree_complexity_regularization,
-                self._min_node_weight, self._loss_uses_sum_reduction))
-
+                self._min_node_weight, self._loss_uses_sum_reduction,
+                self._weak_learner_type))
     return are_splits_ready, partition_ids, gains, split_infos
 
 
-def _make_dense_split(
-    quantile_accumulator_handle, stats_accumulator_handle, stamp_token,
-    next_stamp_token, multiclass_strategy, class_id, feature_column_id,
-    l1_regularization, l2_regularization, tree_complexity_regularization,
-    min_node_weight, is_multi_dimentional, loss_uses_sum_reduction):
+def _make_dense_split(quantile_accumulator_handle, stats_accumulator_handle,
+                      stamp_token, next_stamp_token, multiclass_strategy,
+                      class_id, feature_column_id, l1_regularization,
+                      l2_regularization, tree_complexity_regularization,
+                      min_node_weight, is_multi_dimentional,
+                      loss_uses_sum_reduction, weak_learner_type):
   """Function that builds splits for a dense feature column."""
   # Get the bucket boundaries
   are_splits_ready, buckets = (
@@ -327,7 +332,8 @@ def _make_dense_split(
           l2_regularization=l2_regularization,
           tree_complexity_regularization=tree_complexity_regularization,
           min_node_weight=min_node_weight,
-          multiclass_strategy=multiclass_strategy))
+          multiclass_strategy=multiclass_strategy,
+          weak_learner_type=weak_learner_type))
   return are_splits_ready, partition_ids, gains, split_infos
 
 
@@ -507,7 +513,40 @@ def _make_sparse_split(
   return are_splits_ready, partition_ids, gains, split_infos
 
 
-def _specialize_make_split(func, is_multi_dimentional):
+def _specialize_make_split_dense(func, is_multi_dimentional):
+  """Builds a specialized version of the function."""
+
+  @function.Defun(
+      dtypes.resource,
+      dtypes.resource,
+      dtypes.int64,
+      dtypes.int64,
+      dtypes.int32,
+      dtypes.int32,
+      dtypes.int32,
+      dtypes.float32,
+      dtypes.float32,
+      dtypes.float32,
+      dtypes.float32,
+      dtypes.bool,
+      dtypes.int32,
+      noinline=True)
+  def f(quantile_accumulator_handle, stats_accumulator_handle, stamp_token,
+        next_stamp_token, multiclass_strategy, class_id, feature_column_id,
+        l1_regularization, l2_regularization, tree_complexity_regularization,
+        min_node_weight, loss_uses_sum_reduction, weak_learner_type):
+    """Function that builds splits for a sparse feature column."""
+    return func(quantile_accumulator_handle, stats_accumulator_handle,
+                stamp_token, next_stamp_token, multiclass_strategy, class_id,
+                feature_column_id, l1_regularization, l2_regularization,
+                tree_complexity_regularization, min_node_weight,
+                is_multi_dimentional, loss_uses_sum_reduction,
+                weak_learner_type)
+
+  return f
+
+
+def _specialize_make_split_sparse(func, is_multi_dimentional):
   """Builds a specialized version of the function."""
 
   @function.Defun(
@@ -537,15 +576,17 @@ def _specialize_make_split(func, is_multi_dimentional):
 
   return f
 
-make_dense_split_scalar = _specialize_make_split(_make_dense_split,
-                                                 is_multi_dimentional=False)
-make_dense_split_tensor = _specialize_make_split(_make_dense_split,
-                                                 is_multi_dimentional=True)
 
-make_sparse_split_scalar = _specialize_make_split(_make_sparse_split,
-                                                  is_multi_dimentional=False)
-make_sparse_split_tensor = _specialize_make_split(_make_sparse_split,
-                                                  is_multi_dimentional=True)
+make_dense_split_scalar = _specialize_make_split_dense(
+    _make_dense_split, is_multi_dimentional=False)
+
+make_dense_split_tensor = _specialize_make_split_dense(
+    _make_dense_split, is_multi_dimentional=True)
+
+make_sparse_split_scalar = _specialize_make_split_sparse(
+    _make_sparse_split, is_multi_dimentional=False)
+make_sparse_split_tensor = _specialize_make_split_sparse(
+    _make_sparse_split, is_multi_dimentional=True)
 
 
 @function.Defun(
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
index 5d82c4cae5..31043264a1 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
@@ -182,6 +182,138 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase):
 
     self.assertAllClose(0.52, split_node.threshold, 0.00001)
 
+  def testObliviousFeatureSplitGeneration(self):
+    with self.test_session() as sess:
+      # The data looks like the following:
+      # Example |  Gradients    | Partition | Dense Quantile |
+      # i0      |  (0.2, 0.12)  | 1         | 2              |
+      # i1      |  (-0.5, 0.07) | 1         | 2              |
+      # i2      |  (1.2, 0.2)   | 1         | 0              |
+      # i3      |  (4.0, 0.13)  | 2         | 1              |
+      dense_column = array_ops.constant([0.62, 0.62, 0.3, 0.52])
+      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
+      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
+      partition_ids = array_ops.constant([1, 1, 1, 2], dtype=dtypes.int32)
+      class_id = -1
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      split_handler = ordinal_split_handler.DenseSplitHandler(
+          l1_regularization=0.1,
+          l2_regularization=1.,
+          tree_complexity_regularization=0.,
+          min_node_weight=0.,
+          epsilon=0.001,
+          num_quantiles=10,
+          feature_column_group_id=0,
+          dense_float_column=dense_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+          weak_learner_type=learner_pb2.LearnerConfig.OBLIVIOUS_DECISION_TREE)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([4, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          partition_ids,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(
+            np.int64(0), np.int64(1), class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            partition_ids,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(np.int64(1), np.int64(2), class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    self.assertAllEqual([1, 2], partitions)
+
+    oblivious_split_info = split_info_pb2.ObliviousSplitInfo()
+    oblivious_split_info.ParseFromString(splits[0])
+    split_node = oblivious_split_info.split_node
+    split_node = split_node.oblivious_dense_float_binary_split
+    self.assertAllClose(0.3, split_node.threshold, 0.00001)
+    self.assertEqual(0, split_node.feature_column)
+
+    # Check the split on partition 1.
+    # -(1.2 - 0.1) / (0.2 + 1)
+    expected_left_weight_1 = -0.9166666666666666
+
+    # expected_left_weight_1 * -(1.2 - 0.1)
+    expected_left_gain_1 = 1.008333333333333
+
+    # (-0.5 + 0.2 + 0.1) / (0.19 + 1)
+    expected_right_weight_1 = 0.1680672
+
+    # expected_right_weight_1 * -(-0.5 + 0.2 + 0.1))
+    expected_right_gain_1 = 0.033613445378151252
+
+    # (0.2 + -0.5 + 1.2 - 0.1) ** 2 / (0.12 + 0.07 + 0.2 + 1)
+    expected_bias_gain_1 = 0.46043165467625896
+
+    left_child = oblivious_split_info.children[0].vector
+    right_child = oblivious_split_info.children[1].vector
+
+    self.assertAllClose([expected_left_weight_1], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight_1], right_child.value, 0.00001)
+
+    # Check the split on partition 2.
+    expected_left_weight_2 = 0
+    expected_left_gain_2 = 0
+    # -(4 - 0.1) / (0.13 + 1)
+    expected_right_weight_2 = -3.4513274336283186
+    # expected_right_weight_2 * -(4 - 0.1)
+    expected_right_gain_2 = 13.460176991150442
+    # (-4 + 0.1) ** 2 / (0.13 + 1)
+    expected_bias_gain_2 = 13.460176991150442
+
+    left_child = oblivious_split_info.children[2].vector
+    right_child = oblivious_split_info.children[3].vector
+
+    self.assertAllClose([expected_left_weight_2], left_child.value, 0.00001)
+
+    self.assertAllClose([expected_right_weight_2], right_child.value, 0.00001)
+
+    # The layer gain is the sum of the gains of each partition
+    layer_gain = (
+        expected_left_gain_1 + expected_right_gain_1 - expected_bias_gain_1) + (
+            expected_left_gain_2 + expected_right_gain_2 - expected_bias_gain_2)
+    self.assertAllClose(layer_gain, gains[0], 0.00001)
+
+    # We have examples in both partitions, then we get both ids.
+    self.assertEqual(2, len(oblivious_split_info.children_parent_id))
+    self.assertEqual(1, oblivious_split_info.children_parent_id[0])
+    self.assertEqual(2, oblivious_split_info.children_parent_id[1])
+
   def testGenerateFeatureSplitCandidatesLossUsesSumReduction(self):
     with self.test_session() as sess:
       # The data looks like the following:
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
index 69bb8fd4ad..8d71a6cdbc 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
@@ -36,12 +36,6 @@ class WeightedQuantilesSummary {
   struct SummaryEntry {
     SummaryEntry(const ValueType& v, const WeightType& w, const WeightType& min,
                  const WeightType& max) {
-      // Explicitly initialize all of memory (including padding from memory
-      // alignment) to allow the struct to be msan-resistant "plain old data".
-      //
-      // POD = http://en.cppreference.com/w/cpp/concept/PODType
-      memset(this, 0, sizeof(*this));
-
       value = v;
       weight = w;
       min_rank = min;
@@ -49,8 +43,6 @@ class WeightedQuantilesSummary {
     }
 
     SummaryEntry() {
-      memset(this, 0, sizeof(*this));
-
       value = ValueType();
       weight = 0;
       min_rank = 0;
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
index 0e5578693a..3ed6c5c04d 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
+#include <algorithm>
+
 #include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
 #include "tensorflow/core/platform/macros.h"
 
-#include <algorithm>
-
 namespace tensorflow {
 namespace boosted_trees {
 namespace trees {
@@ -28,14 +28,15 @@ int DecisionTree::Traverse(const DecisionTreeConfig& config,
   if (TF_PREDICT_FALSE(config.nodes_size() <= sub_root_id)) {
     return kInvalidLeaf;
   }
-
   // Traverse tree starting at the provided sub-root.
   int32 node_id = sub_root_id;
+  // The index of the leave that holds this example in the oblivious case.
+  int oblivious_leaf_idx = 0;
   while (true) {
     const auto& current_node = config.nodes(node_id);
     switch (current_node.node_case()) {
       case TreeNode::kLeaf: {
-        return node_id;
+        return node_id + oblivious_leaf_idx;
       }
       case TreeNode::kDenseFloatBinarySplit: {
         const auto& split = current_node.dense_float_binary_split();
@@ -100,6 +101,16 @@ int DecisionTree::Traverse(const DecisionTreeConfig& config,
         }
         break;
       }
+      case TreeNode::kObliviousDenseFloatBinarySplit: {
+        const auto& split = current_node.oblivious_dense_float_binary_split();
+        oblivious_leaf_idx <<= 1;
+        if (example.dense_float_features[split.feature_column()] >
+            split.threshold()) {
+          oblivious_leaf_idx++;
+        }
+        node_id++;
+        break;
+      }
       case TreeNode::NODE_NOT_SET: {
         LOG(QFATAL) << "Invalid node in tree: " << current_node.DebugString();
         break;
@@ -165,6 +176,11 @@ void DecisionTree::LinkChildren(const std::vector<int32>& children,
       split->set_right_id(*++children_it);
       break;
     }
+    case TreeNode::kObliviousDenseFloatBinarySplit: {
+      LOG(QFATAL)
+          << "Not implemented for the ObliviousDenseFloatBinarySplit case.";
+      break;
+    }
     case TreeNode::NODE_NOT_SET: {
       LOG(QFATAL) << "A non-set node cannot have children.";
       break;
@@ -199,6 +215,11 @@ std::vector<int32> DecisionTree::GetChildren(const TreeNode& node) {
       const auto& split = node.categorical_id_set_membership_binary_split();
       return {split.left_id(), split.right_id()};
     }
+    case TreeNode::kObliviousDenseFloatBinarySplit: {
+      LOG(QFATAL)
+          << "Not implemented for the ObliviousDenseFloatBinarySplit case.";
+      return {};
+    }
     case TreeNode::NODE_NOT_SET: {
       return {};
     }
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h b/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
index ec06787e1d..1f3672bf85 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
-#define TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_PARALLEL_FOR_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_PARALLEL_FOR_H_
 
 #include "tensorflow/core/lib/core/threadpool.h"
 
@@ -30,4 +30,4 @@ void ParallelFor(int64 batch_size, int64 desired_parallelism,
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_PARALLEL_FOR_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/random.h b/tensorflow/contrib/boosted_trees/lib/utils/random.h
index 546d344f55..249651e99e 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/random.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/random.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
-#define TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_RANDOM_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_RANDOM_H_
 
 #include "tensorflow/core/lib/random/simple_philox.h"
 
@@ -36,4 +36,4 @@ inline int32 PoissonBootstrap(random::SimplePhilox* rng) {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_RANDOM_H_
diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
index ca5c7f3d8c..9b68a9de96 100644
--- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
@@ -36,6 +36,7 @@ REGISTER_OP("BuildDenseInequalitySplits")
     .Input("tree_complexity_regularization: float")
     .Input("min_node_weight: float")
     .Input("multiclass_strategy: int32")
+    .Input("weak_learner_type: int32")
     .Output("output_partition_ids: int32")
     .Output("gains: float32")
     .Output("split_infos: string")
@@ -84,6 +85,8 @@ min_node_weight: A scalar, minimum sum of example hessian needed in a child.
     be considered.
 multiclass_strategy: A scalar, specifying the multiclass handling strategy.
     See LearnerConfig.MultiClassStrategy for valid values.
+weak_learner_type: A scalar, specifying the weak learner type to use.
+    See LearnerConfig.WeakLearnerType for valid values.
 output_partition_ids: A rank 1 tensor, the partition IDs that we created splits
     for.
 gains: A rank 1 tensor, for the computed gain for the created splits.
diff --git a/tensorflow/contrib/boosted_trees/ops/training_ops.cc b/tensorflow/contrib/boosted_trees/ops/training_ops.cc
index 22ac9edb72..604ec8e0bf 100644
--- a/tensorflow/contrib/boosted_trees/ops/training_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/training_ops.cc
@@ -57,6 +57,7 @@ REGISTER_OP("GrowTreeEnsemble")
     .Input("learning_rate: float")
     .Input("dropout_seed: int64")
     .Input("max_tree_depth: int32")
+    .Input("weak_learner_type: int32")
     .Input("partition_ids: num_handlers * int32")
     .Input("gains: num_handlers * float")
     .Input("splits: num_handlers * string")
@@ -82,6 +83,7 @@ tree_ensemble_handle: Handle to the ensemble variable.
 stamp_token: Stamp token for validating operation consistency.
 next_stamp_token: Stamp token to be used for the next iteration.
 learning_rate: Scalar learning rate.
+weak_learner_type: The type of weak learner to use.
 partition_ids: List of Rank 1 Tensors containing partition Id per candidate.
 gains: List of Rank 1 Tensors containing gains per candidate.
 splits: List of Rank 1 Tensors containing serialized SplitInfo protos per candidate.
diff --git a/tensorflow/contrib/boosted_trees/proto/learner.proto b/tensorflow/contrib/boosted_trees/proto/learner.proto
index d84ba7438e..c49cb48cde 100644
--- a/tensorflow/contrib/boosted_trees/proto/learner.proto
+++ b/tensorflow/contrib/boosted_trees/proto/learner.proto
@@ -108,6 +108,11 @@ message LearnerConfig {
     DIAGONAL_HESSIAN = 3;
   }
 
+  enum WeakLearnerType {
+    NORMAL_DECISION_TREE = 0;
+    OBLIVIOUS_DECISION_TREE = 1;
+  }
+
   // Number of classes.
   uint32 num_classes = 1;
 
@@ -141,4 +146,7 @@ message LearnerConfig {
   // If you want to average the ensembles (for regularization), provide the
   // config below.
   AveragingConfig averaging_config = 11;
+
+  // By default we use NORMAL_DECISION_TREE as weak learner.
+  WeakLearnerType weak_learner_type = 12;
 }
diff --git a/tensorflow/contrib/boosted_trees/proto/split_info.proto b/tensorflow/contrib/boosted_trees/proto/split_info.proto
index a300c24c8e..784977af39 100644
--- a/tensorflow/contrib/boosted_trees/proto/split_info.proto
+++ b/tensorflow/contrib/boosted_trees/proto/split_info.proto
@@ -17,3 +17,12 @@ message SplitInfo {
   // Right Leaf node.
   tensorflow.boosted_trees.trees.Leaf right_child = 3;
 }
+
+message ObliviousSplitInfo {
+  tensorflow.boosted_trees.trees.TreeNode split_node = 1;
+  repeated tensorflow.boosted_trees.trees.Leaf children = 2;
+  // For each child, children_parent_id stores the node_id of its parent when it
+  // was a leaf. For the idx-th child it corresponds the idx/2-th
+  // children_parent_id.
+  repeated int32 children_parent_id = 3;
+}
diff --git a/tensorflow/contrib/boosted_trees/proto/tree_config.proto b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
index 81411aa84a..500909bf2a 100644
--- a/tensorflow/contrib/boosted_trees/proto/tree_config.proto
+++ b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
@@ -15,6 +15,7 @@ message TreeNode {
     CategoricalIdBinarySplit categorical_id_binary_split = 5;
     CategoricalIdSetMembershipBinarySplit
         categorical_id_set_membership_binary_split = 6;
+    ObliviousDenseFloatBinarySplit oblivious_dense_float_binary_split = 7;
   }
   TreeNodeMetadata node_metadata = 777;
 }
@@ -26,6 +27,9 @@ message TreeNodeMetadata {
 
   // The original leaf node before this node was split.
   Leaf original_leaf = 2;
+
+  // The original layer of leaves before that layer was converted to a split.
+  repeated Leaf original_oblivious_leaves = 3;
 }
 
 // Leaves can either hold dense or sparse information.
@@ -101,6 +105,17 @@ message CategoricalIdSetMembershipBinarySplit {
   int32 right_id = 4;
 }
 
+// Split rule for dense float features in the oblivious case.
+message ObliviousDenseFloatBinarySplit {
+  // Float feature column and split threshold describing
+  // the rule feature <= threshold.
+  int32 feature_column = 1;
+  float threshold = 2;
+  // We don't store children ids, because either the next node represents the
+  // whole next layer of the tree or starting with the next node we only have
+  // leaves.
+}
+
 // DecisionTreeConfig describes a list of connected nodes.
 // Node 0 must be the root and can carry any payload including a leaf
 // in the case of representing the bias.
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
index 63b9c5fddf..42d69645ac 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
@@ -98,7 +98,7 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
     self._seed = 123
 
   def testCreate(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree = tree_ensemble_config.trees.add()
       _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
@@ -133,7 +133,7 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
 
   def testSerialization(self):
     with ops.Graph().as_default() as graph:
-      with self.test_session(graph):
+      with self.session(graph):
         tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
         # Bias tree only for second class.
         tree1 = tree_ensemble_config.trees.add()
@@ -164,7 +164,7 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
         serialized_config = serialized_config.eval()
 
     with ops.Graph().as_default() as graph:
-      with self.test_session(graph):
+      with self.session(graph):
         tree_ensemble_handle2 = model_ops.tree_ensemble_variable(
             stamp_token=9,
             tree_ensemble_config=serialized_config,
@@ -204,14 +204,14 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
         self.assertAllClose(result.eval(), [[0.5, -0.2], [0, 1.0]])
 
   def testRestore(self):
-    # Calling self.test_session() without a graph specified results in
+    # Calling self.cached_session() without a graph specified results in
     # TensorFlowTestCase caching the session and returning the same one
     # every time. In this test, we need to create two different sessions
-    # which is why we also create a graph and pass it to self.test_session()
+    # which is why we also create a graph and pass it to self.cached_session()
     # to ensure no caching occurs under the hood.
     save_path = os.path.join(self.get_temp_dir(), "restore-test")
     with ops.Graph().as_default() as graph:
-      with self.test_session(graph) as sess:
+      with self.session(graph) as sess:
         # Prepare learner config.
         learner_config = learner_pb2.LearnerConfig()
         learner_config.num_classes = 2
@@ -288,7 +288,7 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
     # Start a second session.  In that session the parameter nodes
     # have not been initialized either.
     with ops.Graph().as_default() as graph:
-      with self.test_session(graph) as sess:
+      with self.session(graph) as sess:
         tree_ensemble_handle = model_ops.tree_ensemble_variable(
             stamp_token=0, tree_ensemble_config="", name="restore_tree")
         my_saver = saver.Saver()
@@ -311,7 +311,7 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
         self.assertAllClose(result.eval(), [[-1.1], [-1.1]])
 
   def testUsedHandlers(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree_ensemble_config.growing_metadata.used_handler_ids.append(1)
       tree_ensemble_config.growing_metadata.used_handler_ids.append(5)
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
index cf55759aaa..4278a30ba9 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
@@ -96,6 +96,20 @@ def _set_float_split(split, feat_col, thresh, l_id, r_id, feature_dim_id=None):
     split.dimension_id = feature_dim_id
 
 
+def _set_float_oblivious_split(split, feat_col, thresh):
+  """Helper method for building tree float splits.
+
+  Sets split feature column and threshold.
+
+  Args:
+    split: split node to update.
+    feat_col: feature column for the split.
+    thresh: threshold to split on forming rule x <= thresh.
+  """
+  split.feature_column = feat_col
+  split.threshold = thresh
+
+
 def _set_categorical_id_split(split, feat_col, feat_id, l_id, r_id):
   """Helper method for building tree categorical id splits.
 
@@ -119,15 +133,17 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
   def setUp(self):
     """Sets up the prediction tests.
 
-    Create a batch of two examples having one dense float, two sparse float
+    Creates, a batch of two examples having three dense float, two sparse float
     single valued, one sparse float multidimensional and one sparse int
     features.  The data looks like the following:
-    | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 | SparseM
-    | 0        |  7     |    -3    |          |    9,1   | __, 5.0
-    | 1        | -2     |          | 4        |          |  3, ___
+    |Instance |Dense0 |Dense1 |Dense2 |SparseF0 |SparseF1 |SparseI0 |SparseM
+    | 0       |  7    |  1    |  2    |  -3     |         |   9,1   | __, 5.0
+    | 1       | -2    |  2    |  0.5  |         |  4      |         |  3, ___
     """
     super(PredictionOpsTest, self).setUp()
-    self._dense_float_tensor = np.array([[7.0], [-2.0]])
+    self._dense_float_tensor1 = np.array([[7.0], [-2.0]])
+    self._dense_float_tensor2 = np.array([[1.0], [2.0]])
+    self._dense_float_tensor3 = np.array([[2.0], [0.5]])
     self._sparse_float_indices1 = np.array([[0, 0]])
     self._sparse_float_values1 = np.array([-3.0])
     self._sparse_float_shape1 = np.array([2, 1])
@@ -153,7 +169,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
                        reduce_dim=False):
     return prediction_ops.gradient_trees_prediction(
         tree_ensemble_handle,
-        self._seed, [self._dense_float_tensor],
+        self._seed, [self._dense_float_tensor1],
         [self._sparse_float_indices1, self._sparse_float_indices2],
         [self._sparse_float_values1, self._sparse_float_values2],
         [self._sparse_float_shape1, self._sparse_float_shape2],
@@ -165,8 +181,27 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
         center_bias=center_bias,
         reduce_dim=reduce_dim)
 
+  def _get_predictions_oblivious_case(self,
+                                      tree_ensemble_handle,
+                                      learner_config,
+                                      apply_dropout=False,
+                                      apply_averaging=False,
+                                      center_bias=False,
+                                      reduce_dim=False):
+    return prediction_ops.gradient_trees_prediction(
+        tree_ensemble_handle,
+        self._seed, [
+            self._dense_float_tensor1, self._dense_float_tensor2,
+            self._dense_float_tensor3
+        ], [], [], [], [], [], [],
+        learner_config=learner_config,
+        apply_dropout=apply_dropout,
+        apply_averaging=apply_averaging,
+        center_bias=center_bias,
+        reduce_dim=reduce_dim)
+
   def testEmptyEnsemble(self):
-    with self.test_session():
+    with self.cached_session():
       # Empty tree ensenble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
 
@@ -189,7 +224,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[], []], dropout_info.eval())
 
   def testBiasEnsembleSingleClass(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree = tree_ensemble_config.trees.add()
       tree_ensemble_config.tree_metadata.add().is_finalized = True
@@ -217,7 +252,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[], []], dropout_info.eval())
 
   def testBiasEnsembleMultiClass(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree = tree_ensemble_config.trees.add()
       tree_ensemble_config.tree_metadata.add().is_finalized = True
@@ -247,7 +282,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[], []], dropout_info.eval())
 
   def testFullEnsembleSingleClass(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Bias tree.
       tree1 = tree_ensemble_config.trees.add()
@@ -295,7 +330,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       # Empty dropout.
       self.assertAllEqual([[], []], dropout_info.eval())
 
-  def testFullEnsembleWithMultidimensionalSparseSingleClass(self):
+  def testObliviousEnsemble(self):
     with self.test_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Bias tree.
@@ -305,6 +340,53 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
       # Depth 3 tree.
       tree2 = tree_ensemble_config.trees.add()
+      _set_float_oblivious_split(
+          tree2.nodes.add().oblivious_dense_float_binary_split, 0, 5.0)
+      _set_float_oblivious_split(
+          tree2.nodes.add().oblivious_dense_float_binary_split, 1, 3.0)
+      _set_float_oblivious_split(
+          tree2.nodes.add().oblivious_dense_float_binary_split, 2, 1.0)
+      for i in range(1, 9):
+        _append_to_leaf(tree2.nodes.add().leaf, 0, i / 10.0)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
+      result, dropout_info = self._get_predictions_oblivious_case(
+          tree_ensemble_handle,
+          learner_config=learner_config.SerializeToString(),
+          reduce_dim=True)
+
+      # The first example will get bias -0.4 from first tree and 0.6 from
+      # the 5th leaf of the second tree corresponding to node_id = 8, hence a
+      # prediction of 0.2.
+      # The second example will get bias -0.4 and 0.1 from the 0th leaf of the
+      # second tree corresponding to node_id = 3, hence a prediction of -0.3
+      self.assertAllClose([[0.2], [-0.3]], result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def testFullEnsembleWithMultidimensionalSparseSingleClass(self):
+    with self.cached_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree.
+      tree1 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)
+
+      # Depth 3 tree.
+      tree2 = tree_ensemble_config.trees.add()
       tree_ensemble_config.tree_metadata.add().is_finalized = True
       # Use feature column 2 (sparse multidimensional), split on first value
       # node 0.
@@ -358,7 +440,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
       result, dropout_info = prediction_ops.gradient_trees_prediction(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
+          self._seed, [self._dense_float_tensor1], [
               self._sparse_float_indices1, self._sparse_float_indices2,
               self._sparse_float_indices_m
           ], [
@@ -384,7 +466,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[], []], dropout_info.eval())
 
   def testExcludeNonFinalTree(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Bias tree.
       tree1 = tree_ensemble_config.trees.add()
@@ -431,7 +513,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[], []], dropout_info.eval())
 
   def testIncludeNonFinalTree(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Bias tree.
       tree1 = tree_ensemble_config.trees.add()
@@ -482,7 +564,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
   def testMetadataMissing(self):
     # Sometimes we want to do prediction on trees that are not added to ensemble
     # (for example in
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Bias tree.
       tree1 = tree_ensemble_config.trees.add()
@@ -530,7 +612,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
   # For TREE_PER_CLASS strategy, predictions size is num_classes-1
   def testFullEnsembleMultiClassTreePerClassStrategy(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Bias tree only for second class.
       tree1 = tree_ensemble_config.trees.add()
@@ -581,7 +663,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
   # This test is when leafs have SPARSE weights stored (class id and
   # contribution).
   def testFullEnsembleMultiNotClassTreePerClassStrategySparseVector(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Bias tree only for second class.
       tree1 = tree_ensemble_config.trees.add()
@@ -631,7 +713,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
   # will have the size of the number of classes.
   # This test is when leafs have DENSE weights stored (weight for each class)
   def testFullEnsembleMultiNotClassTreePerClassStrategyDenseVector(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Bias tree only for second class.
       tree1 = tree_ensemble_config.trees.add()
@@ -678,7 +760,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[], []], dropout_info.eval())
 
   def testDropout(self):
-    with self.test_session():
+    with self.cached_session():
       # Empty tree ensenble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 1000 trees with some weights.
@@ -741,7 +823,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
     # This is for normal non-batch mode where ensemble does not contain the tree
     # that is being built currently.
     num_trees = 10
-    with self.test_session():
+    with self.cached_session():
       # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 10 trees with some weights.
@@ -809,7 +891,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
     # This is batch mode where ensemble already contains the tree that we are
     # building. This tree should never be dropped.
     num_trees = 10
-    with self.test_session():
+    with self.cached_session():
       # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 10 trees with some weights.
@@ -877,7 +959,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
                           dropout_info_center[0][num_dropped_center - 1])
 
   def testDropoutSeed(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 10 trees with some weights.
       for i in range(0, 999):
@@ -917,7 +999,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       # Different seed.
       _, dropout_info_3 = prediction_ops.gradient_trees_prediction(
           tree_ensemble_handle,
-          112314, [self._dense_float_tensor],
+          112314, [self._dense_float_tensor1],
           [self._sparse_float_indices1, self._sparse_float_indices2],
           [self._sparse_float_values1, self._sparse_float_values2],
           [self._sparse_float_shape1, self._sparse_float_shape2],
@@ -950,7 +1032,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
           len(dropout_info_4.eval()[0]) + 1, len(dropout_info_1.eval()[0]))
 
   def testDropOutZeroProb(self):
-    with self.test_session():
+    with self.cached_session():
       # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Add 1000 trees with some weights.
@@ -993,7 +1075,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result.eval(), result_no_dropout.eval())
 
   def testAveragingAllTrees(self):
-    with self.test_session():
+    with self.cached_session():
       # Empty tree ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       adjusted_tree_ensemble_config = (
@@ -1057,7 +1139,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(dropout_info.eval(), pattern_dropout_info.eval())
 
   def testAveragingSomeTrees(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       adjusted_tree_ensemble_config = (
           tree_config_pb2.DecisionTreeEnsembleConfig())
@@ -1138,7 +1220,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(dropout_info_2.eval(), pattern_dropout_info.eval())
 
   def testAverageMoreThanNumTreesExist(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       adjusted_tree_ensemble_config = (
           tree_config_pb2.DecisionTreeEnsembleConfig())
@@ -1204,15 +1286,18 @@ class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
   def setUp(self):
     """Sets up the prediction tests.
 
-    Create a batch of two examples having one dense float, two sparse float and
-    one sparse int features.
+    Create a batch of two examples having three dense float, two sparse float
+    and one sparse int features.
     The data looks like the following:
-    | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 |
-    | 0        |  7     |    -3    |          |    9,1   |
-    | 1        | -2     |          | 4        |          |
+    |Instance |Dense0 |Dense1 |Dense2 |SparseF0 |SparseF1 |SparseI0 |
+    | 0       |  7    |  1    |  2    |   -3    |         |    9,1  |
+    | 1       | -2    |  2    |  0.5  |         |   4     |         |
+
     """
     super(PartitionExamplesOpsTest, self).setUp()
-    self._dense_float_tensor = np.array([[7.0], [-2.0]])
+    self._dense_float_tensor1 = np.array([[7.0], [-2.0]])
+    self._dense_float_tensor2 = np.array([[1.0], [2.0]])
+    self._dense_float_tensor3 = np.array([[2.0], [0.5]])
     self._sparse_float_indices1 = np.array([[0, 0]])
     self._sparse_float_values1 = np.array([-3.0])
     self._sparse_float_shape1 = np.array([2, 1])
@@ -1224,7 +1309,7 @@ class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
     self._sparse_int_shape1 = np.array([2, 2])
 
   def testEnsembleEmpty(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
 
       tree_ensemble_handle = model_ops.tree_ensemble_variable(
@@ -1234,17 +1319,17 @@ class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
 
       result = prediction_ops.gradient_trees_partition_examples(
-          tree_ensemble_handle, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1])
+          tree_ensemble_handle, [self._dense_float_tensor1],
+          [self._sparse_float_indices1, self._sparse_float_indices2],
+          [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1, self._sparse_float_shape2],
+          [self._sparse_int_indices1], [self._sparse_int_values1],
+          [self._sparse_int_shape1])
 
       self.assertAllEqual([0, 0], result.eval())
 
   def testTreeNonFinalized(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Depth 3 tree.
       tree1 = tree_ensemble_config.trees.add()
@@ -1269,17 +1354,17 @@ class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
 
       result = prediction_ops.gradient_trees_partition_examples(
-          tree_ensemble_handle, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1])
+          tree_ensemble_handle, [self._dense_float_tensor1],
+          [self._sparse_float_indices1, self._sparse_float_indices2],
+          [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1, self._sparse_float_shape2],
+          [self._sparse_int_indices1], [self._sparse_int_values1],
+          [self._sparse_int_shape1])
 
       self.assertAllEqual([5, 3], result.eval())
 
   def testTreeFinalized(self):
-    with self.test_session():
+    with self.cached_session():
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       # Depth 3 tree.
       tree1 = tree_ensemble_config.trees.add()
@@ -1304,15 +1389,51 @@ class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
 
       result = prediction_ops.gradient_trees_partition_examples(
-          tree_ensemble_handle, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1])
+          tree_ensemble_handle, [self._dense_float_tensor1],
+          [self._sparse_float_indices1, self._sparse_float_indices2],
+          [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1, self._sparse_float_shape2],
+          [self._sparse_int_indices1], [self._sparse_int_values1],
+          [self._sparse_int_shape1])
 
       self.assertAllEqual([0, 0], result.eval())
 
+  def testObliviousTreeNonFinalized(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Depth 3 tree.
+      tree1 = tree_ensemble_config.trees.add()
+      _set_float_oblivious_split(
+          tree1.nodes.add().oblivious_dense_float_binary_split, 0, 5.0)
+      _set_float_oblivious_split(
+          tree1.nodes.add().oblivious_dense_float_binary_split, 1, 3.0)
+      _set_float_oblivious_split(
+          tree1.nodes.add().oblivious_dense_float_binary_split, 2, 1.0)
+      for i in range(1, 9):
+        _append_to_leaf(tree1.nodes.add().leaf, 0, i / 10.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_metadata.add().is_finalized = False
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      result = prediction_ops.gradient_trees_partition_examples(
+          tree_ensemble_handle, [
+              self._dense_float_tensor1,
+              self._dense_float_tensor2,
+              self._dense_float_tensor3
+          ], [], [], [], [], [], [])
+
+      # The first example goes right, left, right in the tree and the second
+      # example goes lef, left, left. Since the depth of the tree is 3, the
+      # partition id's are as follows:
+      # First example: 3 + 5 = 8
+      # Second exampel: 3 + 0 = 3
+      self.assertAllEqual([8, 3], result.eval())
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index 074623699d..848c42b686 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -77,7 +77,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     example_weights = constant_op.constant(
         [10, 1, 1, 1, 1, 1], dtype=dtypes.float32)
 
-    with self.test_session():
+    with self.cached_session():
       config = self._gen_config(0.33, 3)
       dense_buckets, sparse_buckets = quantile_ops.quantile_buckets(
           [dense_float_tensor_0], [sparse_indices_0, sparse_indices_m],
@@ -107,7 +107,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     """
 
     num_quantiles = 3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=num_quantiles,
           epsilon=0.001, name="q1")
@@ -119,7 +119,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
         column=input_column,
         example_weights=weights)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for i in range(1, 23):
         # start = 1, 2, 4, 7, 11, 16 ... (see comment above)
         start = int((i * (i-1) / 2) + 1)
@@ -127,7 +127,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
                  {input_column: range(start, start+i),
                   weights: [1] * i})
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
       are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
       buckets, are_ready_flush = (sess.run(
@@ -142,7 +142,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     num_quantiles = 3
     # set generate_quantiles to True since the test will generate fewer
     # boundaries otherwise.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=num_quantiles,
           epsilon=0.001, name="q1", generate_quantiles=True)
@@ -154,7 +154,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
         column=input_column,
         example_weights=weights)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # This input is generated by integer in the range [2030, 2060]
       # but represented by with float16 precision. Integers <= 2048 are
       # exactly represented, whereas  numbers > 2048 are rounded; and hence
@@ -174,7 +174,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
                {input_column: inputs,
                 weights: [1] * len(inputs)})
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
       are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
       buckets, are_ready_flush = (sess.run(
@@ -189,7 +189,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
 
     # set generate_quantiles to True since the test will generate fewer
     # boundaries otherwise.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=num_quantiles,
           epsilon=0.001, name="q1", generate_quantiles=True)
@@ -201,12 +201,12 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
         column=input_column,
         example_weights=weights)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(update,
                {input_column: inputs,
                 weights: [1] * len(inputs)})
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
       are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
       buckets, are_ready_flush = (sess.run(
@@ -265,7 +265,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     [9900 9901 .. 9999]
     All the batches have 1 for all the example weights.
     """
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=3, epsilon=0.01, name="q1")
       resources.initialize_resources(resources.shared_resources()).run()
@@ -275,7 +275,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
         stamp_token=0,
         column=dense_placeholder,
         example_weights=weight_placeholder)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for i in range(100):
         dense_float = np.linspace(
             i * 100, (i + 1) * 100 - 1, num=100).reshape(-1, 1)
@@ -284,7 +284,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
             weight_placeholder: np.ones(shape=(100, 1), dtype=np.float32)
         })
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
       are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
       buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
@@ -301,7 +301,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     [9900 9901 .. 9999]
     All the batches have 1 for all the example weights.
     """
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=3, epsilon=0.01, name="q1")
       accumulator_2 = quantile_ops.QuantileAccumulator(
@@ -313,7 +313,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
         stamp_token=0,
         column=dense_placeholder,
         example_weights=weight_placeholder)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for i in range(100):
         dense_float = np.linspace(
             i * 100, (i + 1) * 100 - 1, num=100).reshape(-1, 1)
@@ -322,7 +322,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
             weight_placeholder: np.ones(shape=(100, 1), dtype=np.float32)
         })
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       summary = sess.run(
           accumulator.flush_summary(stamp_token=0, next_stamp_token=1))
       sess.run(
@@ -338,7 +338,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
 
@@ -366,7 +366,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(True, are_ready_flush)
       self.assertAllEqual([2, 4, 6.], buckets)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
       save = saver.Saver()
@@ -389,7 +389,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
 
@@ -413,7 +413,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([1, 3, 5], buckets)
       save.save(sess, save_path)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
       save = saver.Saver()
@@ -438,7 +438,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
         [1] * (int(math.pow(2, 16)) + 1), dtype=dtypes.float32)
     config = self._gen_config(0.1, 10)
 
-    with self.test_session():
+    with self.cached_session():
       dense_buckets, _ = quantile_ops.quantile_buckets(
           [dense_float_tensor_0], [], [], [],
           example_weights=example_weights,
@@ -464,7 +464,7 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
 
     config = self._gen_config(0.1, 10)
 
-    with self.test_session():
+    with self.cached_session():
       dense_buckets, _ = quantile_ops.quantile_buckets(
           [dense_float_tensor_0], [], [], [],
           example_weights=example_weights,
@@ -533,7 +533,7 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
     self._sparse_thresholds_m = [1, 2, 1000]
 
   def testDenseFeaturesOnly(self):
-    with self.test_session():
+    with self.cached_session():
       dense_quantiles, _ = quantile_ops.quantiles(
           [self._dense_float_tensor_0, self._dense_float_tensor_1], [],
           [self._dense_thresholds_0, self._dense_thresholds_1], [], [])
@@ -546,7 +546,7 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
                           dense_quantiles[1].eval())
 
   def testSparseFeaturesOnly(self):
-    with self.test_session():
+    with self.cached_session():
       _, sparse_quantiles = quantile_ops.quantiles([], [
           self._sparse_values_0, self._sparse_values_1, self._sparse_values_2,
           self._sparse_values_m
@@ -571,7 +571,7 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
                           sparse_quantiles[3].eval())
 
   def testDenseAndSparseFeatures(self):
-    with self.test_session():
+    with self.cached_session():
       dense_quantiles, sparse_quantiles = quantile_ops.quantiles(
           [self._dense_float_tensor_0, self._dense_float_tensor_1], [
               self._sparse_values_0, self._sparse_values_1,
@@ -602,14 +602,14 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
                           sparse_quantiles[3].eval())
 
   def testBucketizeWithInputBoundaries(self):
-    with self.test_session():
+    with self.cached_session():
       buckets = quantile_ops.bucketize_with_input_boundaries(
           input=[1, 2, 3, 4, 5],
           boundaries=[3])
       self.assertAllEqual([0, 0, 1, 1, 1], buckets.eval())
 
   def testBucketizeWithInputBoundaries2(self):
-    with self.test_session():
+    with self.cached_session():
       boundaries = constant_op.constant([3], dtype=dtypes.float32)
       buckets = quantile_ops.bucketize_with_input_boundaries(
           input=[1, 2, 3, 4, 5],
@@ -617,7 +617,7 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([0, 0, 1, 1, 1], buckets.eval())
 
   def testBucketizeWithInputBoundaries3(self):
-    with self.test_session():
+    with self.cached_session():
       b = array_ops.placeholder(dtypes.float32)
       buckets = quantile_ops.bucketize_with_input_boundaries(
           input=[1, 2, 3, 4, 5],
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
index 5cd37ec67e..5e62bad672 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
@@ -33,7 +33,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeDenseSplit(self):
     """Tests split handler op."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # The data looks like the following after dividing by number of steps (2).
       # Gradients    | Partition | Dense Quantile |
       # (1.2, 0.2)   | 0         | 0              |
@@ -59,7 +59,8 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
               min_node_weight=0,
               class_id=-1,
               feature_column_group_id=0,
-              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+              weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE))
       partitions, gains, splits = sess.run([partitions, gains, splits])
     self.assertAllEqual([0, 1], partitions)
 
@@ -110,7 +111,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeMulticlassDenseSplit(self):
     """Tests split handler op."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       partition_ids = array_ops.constant([0, 0, 1], dtype=dtypes.int32)
       bucket_ids = array_ops.constant(
           [[0, 0], [1, 0], [1, 0]], dtype=dtypes.int64)
@@ -132,7 +133,8 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
               min_node_weight=0,
               class_id=-1,
               feature_column_group_id=0,
-              multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN))
+              multiclass_strategy=learner_pb2.LearnerConfig.FULL_HESSIAN,
+              weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE))
       partitions, gains, splits = sess.run([partitions, gains, splits])
     self.assertAllEqual([0, 1], partitions)
 
@@ -151,7 +153,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeDenseSplitEmptyInputs(self):
     """Tests empty inputs op."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       partition_ids = array_ops.constant([], dtype=dtypes.int32)
       bucket_ids = array_ops.constant([[]], dtype=dtypes.int64)
       gradients = array_ops.constant([])
@@ -171,7 +173,8 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
               min_node_weight=0,
               class_id=-1,
               feature_column_group_id=0,
-              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
+              weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE))
       partitions, gains, splits = sess.run([partitions, gains, splits])
     # .assertEmpty doesn't exist on ubuntu-contrib
     self.assertEqual(0, len(partitions))
@@ -180,7 +183,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeSparseSplit(self):
     """Tests split handler op."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # The data looks like the following after dividing by number of steps (2).
       # Gradients    | Partition | bucket ID       |
       # (0.9, 0.39)  | 0         | -1              |
@@ -271,7 +274,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeSparseSplitAllEmptyDimensions(self):
     """Tests split handler op when all dimensions have only bias bucket id."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # The data looks like the following after dividing by number of steps (2).
       # Gradients    | Partition | Dimension | bucket ID       |
       # (0.9, 0.39)  | 0         |    0      |  -1             |
@@ -304,7 +307,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeSparseMultidimensionalSplit(self):
     """Tests split handler op."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Num of steps is 2.
       # The feature column is three dimensional.
       # First dimension has bias bucket only, the second has bias bucket and
@@ -405,7 +408,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
     """Tests default direction is stable when no sparsity."""
     random.seed(1123)
     for _ in range(50):
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         grad = random.random()
         hessian = random.random()
         # The data looks like the following (divide by the num of steps 2).
@@ -462,7 +465,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeMulticlassSparseSplit(self):
     """Tests split handler op."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       partition_ids = array_ops.constant([0, 0, 0, 1, 1], dtype=dtypes.int32)
     bucket_ids = array_ops.constant(
         [[-1, 0], [0, 0], [1, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
@@ -511,7 +514,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeCategoricalEqualitySplit(self):
     """Tests split handler op for categorical equality split."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # The data looks like the following after dividing by number of steps (2).
       # Gradients    | Partition | Feature ID     |
       # (0.9, 0.39)  | 0         | -1             |
@@ -605,7 +608,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
   def testMakeMulticlassCategoricalEqualitySplit(self):
     """Tests split handler op for categorical equality split in multiclass."""
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       gradients = array_ops.constant([[1.8, 3.5], [2.4, 1.0], [0.4, 4.0],
                                       [9.0, 3.1], [3.0, 0.8]])
 
@@ -652,7 +655,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(1, split_node.feature_id)
 
   def testMakeCategoricalEqualitySplitEmptyInput(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       gradients = []
       hessians = []
       partition_ids = []
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
index 978bf530cd..05ce0884cc 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
@@ -29,7 +29,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
   """Tests for scalar gradients and hessians accumulator."""
 
   def testSimpleAcculumator(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
@@ -57,7 +57,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result[(2, 3, 0)], [0.3, 0.4])
 
   def testMultidimensionalAcculumator(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
@@ -86,7 +86,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result[(2, 3, 1)], [0.1, 0.2])
 
   def testDropStaleUpdate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
@@ -118,7 +118,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result[(2, 3, 0)], [0.3, 0.4])
 
   def testSerialize(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
@@ -159,7 +159,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
       self.assertEqual(0, stamp_token)
 
   def testDeserialize(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
@@ -196,7 +196,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result[(4, 6, 2)], [0.5, 0.7])
 
   def testMakeSummary(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
@@ -218,7 +218,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
   """Tests for tensor gradients and hessians accumulator."""
 
   def testSimpleAcculumator(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
@@ -256,7 +256,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result[(2, 3, 0)][1], [[0.05, 0.06], [0.07, 0.08]])
 
   def testMultidimensionalAcculumator(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
@@ -294,7 +294,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result[(2, 3, 1)][1], [[0.05, 0.06], [0.07, 0.08]])
 
   def testDropStaleUpdate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
@@ -331,7 +331,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result[(2, 3, 0)][1], [[0.05, 0.06], [0.07, 0.08]])
 
   def testSerialize(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
@@ -381,7 +381,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(result_1[2, 3, 0][1], result_2[2, 3, 0][1])
 
   def testDeserialize(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
@@ -425,7 +425,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
       self.assertAllClose(result[(4, 5, 0)][1], [[0.07, 0.08], [0.09, 0.10]])
 
   def testMakeSummary(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = stats_accumulator_ops.StatsAccumulator(
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
index e39e1de8d1..b3e4c2e5f7 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
@@ -91,6 +91,31 @@ def _gen_dense_split_info(fc, threshold, left_weight, right_weight):
   return split.SerializeToString()
 
 
+def _gen_dense_oblivious_split_info(fc, threshold, leave_weights,
+                                    children_parent_id):
+  split_str = """
+    split_node {
+      oblivious_dense_float_binary_split {
+        feature_column: %d
+        threshold: %f
+      }
+    }""" % (fc, threshold)
+  for weight in leave_weights:
+    split_str += """
+    children {
+      vector {
+        value: %f
+      }
+    }""" % (
+        weight)
+  for x in children_parent_id:
+    split_str += """
+    children_parent_id: %d""" % (x)
+  split = split_info_pb2.ObliviousSplitInfo()
+  text_format.Merge(split_str, split)
+  return split.SerializeToString()
+
+
 def _gen_categorical_split_info(fc, feat_id, left_weight, right_weight):
   split_str = """
     split_node {
@@ -125,7 +150,7 @@ class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase):
 
   def testCenterBias(self):
     """Tests bias centering for multiple iterations."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create empty ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree_ensemble_handle = model_ops.tree_ensemble_variable(
@@ -276,7 +301,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
   def testGrowEmptyEnsemble(self):
     """Test growing an empty ensemble."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create empty ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree_ensemble_handle = model_ops.tree_ensemble_variable(
@@ -324,7 +349,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the simpler split from handler 1 to be chosen.
@@ -383,9 +409,122 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(stats.attempted_layers, 1)
       self.assertProtoEquals(expected_result, tree_ensemble_config)
 
+  def testGrowEmptyEnsembleObliviousCase(self):
+    """Test growing an empty ensemble in the oblivious case."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+
+      # Prepare handler inputs.
+      # Note that handlers 1 & 3 have the same gain but different splits.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([7.62], dtype=np.float32)
+      handler1_split = [
+          _gen_dense_oblivious_split_info(0, 0.52, [-4.375, 7.143], [0])
+      ]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([0.63], dtype=np.float32)
+      handler2_split = [
+          _gen_dense_oblivious_split_info(0, 0.23, [-0.6, 0.24], [0])
+      ]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([7.62], dtype=np.float32)
+      handler3_split = [
+          _gen_dense_oblivious_split_info(0, 7, [-4.375, 7.143], [0])
+      ]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config.SerializeToString(),
+          dropout_seed=123,
+          center_bias=True,
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.OBLIVIOUS_DECISION_TREE)
+      session.run(grow_op)
+
+      # Expect the split with bigger handler_id, i.e. handler 3 to be chosen.
+      # The grown tree should be finalized as max tree depth is 1.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 0
+              threshold: 7
+            }
+            node_metadata {
+              gain: 7.62
+              original_oblivious_leaves {
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 7.143
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_layers, 1)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 1)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
   def testGrowExistingEnsembleTreeNotFinalized(self):
     """Test growing an existing ensemble with the last tree not finalized."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create existing ensemble with one root split
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       text_format.Merge("""
@@ -476,7 +615,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the split for partition 1 to be chosen from handler 1 and
@@ -575,7 +715,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
   def testGrowExistingEnsembleTreeFinalized(self):
     """Test growing an existing ensemble with the last tree finalized."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create existing ensemble with one root split
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       text_format.Merge("""
@@ -661,7 +801,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect a new tree to be added with the split from handler 1.
@@ -757,7 +898,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
   def testGrowEnsemblePrePrune(self):
     """Test growing an ensemble with pre-pruning."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create empty ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree_ensemble_handle = model_ops.tree_ensemble_variable(
@@ -798,7 +939,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the ensemble to be empty.
@@ -823,7 +965,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
   def testGrowEnsemblePostPruneNone(self):
     """Test growing an empty ensemble."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create empty ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree_ensemble_handle = model_ops.tree_ensemble_variable(
@@ -869,7 +1011,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the simpler split from handler 1 to be chosen.
@@ -930,7 +1073,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
   def testGrowEnsemblePostPruneAll(self):
     """Test growing an ensemble with post-pruning."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create empty ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree_ensemble_handle = model_ops.tree_ensemble_variable(
@@ -971,7 +1114,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the split from handler 2 to be chosen despite the negative gain.
@@ -1053,7 +1197,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the ensemble to be empty as post-pruning will prune
@@ -1079,7 +1224,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
   def testGrowEnsemblePostPrunePartial(self):
     """Test growing an ensemble with post-pruning."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create empty ensemble.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       tree_ensemble_handle = model_ops.tree_ensemble_variable(
@@ -1120,7 +1265,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the split from handler 2 to be chosen despite the negative gain.
@@ -1200,7 +1346,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the negative gain split of partition 1 to be pruned and the
@@ -1280,7 +1427,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
   def testGrowEnsembleTreeLayerByLayer(self):
     """Test growing an existing ensemble with the last tree not finalized."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create existing ensemble with one root split
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       text_format.Merge("""
@@ -1371,7 +1518,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect the split for partition 1 to be chosen from handler 1 and
@@ -1470,9 +1618,721 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(stats.attempted_layers, 2)
       self.assertProtoEquals(expected_result, tree_ensemble_config)
 
+  def testGrowEnsembleTreeLayerByLayerObliviousCase(self):
+    """Test growing an existing ensemble with the last tree not finalized."""
+    with self.test_session() as session:
+      # Create existing ensemble with one root split
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 4
+              threshold: 7
+            }
+            node_metadata {
+              gain: 7.62
+              original_oblivious_leaves {
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 7.143
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -4.375
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=3,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.LAYER_BY_LAYER)
+
+      # Prepare handler inputs.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([1.4], dtype=np.float32)
+      handler1_split = [
+          _gen_dense_oblivious_split_info(0, 0.21, [-6.0, 1.65, 1.0, -0.5],
+                                          [1, 2])
+      ]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([2.7], dtype=np.float32)
+      handler2_split = [
+          _gen_dense_oblivious_split_info(0, 0.23, [-0.6, 0.24, 0.3, 0.4],
+                                          [1, 2])
+      ]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([1.7], dtype=np.float32)
+      handler3_split = [
+          _gen_dense_oblivious_split_info(0, 3, [-0.75, 1.93, 0.2, -0.1],
+                                          [1, 2])
+      ]
+
+      # Grow tree ensemble layer by layer.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config.SerializeToString(),
+          dropout_seed=123,
+          center_bias=True,
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.OBLIVIOUS_DECISION_TREE)
+      session.run(grow_op)
+
+      # Expect the split for partition 1 to be chosen from handler 1 and
+      # the split for partition 2 to be chosen from handler 2.
+      # The grown tree should not be finalized as max tree depth is 3 and
+      # it's only grown 2 layers.
+      # The partition 1 split weights get added to original leaf weight 7.143.
+      # The partition 2 split weights get added to original leaf weight -4.375.
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 4
+              threshold: 7
+            }
+            node_metadata {
+              gain: 7.62
+              original_oblivious_leaves {
+              }
+            }
+          }
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 0
+              threshold: 0.23
+            }
+            node_metadata {
+              gain: 2.7
+              original_oblivious_leaves {
+                vector {
+                  value: 7.143
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -4.375
+                }
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 6.543
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 7.383
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -4.075
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -3.975
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 2)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 2)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 2)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+  def testGrowEnsembleWithEmptyNodesMiddleCase(self):
+    """Test case: The middle existing leaves don't have examples."""
+    with self.test_session() as session:
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 4
+              threshold: 7
+            }
+            node_metadata {
+              gain: 7.62
+              original_oblivious_leaves {
+              }
+            }
+          }
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 1
+              threshold: 0.23
+            }
+            node_metadata {
+              gain: 2.7
+              original_oblivious_leaves {
+                vector {
+                  value: 7.143
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -4.375
+                }
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 6.543
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 7.5
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -4.075
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -3.975
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=6,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.LAYER_BY_LAYER)
+
+      # Prepare handler inputs.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([1.8], dtype=np.float32)
+      handler1_split = [
+          _gen_dense_oblivious_split_info(0, 0.9, [1.0, 2.0, 3.0, 4.0], [2, 5])
+      ]
+      # The tree currently has depth 2, so the ids for the four leaves are in
+      # the range [2, 6). In this test case we are assuming that our examples
+      # only fall in leaves 2 and 5.
+
+      # Grow tree ensemble layer by layer.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[handler1_partitions],
+          gains=[handler1_gains],
+          splits=[handler1_split],
+          learner_config=learner_config.SerializeToString(),
+          dropout_seed=123,
+          center_bias=True,
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.OBLIVIOUS_DECISION_TREE)
+      session.run(grow_op)
+
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 4
+              threshold: 7
+            }
+            node_metadata {
+              gain: 7.62
+              original_oblivious_leaves {
+              }
+            }
+          }
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 1
+              threshold: 0.23
+            }
+            node_metadata {
+              gain: 2.7
+              original_oblivious_leaves {
+                vector {
+                  value: 7.143
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -4.375
+                }
+              }
+            }
+          }
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 0
+              threshold: 0.9
+            }
+            node_metadata {
+              gain: 1.8
+              original_oblivious_leaves {
+                vector {
+                  value: 6.543
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: 7.5
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -4.075
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -3.975
+                }
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 7.543
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 8.543
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 7.5
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 7.5
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -4.075
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -4.075
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -0.975
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 0.025
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 3
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 3
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 3)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 3)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 3)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
+  def testGrowEnsembleWithEmptyNodesBorderCase(self):
+    """Test case: The first and last existing leaves don't have examples."""
+    with self.test_session() as session:
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 4
+              threshold: 7
+            }
+            node_metadata {
+              gain: 7.62
+              original_oblivious_leaves {
+              }
+            }
+          }
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 1
+              threshold: 0.23
+            }
+            node_metadata {
+              gain: 2.7
+              original_oblivious_leaves {
+                vector {
+                  value: 7.143
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -4.375
+                }
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 6.543
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 7.5
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -4.075
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -3.975
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=6,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.LAYER_BY_LAYER)
+
+      # Prepare handler inputs.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([1.8], dtype=np.float32)
+      handler1_split = [
+          _gen_dense_oblivious_split_info(0, 0.9, [1.0, 2.0, 3.0, 4.0], [3, 4])
+      ]
+      # The tree currently has depth 2, so the ids for the four leaves are in
+      # the range [2, 6). In this test case we are assuming that our examples
+      # only fall in leaves 3 and 4.
+
+      # Grow tree ensemble layer by layer.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=0.1,
+          partition_ids=[handler1_partitions],
+          gains=[handler1_gains],
+          splits=[handler1_split],
+          learner_config=learner_config.SerializeToString(),
+          dropout_seed=123,
+          center_bias=True,
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.OBLIVIOUS_DECISION_TREE)
+      session.run(grow_op)
+
+      new_stamp, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      stats = session.run(
+          training_ops.tree_ensemble_stats(tree_ensemble_handle, stamp_token=1))
+      tree_ensemble_config.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 4
+              threshold: 7
+            }
+            node_metadata {
+              gain: 7.62
+              original_oblivious_leaves {
+              }
+            }
+          }
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 1
+              threshold: 0.23
+            }
+            node_metadata {
+              gain: 2.7
+              original_oblivious_leaves {
+                vector {
+                  value: 7.143
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -4.375
+                }
+              }
+            }
+          }
+          nodes {
+            oblivious_dense_float_binary_split {
+              feature_column: 0
+              threshold: 0.9
+            }
+            node_metadata {
+              gain: 1.8
+              original_oblivious_leaves {
+                vector {
+                  value: 6.543
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: 7.5
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -4.075
+                }
+              }
+              original_oblivious_leaves {
+                vector {
+                  value: -3.975
+                }
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 6.543
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 6.543
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 8.5
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: 9.5
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -1.075
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -0.075
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -3.975
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector {
+                value: -3.975
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 3
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 3
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertEqual(stats.num_trees, 0)
+      self.assertEqual(stats.num_layers, 3)
+      self.assertEqual(stats.active_tree, 1)
+      self.assertEqual(stats.active_layer, 3)
+      self.assertEqual(stats.attempted_trees, 1)
+      self.assertEqual(stats.attempted_layers, 3)
+      self.assertProtoEquals(expected_result, tree_ensemble_config)
+
   def testGrowExistingEnsembleTreeFinalizedWithDropout(self):
     """Test growing an existing ensemble with the last tree finalized."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create existing ensemble with one root split and one bias tree.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       text_format.Merge("""
@@ -1575,7 +2435,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       # Expect a new tree to be added with the split from handler 1.
@@ -1596,7 +2457,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
 
   def testGrowExistingEnsembleTreeWithFeatureSelectionUsedHandlers(self):
     """Test growing a tree with feature selection."""
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Create existing ensemble with one root split and one bias tree.
       tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
       text_format.Merge("""
@@ -1700,7 +2561,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           learner_config=learner_config.SerializeToString(),
           dropout_seed=123,
           center_bias=True,
-          max_tree_depth=learner_config.constraints.max_tree_depth)
+          max_tree_depth=learner_config.constraints.max_tree_depth,
+          weak_learner_type=learner_pb2.LearnerConfig.NORMAL_DECISION_TREE)
       session.run(grow_op)
 
       _, serialized = session.run(
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index d0d1249bd6..97743ba255 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -218,6 +218,21 @@ def extract_features(features, feature_columns, use_core_columns):
   sparse_int_shapes = []
   for key in sorted(features.keys()):
     tensor = features[key]
+    # TODO(nponomareva): consider iterating over feature columns instead.
+    if isinstance(tensor, tuple):
+      # Weighted categorical feature.
+      categorical_tensor = tensor[0]
+      weight_tensor = tensor[1]
+
+      shape = categorical_tensor.dense_shape
+      indices = array_ops.concat([
+          array_ops.slice(categorical_tensor.indices, [0, 0], [-1, 1]),
+          array_ops.expand_dims(
+              math_ops.to_int64(categorical_tensor.values), -1)
+      ], 1)
+      tensor = sparse_tensor.SparseTensor(
+          indices=indices, values=weight_tensor.values, dense_shape=shape)
+
     if isinstance(tensor, sparse_tensor.SparseTensor):
       if tensor.values.dtype == dtypes.float32:
         sparse_float_names.append(key)
@@ -672,6 +687,8 @@ class GradientBoostedDecisionTreeModel(object):
         self._learner_config.constraints.min_node_weight, dtypes.float32)
     loss_uses_sum_reduction = self._loss_reduction == losses.Reduction.SUM
     loss_uses_sum_reduction = constant_op.constant(loss_uses_sum_reduction)
+    weak_learner_type = constant_op.constant(
+        self._learner_config.weak_learner_type)
     epsilon = 0.01
     num_quantiles = 100
     strategy_tensor = constant_op.constant(strategy)
@@ -696,6 +713,7 @@ class GradientBoostedDecisionTreeModel(object):
                 multiclass_strategy=strategy_tensor,
                 init_stamp_token=init_stamp_token,
                 loss_uses_sum_reduction=loss_uses_sum_reduction,
+                weak_learner_type=weak_learner_type,
             ))
         fc_name_idx += 1
 
@@ -1058,7 +1076,8 @@ class GradientBoostedDecisionTreeModel(object):
             learner_config=self._learner_config_serialized,
             dropout_seed=dropout_seed,
             center_bias=self._center_bias,
-            max_tree_depth=self._max_tree_depth)
+            max_tree_depth=self._max_tree_depth,
+            weak_learner_type=self._learner_config.weak_learner_type)
 
       def _grow_ensemble_not_ready_fn():
         # Don't grow the ensemble, just update the stamp.
@@ -1073,7 +1092,8 @@ class GradientBoostedDecisionTreeModel(object):
             learner_config=self._learner_config_serialized,
             dropout_seed=dropout_seed,
             center_bias=self._center_bias,
-            max_tree_depth=self._max_tree_depth)
+            max_tree_depth=self._max_tree_depth,
+            weak_learner_type=self._learner_config.weak_learner_type)
 
       def _grow_ensemble_fn():
         # Conditionally grow an ensemble depending on whether the splits
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index e92f0bb841..150d734db6 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -34,6 +34,9 @@ Checkpointable data structures:
 
 Checkpoint management:
 @@CheckpointManager
+
+Saving and restoring Python state:
+@@NumpyState
 """
 
 from __future__ import absolute_import
@@ -41,6 +44,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.checkpoint.python.containers import UniqueNameTracker
+from tensorflow.contrib.checkpoint.python.python_state import NumpyState
 from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
 from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
 from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index 7b200a29bf..ada4168726 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -9,6 +9,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":containers",
+        ":python_state",
         ":split_dependency",
         ":visualize",
         "//tensorflow/python/training/checkpointable:data_structures",
@@ -41,6 +42,33 @@ py_test(
 )
 
 py_library(
+    name = "python_state",
+    srcs = ["python_state.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python/training/checkpointable:base",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "python_state_test",
+    srcs = ["python_state_test.py"],
+    deps = [
+        ":python_state",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:session",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/checkpointable:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
     name = "split_dependency",
     srcs = ["split_dependency.py"],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/contrib/checkpoint/python/python_state.py b/tensorflow/contrib/checkpoint/python/python_state.py
new file mode 100644
index 0000000000..9b11035b6d
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/python_state.py
@@ -0,0 +1,166 @@
+"""Utilities for including Python state in TensorFlow checkpoints."""
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import numpy
+
+from tensorflow.python.training.checkpointable import base
+
+# pylint: disable=g-import-not-at-top
+try:
+  # In Python 2.x, use the faster string buffering option.
+  from cStringIO import StringIO as BytesIO
+except ImportError:
+  from io import BytesIO
+# pylint: enable=g-import-not-at-top
+
+
+class NumpyState(base.CheckpointableBase):
+  """A checkpointable object whose NumPy array attributes are saved/restored.
+
+  Example usage:
+
+  ```python
+  arrays = tf.contrib.checkpoint.NumpyState()
+  checkpoint = tf.train.Checkpoint(numpy_arrays=arrays)
+  arrays.x = numpy.zeros([3, 4])
+  save_path = checkpoint.save("/tmp/ckpt")
+  arrays.x[1, 1] = 4.
+  checkpoint.restore(save_path)
+  assert (arrays.x == numpy.zeros([3, 4])).all()
+
+  second_checkpoint = tf.train.Checkpoint(
+      numpy_arrays=tf.contrib.checkpoint.NumpyState())
+  # Attributes of NumpyState objects are created automatically by restore()
+  second_checkpoint.restore(save_path)
+  assert (second_checkpoint.numpy_arrays.x == numpy.zeros([3, 4])).all()
+  ```
+
+  Note that `NumpyState` objects re-create the attributes of the previously
+  saved object on `restore()`. This is in contrast to TensorFlow variables, for
+  which a `Variable` object must be created and assigned to an attribute.
+
+  This snippet works both when graph building and when executing eagerly. On
+  save, the NumPy array(s) are fed as strings to be saved in the checkpoint (via
+  a placeholder when graph building, or as a string constant when executing
+  eagerly). When restoring they skip the TensorFlow graph entirely, and so no
+  restore ops need be run. This means that restoration always happens eagerly,
+  rather than waiting for `checkpoint.restore(...).run_restore_ops()` like
+  TensorFlow variables when graph building.
+  """
+
+  def _lookup_dependency(self, name):
+    """Create placeholder NumPy arrays for to-be-restored attributes.
+
+    Typically `_lookup_dependency` is used to check by name whether a dependency
+    exists. We cheat slightly by creating a checkpointable object for `name` if
+    we don't already have one, giving us attribute re-creation behavior when
+    loading a checkpoint.
+
+    Args:
+      name: The name of the dependency being checked.
+    Returns:
+      An existing dependency if one exists, or a new `_NumpyWrapper` placeholder
+      dependency (which will generally be restored immediately).
+    """
+    value = super(NumpyState, self)._lookup_dependency(name)
+    if value is None:
+      value = _NumpyWrapper(numpy.array([]))
+      new_reference = base.CheckpointableReference(name=name, ref=value)
+      self._unconditional_checkpoint_dependencies.append(new_reference)
+      self._unconditional_dependency_names[name] = value
+      super(NumpyState, self).__setattr__(name, value)
+    return value
+
+  def __getattribute__(self, name):
+    """Un-wrap `_NumpyWrapper` objects when accessing attributes."""
+    value = super(NumpyState, self).__getattribute__(name)
+    if isinstance(value, _NumpyWrapper):
+      return value.array
+    return value
+
+  def __setattr__(self, name, value):
+    """Automatically wrap NumPy arrays assigned to attributes."""
+    # TODO(allenl): Consider supporting lists/tuples, either ad-hoc or by making
+    # ndarrays checkpointable natively and using standard checkpointable list
+    # tracking.
+    if isinstance(value, numpy.ndarray):
+      try:
+        existing = super(NumpyState, self).__getattribute__(name)
+        existing.array = value
+        return
+      except AttributeError:
+        value = _NumpyWrapper(value)
+        self._track_checkpointable(value, name=name, overwrite=True)
+    elif (name not in ("_setattr_tracking", "_update_uid")
+          and getattr(self, "_setattr_tracking", True)):
+      # Mixing restore()-created attributes with user-added checkpointable
+      # objects is tricky, since we can't use the `_lookup_dependency` trick to
+      # re-create attributes (we might accidentally steal the restoration for
+      # another checkpointable object). For now `NumpyState` objects must be
+      # leaf nodes. Theoretically we could add some extra arguments to
+      # `_lookup_dependency` to figure out whether we should create a NumPy
+      # array for the attribute or not.
+      raise NotImplementedError(
+          ("Assigned %s to the %s property of %s, which is not a NumPy array. "
+           "Currently mixing NumPy arrays and other checkpointable objects is "
+           "not supported. File a feature request if this limitation bothers "
+           "you.")
+          % (value, name, self))
+    super(NumpyState, self).__setattr__(name, value)
+
+
+class _NumpyWrapper(base.CheckpointableBase):
+  """Wraps a NumPy array for storage in an object-based checkpoint."""
+
+  def __init__(self, array):
+    """Specify a NumPy array to wrap.
+
+    Args:
+      array: The NumPy array to save and restore (may be overwritten).
+    """
+    self.array = array
+
+  def _serialize(self):
+    """Callback for `PythonStringStateSaveable` to serialize the array."""
+    string_file = BytesIO()
+    try:
+      numpy.save(string_file, self.array, allow_pickle=False)
+      serialized = string_file.getvalue()
+    finally:
+      string_file.close()
+    return serialized
+
+  def _deserialize(self, string_value):
+    """Callback for `PythonStringStateSaveable` to deserialize the array."""
+    string_file = BytesIO(string_value)
+    try:
+      self.array = numpy.load(string_file, allow_pickle=False)
+    finally:
+      string_file.close()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Specify callbacks for saving and restoring `array`."""
+    return {
+        "array": functools.partial(
+            base.PythonStringStateSaveable,
+            state_callback=self._serialize,
+            restore_callback=self._deserialize)
+        }
diff --git a/tensorflow/contrib/checkpoint/python/python_state_test.py b/tensorflow/contrib/checkpoint/python/python_state_test.py
new file mode 100644
index 0000000000..0439a4755e
--- /dev/null
+++ b/tensorflow/contrib/checkpoint/python/python_state_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy
+
+from tensorflow.contrib.checkpoint.python import python_state
+from tensorflow.python.client import session
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.training.checkpointable import util
+
+
+class NumpyStateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSaveRestoreNumpyState(self):
+    directory = self.get_temp_dir()
+    prefix = os.path.join(directory, "ckpt")
+    save_state = python_state.NumpyState()
+    saver = util.Checkpoint(numpy=save_state)
+    save_state.a = numpy.ones([2, 2])
+    save_state.b = numpy.ones([2, 2])
+    save_state.b = numpy.zeros([2, 2])
+    self.assertAllEqual(numpy.ones([2, 2]), save_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), save_state.b)
+    first_save_path = saver.save(prefix)
+    save_state.a[1, 1] = 2.
+    second_save_path = saver.save(prefix)
+
+    load_state = python_state.NumpyState()
+    loader = util.Checkpoint(numpy=load_state)
+    loader.restore(first_save_path).initialize_or_restore()
+    self.assertAllEqual(numpy.ones([2, 2]), load_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), load_state.b)
+    load_state.a[0, 0] = 42.
+    self.assertAllEqual([[42., 1.], [1., 1.]], load_state.a)
+    loader.restore(first_save_path).run_restore_ops()
+    self.assertAllEqual(numpy.ones([2, 2]), load_state.a)
+    loader.restore(second_save_path).run_restore_ops()
+    self.assertAllEqual([[1., 1.], [1., 2.]], load_state.a)
+    self.assertAllEqual(numpy.zeros([2, 2]), load_state.b)
+
+  def testNoGraphPollution(self):
+    graph = ops.Graph()
+    with graph.as_default(), session.Session():
+      directory = self.get_temp_dir()
+      prefix = os.path.join(directory, "ckpt")
+      save_state = python_state.NumpyState()
+      saver = util.Checkpoint(numpy=save_state)
+      save_state.a = numpy.ones([2, 2])
+      save_path = saver.save(prefix)
+      saver.restore(save_path)
+      graph.finalize()
+      saver.save(prefix)
+      save_state.a = numpy.zeros([2, 2])
+      saver.save(prefix)
+      saver.restore(save_path)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoMixedNumpyStateTF(self):
+    save_state = python_state.NumpyState()
+    save_state.a = numpy.ones([2, 2])
+    with self.assertRaises(NotImplementedError):
+      save_state.v = variables.Variable(1.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDocstringExample(self):
+    arrays = python_state.NumpyState()
+    checkpoint = util.Checkpoint(numpy_arrays=arrays)
+    arrays.x = numpy.zeros([3, 4])
+    save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt"))
+    arrays.x[1, 1] = 4.
+    checkpoint.restore(save_path)
+    self.assertAllEqual(numpy.zeros([3, 4]), arrays.x)
+
+    second_checkpoint = util.Checkpoint(numpy_arrays=python_state.NumpyState())
+    second_checkpoint.restore(save_path)
+    self.assertAllEqual(numpy.zeros([3, 4]), second_checkpoint.numpy_arrays.x)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
index 58fadffce3..e57a66b99f 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
@@ -33,7 +33,7 @@ bool IsPartitionEmpty(const BigQueryTablePartition& partition) {
 
 Status ParseJson(StringPiece json, Json::Value* result) {
   Json::Reader reader;
-  if (!reader.parse(json.ToString(), *result)) {
+  if (!reader.parse(string(json), *result)) {
     return errors::Internal("Couldn't parse JSON response from BigQuery.");
   }
   return Status::OK();
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
index 1af43a3e10..f1fcaff73b 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
-#define TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
+#ifndef TENSORFLOW_CONTRIB_CLOUD_KERNELS_BIGQUERY_TABLE_ACCESSOR_H_
+#define TENSORFLOW_CONTRIB_CLOUD_KERNELS_BIGQUERY_TABLE_ACCESSOR_H_
 
 #include <map>
 #include <memory>
@@ -198,4 +198,4 @@ class BigQueryTableAccessor {
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
+#endif  // TENSORFLOW_CONTRIB_CLOUD_KERNELS_BIGQUERY_TABLE_ACCESSOR_H_
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
index fea6b15640..6f4d54ae4a 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
-#define TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#ifndef TENSORFLOW_CONTRIB_CLOUD_KERNELS_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#define TENSORFLOW_CONTRIB_CLOUD_KERNELS_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
 
 #include <string>
 
@@ -401,4 +401,4 @@ const string kTestEmptyRow = R"({
 }  // namespace
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#endif  // TENSORFLOW_CONTRIB_CLOUD_KERNELS_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
diff --git a/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
index 95e7e744d3..cb45e42734 100644
--- a/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
+++ b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import json
+import os
 
 from tensorflow.contrib.cloud.python.ops import gen_gcs_config_ops
 from tensorflow.python.framework import dtypes
@@ -188,6 +189,8 @@ def configure_colab_session(session):
     session: A `tf.Session` session.
   """
   # Read from the application default credentials (adc).
-  with open('/content/datalab/adc.json') as f:
+  adc_filename = os.environ.get(
+      'GOOGLE_APPLICATION_CREDENTIALS', '/content/adc.json')
+  with open(adc_filename) as f:
     data = json.load(f)
   configure_gcs(session, credentials=data)
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index 1d638e6402..479609458c 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,16 +16,16 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 1.20.0)
+set(nsync_TAG 1.20.1)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
 if(WIN32)
   set(nsync_HEADERS "${nsync_BUILD}/public/*.h")
-  set(nsync_STATIC_LIBRARIES ${nsync_INSTALL}/lib/nsync.lib)
+  set(nsync_STATIC_LIBRARIES ${nsync_INSTALL}/lib/nsync_cpp.lib)
 else()
   set(nsync_HEADERS "${nsync_BUILD}/public/*.h")
-  set(nsync_STATIC_LIBRARIES ${nsync_INSTALL}/lib/libnsync.a)
+  set(nsync_STATIC_LIBRARIES ${nsync_INSTALL}/lib/libnsync_cpp.a)
 endif()
 
 ExternalProject_Add(nsync
@@ -35,12 +35,12 @@ ExternalProject_Add(nsync
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
     BUILD_BYPRODUCTS ${nsync_STATIC_LIBRARIES}
-    PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/nsync/CMakeLists.txt ${nsync_BUILD}
     INSTALL_DIR ${nsync_INSTALL}
     CMAKE_CACHE_ARGS
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${nsync_INSTALL}
+        -DCMAKE_INSTALL_LIBDIR:STRING=lib
     -DNSYNC_LANGUAGE:STRING=c++11)
 
 set(nsync_HEADERS
diff --git a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
deleted file mode 100644
index 6f059c7225..0000000000
--- a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
+++ /dev/null
@@ -1,325 +0,0 @@
-cmake_minimum_required (VERSION 2.8.12)
-
-# nsync provides portable synchronization primitives, such as mutexes and
-# condition variables.
-project (nsync)
-
-# Set variable NSYNC_LANGUAGE to "c++11" to build with C++11
-# rather than C.
-
-# Some builds need position-independent code.
-set (CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-# -----------------------------------------------------------------
-# Platform dependencies
-
-# Many platforms use these posix related sources; even Win32.
-set (NSYNC_POSIX_SRC
-  "platform/posix/src/nsync_panic.c"
-  "platform/posix/src/per_thread_waiter.c"
-  "platform/posix/src/time_rep.c"
-  "platform/posix/src/yield.c"
-)
-
-if (WIN32)
-  # Suppress warnings to reduce build log size.
-  add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
-  add_definitions(/wd4099 /wd4146 /wd4267 /wd4305 /wd4307)
-  add_definitions(/wd4715 /wd4722 /wd4723 /wd4838 /wd4309 /wd4334)
-  add_definitions(/wd4003 /wd4244 /wd4267 /wd4503 /wd4506 /wd4800 /wd4996)
-  add_definitions(/wd8029)
-endif()
-
-# Many of the string matches below use a literal "X" suffix on both sides.
-# This is because some versions of cmake treat (for example) "MSVC" (in quotes)
-# as a reference to the variable MSVC, thus the expression
-#      "${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC"
-# is false when ${CMAKE_C_COMPILER_ID} has the value "MSVC"!  See
-#    https://cmake.org/cmake/help/v3.1/policy/CMP0054.html
-
-# Pick the include directory for the operating system.
-if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/c++11")
-  add_definitions ("-DNSYNC_USE_CPP11_TIMEPOINT -DNSYNC_ATOMIC_CPP11")
-  set (NSYNC_OS_CPP_SRC
-    "platform/c++11/src/per_thread_waiter.cc"
-    "platform/c++11/src/yield.cc"
-    "platform/c++11/src/time_rep_timespec.cc"
-    "platform/c++11/src/nsync_panic.cc"
-  )
-  if ("${CMAKE_SYSTEM_NAME}X" STREQUAL "WindowsX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/win32")
-    add_compile_options ("/TP")
-    set (NSYNC_OS_SRC
-      "platform/c++11/src/nsync_semaphore_mutex.cc"
-      "platform/win32/src/clock_gettime.c"
-      "platform/win32/src/pthread_key_win32.cc"
-      ${NSYNC_OS_CPP_SRC}
-    )
-    set (NSYNC_TEST_OS_SRC
-      "platform/win32/src/start_thread.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "DarwinX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/macos")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
-    # Some versions of MacOS, such as Sierra, require _DARWIN_C_SOURCE
-    # when including certin C++ standard header files, such as <mutex>.
-    add_definitions ("-D_DARWIN_C_SOURCE")
-    add_compile_options ("-std=c++11")
-    set (NSYNC_OS_SRC
-      ${NSYNC_OS_CPP_SRC}
-      "platform/c++11/src/nsync_semaphore_mutex.cc"
-      "platform/posix/src/clock_gettime.c"
-      "platform/posix/src/nsync_semaphore_mutex.c"
-    )
-    set (NSYNC_TEST_OS_SRC
-      "platform/posix/src/start_thread.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "LinuxX")
-    include_directories (BEFORE "${PROJECT_SOURCE_DIR}/platform/c++11.futex")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
-    add_compile_options ("-std=c++11")
-    set (NSYNC_OS_SRC
-      "platform/linux/src/nsync_semaphore_futex.c"
-      ${NSYNC_OS_CPP_SRC}
-    )
-    set (NSYNC_TEST_OS_SRC
-      "platform/posix/src/start_thread.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "NetBSDX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
-    add_compile_options ("-std=c++11")
-    set (NSYNC_OS_SRC
-      "platform/c++11/src/nsync_semaphore_mutex.cc"
-      ${NSYNC_OS_CPP_SRC}
-    )
-    set (NSYNC_TEST_OS_SRC
-      "platform/posix/src/start_thread.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "FreeBSDX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
-    add_compile_options ("-std=c++11")
-    set (NSYNC_OS_SRC
-      "platform/c++11/src/nsync_semaphore_mutex.cc"
-      ${NSYNC_OS_CPP_SRC}
-    )
-    set (NSYNC_TEST_OS_SRC
-      "platform/posix/src/start_thread.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "OpenBSDX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
-    add_compile_options ("-std=c++11")
-    set (NSYNC_OS_SRC
-      "platform/c++11/src/nsync_semaphore_mutex.cc"
-      ${NSYNC_OS_CPP_SRC}
-    )
-    set (NSYNC_TEST_OS_SRC
-      "platform/posix/src/start_thread.c"
-    )
-  endif ()
-endif ()
-
-# Pick the include directory for the compiler.
-if ("${CMAKE_C_COMPILER_ID}X" STREQUAL "GNUX")
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/gcc")
-  set (THREADS_HAVE_PTHREAD_ARG ON)
-elseif ("${CMAKE_C_COMPILER_ID}X" STREQUAL "ClangX")
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/clang")
-  set (THREADS_HAVE_PTHREAD_ARG ON)
-elseif ("${CMAKE_C_COMPILER_ID}X" STREQUAL "MSVCX")
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/msvc")
-else ()
-  message (WARNING "CMAKE_C_COMPILER_ID (${CMAKE_C_COMPILER_ID}) matched NOTHING")
-endif ()
-
-if (NOT "${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
-  if ("${CMAKE_SYSTEM_NAME}X" STREQUAL "WindowsX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/win32")
-    set (NSYNC_OS_SRC
-      ${NSYNC_POSIX_SRC}
-      "platform/win32/src/clock_gettime.c"
-      "platform/win32/src/init_callback_win32.c"
-      "platform/win32/src/nanosleep.c"
-      "platform/win32/src/nsync_semaphore_win32.c"
-      "platform/win32/src/pthread_cond_timedwait_win32.c"
-      "platform/win32/src/pthread_key_win32.cc"
-    )
-    set (NSYNC_TEST_OS_SRC
-      "platform/win32/src/start_thread.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "DarwinX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/macos")
-    set (NSYNC_POSIX ON)
-    set (NSYNC_OS_EXTRA_SRC
-      "platform/posix/src/clock_gettime.c"
-      "platform/posix/src/nsync_semaphore_mutex.c"
-    )
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "LinuxX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/linux")
-    set (NSYNC_POSIX ON)
-    set (NSYNC_OS_EXTRA_SRC
-         "platform/linux/src/nsync_semaphore_futex.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "NetBSDX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/netbsd")
-    set (NSYNC_POSIX ON)
-    set (NSYNC_OS_EXTRA_SRC
-      "platform/posix/src/nsync_semaphore_mutex.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "FreeBSDX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/freebsd")
-    set (NSYNC_POSIX ON)
-    set (NSYNC_OS_EXTRA_SRC
-      "platform/posix/src/nsync_semaphore_mutex.c"
-    )
-  elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "OpenBSDX")
-    include_directories ("${PROJECT_SOURCE_DIR}/platform/openbsd")
-    set (NSYNC_POSIX ON)
-    set (NSYNC_OS_EXTRA_SRC
-      "platform/posix/src/nsync_semaphore_mutex.c"
-    )
-  endif ()
-endif ()
-
-if (NSYNC_POSIX)
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
-  set (NSYNC_OS_SRC
-    ${NSYNC_POSIX_SRC}
-    ${NSYNC_OS_EXTRA_SRC}
-  )
-  set (NSYNC_TEST_OS_SRC
-    "platform/posix/src/start_thread.c"
-  )
-endif ()
-
-# Pick the include directory for the architecture.
-if (("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "x86_64X") OR
-    ("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "amd64X") OR
-    ("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "AMD64X"))
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/x86_64")
-elseif (("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "x86_32X") OR
-  ("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "i386X") OR
-        ("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "i686X"))
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/x86_32")
-elseif (("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "armv6lX") OR
-  ("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "armv7lX") OR
-  ("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "armX"))
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/arm")
-elseif (("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "aarch64X") OR
-  ("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "arm64X"))
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/aarch64")
-elseif (("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "ppcX") OR
-  ("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "ppc32X"))
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/ppc32")
-elseif (("${CMAKE_SYSTEM_PROCESSOR}X" STREQUAL "ppc64X"))
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/ppc64")
-endif ()
-
-# Windows uses some include files from the posix directory also.
-if ("${CMAKE_SYSTEM_NAME}X" STREQUAL "WindowsX")
-  include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
-endif ()
-
-# -----------------------------------------------------------------
-
-include_directories ("${PROJECT_SOURCE_DIR}/public")
-include_directories ("${PROJECT_SOURCE_DIR}/internal")
-
-set (NSYNC_SRC
-  "internal/common.c"
-  "internal/counter.c"
-  "internal/cv.c"
-  "internal/debug.c"
-  "internal/dll.c"
-  "internal/mu.c"
-  "internal/mu_wait.c"
-  "internal/note.c"
-  "internal/once.c"
-  "internal/sem_wait.c"
-  "internal/time_internal.c"
-  "internal/wait.c"
-  ${NSYNC_OS_SRC}
-)
-add_library (nsync ${NSYNC_SRC})
-
-set (NSYNC_TEST_SRC
-  "testing/array.c"
-  "testing/atm_log.c"
-  "testing/closure.c"
-  "testing/smprintf.c"
-  "testing/testing.c"
-  "testing/time_extra.c"
-  ${NSYNC_TEST_OS_SRC}
-)
-add_library (nsync_test ${NSYNC_TEST_SRC})
-
-set (NSYNC_TESTS
-  "counter_test"
-  "cv_mu_timeout_stress_test"
-  "cv_test"
-  "cv_wait_example_test"
-  "dll_test"
-  "mu_starvation_test"
-  "mu_test"
-  "mu_wait_example_test"
-  "mu_wait_test"
-  "note_test"
-  "once_test"
-  "pingpong_test"
-  "wait_test"
-)
-
-if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
-  foreach (s IN ITEMS ${NSYNC_SRC} ${NSYNC_TEST_SRC})
-    SET_SOURCE_FILES_PROPERTIES ("${s}" PROPERTIES LANGUAGE CXX)
-  endforeach (s)
-  foreach (t IN ITEMS ${NSYNC_TESTS})
-    SET_SOURCE_FILES_PROPERTIES ("testing/${t}.c" PROPERTIES LANGUAGE CXX)
-  endforeach (t)
-endif ()
-
-enable_testing ()
-foreach (t IN ITEMS ${NSYNC_TESTS})
-  add_executable (${t} "testing/${t}.c")
-endforeach (t)
-
-find_package (Threads REQUIRED)
-set (THREADS_PREFER_PTHREAD_FLAG ON)
-foreach (t IN ITEMS "nsync" "nsync_test" ${NSYNC_TESTS})
-  if (THREADS_HAVE_PTHREAD_ARG)
-    target_compile_options (${t} PUBLIC "-pthread")
-  endif ()
-  if (CMAKE_THREAD_LIBS_INIT)
-    target_link_libraries (${t} "${CMAKE_THREAD_LIBS_INIT}")
-  endif ()
-endforeach (t)
-
-foreach (t IN ITEMS ${NSYNC_TESTS})
-  target_link_libraries (${t} nsync_test nsync)
-  add_test (NAME ${t} COMMAND ${t})
-endforeach (t)
-
-install (TARGETS nsync
-  LIBRARY DESTINATION lib COMPONENT RuntimeLibraries
-  ARCHIVE DESTINATION lib COMPONENT Development)
-
-set (NSYNC_INCLUDES
-  "public/nsync.h"
-  "public/nsync_atomic.h"
-  "public/nsync_counter.h"
-  "public/nsync_cpp.h"
-  "public/nsync_cv.h"
-  "public/nsync_debug.h"
-  "public/nsync_mu.h"
-  "public/nsync_mu_wait.h"
-  "public/nsync_note.h"
-  "public/nsync_once.h"
-  "public/nsync_time.h"
-  "public/nsync_time_internal.h"
-  "public/nsync_waiter.h"
-)
-
-foreach (NSYNC_INCLUDE ${NSYNC_INCLUDES})
-  install (FILES ${NSYNC_INCLUDE} DESTINATION include COMPONENT Development)
-endforeach ()
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index a5a947f726..fb871acae9 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -4,6 +4,8 @@ tensorflow
 tensorflow/core
 tensorflow/core/example
 tensorflow/core/framework
+tensorflow/core/kernels
+tensorflow/core/kernels/boosted_trees
 tensorflow/core/lib
 tensorflow/core/lib/core
 tensorflow/core/profiler
@@ -245,10 +247,6 @@ tensorflow/contrib/kernel_methods/python
 tensorflow/contrib/kernel_methods/python/mappers
 tensorflow/contrib/kinesis/python
 tensorflow/contrib/kinesis/python/ops
-tensorflow/contrib/kfac
-tensorflow/contrib/kfac/examples
-tensorflow/contrib/kfac/python
-tensorflow/contrib/kfac/python/ops
 tensorflow/contrib/labeled_tensor
 tensorflow/contrib/labeled_tensor/python
 tensorflow/contrib/labeled_tensor/python/ops
diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py
index a56a01b163..42b3b9f026 100644
--- a/tensorflow/contrib/compiler/jit_test.py
+++ b/tensorflow/contrib/compiler/jit_test.py
@@ -48,7 +48,7 @@ class JITTest(test.TestCase):
 
   def compute(self, use_jit, compute_fn):
     random_seed.set_random_seed(1234)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       with jit.experimental_jit_scope(use_jit):
         r = compute_fn()
       sess.run(variables.global_variables_initializer())
@@ -88,7 +88,7 @@ class JITTest(test.TestCase):
     self.assertAllClose(v_false_1, v_true_1)
 
   def testJITXlaScope(self):
-    with self.test_session(graph=ops.Graph()):
+    with self.session(graph=ops.Graph()):
       with jit.experimental_jit_scope(True):
         # XlaScope 0
         a1 = constant_op.constant(1)
@@ -138,7 +138,8 @@ class JITTest(test.TestCase):
     self.assertAllClose(v_false_1, v_true_1)
 
   def testDefunNoJitScope(self):
-    with self.test_session(graph=ops.Graph()):
+    with self.session(graph=ops.Graph()):
+
       @function.Defun(compiled=True, noinline=True)
       def mulop(x1, x2):
         return x1 * x2
@@ -153,7 +154,7 @@ class JITTest(test.TestCase):
       self.assertEqual(b"function_mulop", func_attrs["_XlaScope"].s)
 
   def testDefunInheritsJitScope(self):
-    with self.test_session(graph=ops.Graph()):
+    with self.session(graph=ops.Graph()):
       with jit.experimental_jit_scope(True):
         @function.Defun(compiled=True, noinline=True)
         def mulop(x1, x2):
@@ -195,7 +196,7 @@ class CompilationEnabledInGradientTest(test.TestCase):
       self.assertAllClose([[108]], x_grads.eval())
 
   def testCompilationGradientScopeNames(self):
-    with self.test_session(graph=ops.Graph()):
+    with self.session(graph=ops.Graph()):
       with jit.experimental_jit_scope():
         # XlaScope 0
         a1 = constant_op.constant([[1.]])
@@ -217,7 +218,7 @@ class CompilationEnabledInGradientTest(test.TestCase):
       self.assertEqual(b"jit_scope_1", grad_a2.op.get_attr("_XlaScope"))
 
   def testCompilationSeparateGradientScopeNames(self):
-    with self.test_session(graph=ops.Graph()):
+    with self.session(graph=ops.Graph()):
       with jit.experimental_jit_scope(True, separate_compiled_gradients=True):
         # XlaScope 0
         a1 = constant_op.constant([[1.]])
@@ -241,7 +242,7 @@ class CompilationEnabledInGradientTest(test.TestCase):
                        grad_a2.op.get_attr("_XlaScope"))
 
   def testPlaysNicelyWithDefun(self):
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       with jit.experimental_jit_scope(True):
         @function.Defun(compiled=True, noinline=True)
         def mulop(x1, x2):
@@ -266,7 +267,7 @@ class CompilationEnabledInGradientTest(test.TestCase):
       self.assertAllClose([1.0, 1.0, 2.0], sess.run([x, r, g_r]))
 
   def testPlaysNicelyWithDefunSeparateGradientScope(self):
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       with jit.experimental_jit_scope(True):
 
         @function.Defun(
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates.py b/tensorflow/contrib/constrained_optimization/python/candidates.py
index ac86a6741b..66d7ebed74 100644
--- a/tensorflow/contrib/constrained_optimization/python/candidates.py
+++ b/tensorflow/contrib/constrained_optimization/python/candidates.py
@@ -204,7 +204,7 @@ def find_best_candidate_distribution(objective_vector,
   assert best_pp is not None
 
   # Throughout this loop, a maximum_violation of "lower" is not achievable,
-  # but a maximum_violation of "upper" is achiveable.
+  # but a maximum_violation of "upper" is achievable.
   while True:
     middle = 0.5 * (lower + upper)
     if (middle - lower <= epsilon) or (upper - middle <= epsilon):
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
index 70813fb217..41258edd90 100644
--- a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
@@ -72,7 +72,8 @@ class ConstrainedMinimizationProblem(object):
     else:
       proxy_constraints_shape = self.proxy_constraints.get_shape()
 
-    if (constraints_shape is None or proxy_constraints_shape is None or
+    if (constraints_shape.ndims is None or
+        proxy_constraints_shape.ndims is None or
         any([ii is None for ii in constraints_shape.as_list()]) or
         any([ii is None for ii in proxy_constraints_shape.as_list()])):
       raise ValueError(
@@ -121,3 +122,19 @@ class ConstrainedMinimizationProblem(object):
       A tensor of proxy constraint functions.
     """
     return None
+
+  # This is a property, instead of an abstract property, since it doesn't need
+  # to be overridden: if pre_train_ops returns None, then there are no ops to
+  # run before train_op.
+  @property
+  def pre_train_ops(self):
+    """Returns a list of `Operation`s to run before the train_op.
+
+    When a `ConstrainedOptimizer` creates a train_op (in `minimize`
+    `minimize_unconstrained`, or `minimize_constrained`), it will include these
+    ops before the main training step.
+
+    Returns:
+      A list of `Operation`s.
+    """
+    return None
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
index 8055545366..0b79bdf7c0 100644
--- a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py
@@ -55,20 +55,21 @@ class ConstrainedOptimizer(object):
     """Returns the `tf.train.Optimizer` used for optimization."""
     return self._optimizer
 
-  def minimize_unconstrained(self,
-                             minimization_problem,
-                             global_step=None,
-                             var_list=None,
-                             gate_gradients=train_optimizer.Optimizer.GATE_OP,
-                             aggregation_method=None,
-                             colocate_gradients_with_ops=False,
-                             name=None,
-                             grad_loss=None):
-    """Returns an `Op` for minimizing the unconstrained problem.
+  @abc.abstractmethod
+  def _minimize_constrained(self,
+                            minimization_problem,
+                            global_step=None,
+                            var_list=None,
+                            gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                            aggregation_method=None,
+                            colocate_gradients_with_ops=False,
+                            name=None,
+                            grad_loss=None):
+    """Version of `minimize_constrained` to be overridden by subclasses.
 
-    Unlike `minimize_constrained`, this function ignores the `constraints` (and
-    `proxy_constraints`) portion of the minimization problem entirely, and only
-    minimizes `objective`.
+    Implementations of this method should ignore the `pre_train_ops` property of
+    the `minimization_problem`. The public `minimize_constrained` method will
+    take care of executing these before the returned train_op.
 
     Args:
       minimization_problem: ConstrainedMinimizationProblem, the problem to
@@ -83,19 +84,10 @@ class ConstrainedOptimizer(object):
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
     """
-    return self.optimizer.minimize(
-        minimization_problem.objective,
-        global_step=global_step,
-        var_list=var_list,
-        gate_gradients=gate_gradients,
-        aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
-        name=name,
-        grad_loss=grad_loss)
+    pass
 
-  @abc.abstractmethod
   def minimize_constrained(self,
                            minimization_problem,
                            global_step=None,
@@ -105,7 +97,7 @@ class ConstrainedOptimizer(object):
                            colocate_gradients_with_ops=False,
                            name=None,
                            grad_loss=None):
-    """Returns an `Op` for minimizing the constrained problem.
+    """Returns an `Operation` for minimizing the constrained problem.
 
     Unlike `minimize_unconstrained`, this function attempts to find a solution
     that minimizes the `objective` portion of the minimization problem while
@@ -124,9 +116,83 @@ class ConstrainedOptimizer(object):
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
     """
-    pass
+
+    def train_op_callback():
+      return self._minimize_constrained(
+          minimization_problem,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    # If we have pre_train_ops, use tf.control_dependencies() to ensure that
+    # they execute before the train_op.
+    pre_train_ops = minimization_problem.pre_train_ops
+    if pre_train_ops:
+      with ops.control_dependencies(pre_train_ops):
+        train_op = train_op_callback()
+    else:
+      train_op = train_op_callback()
+
+    return train_op
+
+  def minimize_unconstrained(self,
+                             minimization_problem,
+                             global_step=None,
+                             var_list=None,
+                             gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                             aggregation_method=None,
+                             colocate_gradients_with_ops=False,
+                             name=None,
+                             grad_loss=None):
+    """Returns an `Operation` for minimizing the unconstrained problem.
+
+    Unlike `minimize_constrained`, this function ignores the `constraints` (and
+    `proxy_constraints`) portion of the minimization problem entirely, and only
+    minimizes `objective`.
+
+    Args:
+      minimization_problem: ConstrainedMinimizationProblem, the problem to
+        optimize.
+      global_step: as in `tf.train.Optimizer`'s `minimize` method.
+      var_list: as in `tf.train.Optimizer`'s `minimize` method.
+      gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
+      aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
+      colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
+        method.
+      name: as in `tf.train.Optimizer`'s `minimize` method.
+      grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
+
+    Returns:
+      `Operation`, the train_op.
+    """
+
+    def train_op_callback():
+      return self.optimizer.minimize(
+          minimization_problem.objective,
+          global_step=global_step,
+          var_list=var_list,
+          gate_gradients=gate_gradients,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          name=name,
+          grad_loss=grad_loss)
+
+    # If we have pre_train_ops, use tf.control_dependencies() to ensure that
+    # they execute before the train_op.
+    pre_train_ops = minimization_problem.pre_train_ops
+    if pre_train_ops:
+      with ops.control_dependencies(pre_train_ops):
+        train_op = train_op_callback()
+    else:
+      train_op = train_op_callback()
+
+    return train_op
 
   def minimize(self,
                minimization_problem,
@@ -138,7 +204,7 @@ class ConstrainedOptimizer(object):
                colocate_gradients_with_ops=False,
                name=None,
                grad_loss=None):
-    """Returns an `Op` for minimizing the constrained problem.
+    """Returns an `Operation` for minimizing the constrained problem.
 
     This method combines the functionality of `minimize_unconstrained` and
     `minimize_constrained`. If global_step < unconstrained_steps, it will
@@ -164,14 +230,14 @@ class ConstrainedOptimizer(object):
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
 
     Raises:
       ValueError: If unconstrained_steps is provided, but global_step is not.
     """
 
     def unconstrained_fn():
-      """Returns an `Op` for minimizing the unconstrained problem."""
+      """Returns an `Operation` for minimizing the unconstrained problem."""
       return self.minimize_unconstrained(
           minimization_problem=minimization_problem,
           global_step=global_step,
@@ -183,7 +249,7 @@ class ConstrainedOptimizer(object):
           grad_loss=grad_loss)
 
     def constrained_fn():
-      """Returns an `Op` for minimizing the constrained problem."""
+      """Returns an `Operation` for minimizing the constrained problem."""
       return self.minimize_constrained(
           minimization_problem=minimization_problem,
           global_step=global_step,
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
index 01c6e4f08a..d1af15f7e4 100644
--- a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
@@ -70,11 +70,13 @@ def _project_multipliers_wrt_euclidean_norm(multipliers, radius):
       region w.r.t. the Euclidean norm.
 
   Raises:
-    ValueError: if the `multipliers` tensor does not have a fully-known shape,
-      or is not one-dimensional.
+    ValueError: if the `multipliers` tensor is not floating-point, does not have
+      a fully-known shape, or is not one-dimensional.
   """
+  if not multipliers.dtype.is_floating:
+    raise ValueError("multipliers must have a floating-point dtype")
   multipliers_shape = multipliers.get_shape()
-  if multipliers_shape is None:
+  if multipliers_shape.ndims is None:
     raise ValueError("multipliers must have known shape")
   if multipliers_shape.ndims != 1:
     raise ValueError(
@@ -101,12 +103,12 @@ def _project_multipliers_wrt_euclidean_norm(multipliers, radius):
         (radius - standard_ops.reduce_sum(multipliers)) / standard_ops.maximum(
             1.0, standard_ops.reduce_sum(inactive)))
     multipliers += scale * inactive
-    new_inactive = standard_ops.to_float(multipliers > 0)
+    new_inactive = standard_ops.cast(multipliers > 0, multipliers.dtype)
     multipliers *= new_inactive
     return (iteration, multipliers, new_inactive, inactive)
 
   iteration = standard_ops.constant(0)
-  inactive = standard_ops.ones_like(multipliers)
+  inactive = standard_ops.ones_like(multipliers, dtype=multipliers.dtype)
 
   # We actually want a do-while loop, so we explicitly call while_loop_body()
   # once before tf.while_loop().
@@ -189,16 +191,16 @@ class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
   def _projection_op(self, state, name=None):
     pass
 
-  def minimize_constrained(self,
-                           minimization_problem,
-                           global_step=None,
-                           var_list=None,
-                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
-                           aggregation_method=None,
-                           colocate_gradients_with_ops=False,
-                           name=None,
-                           grad_loss=None):
-    """Returns an `Op` for minimizing the constrained problem.
+  def _minimize_constrained(self,
+                            minimization_problem,
+                            global_step=None,
+                            var_list=None,
+                            gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                            aggregation_method=None,
+                            colocate_gradients_with_ops=False,
+                            name=None,
+                            grad_loss=None):
+    """Returns an `Operation` for minimizing the constrained problem.
 
     The `optimizer` constructor parameter will be used to update the model
     parameters, while the Lagrange multipliers will be updated using
@@ -216,8 +218,11 @@ class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
       name: as in `tf.train.Optimizer`'s `minimize` method.
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
+    Raises:
+      ValueError: If the minimization_problem tensors have different dtypes.
+
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
     """
     objective = minimization_problem.objective
 
@@ -225,6 +230,14 @@ class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
     proxy_constraints = minimization_problem.proxy_constraints
     if proxy_constraints is None:
       proxy_constraints = constraints
+
+    # Make sure that the objective, constraints and proxy constraints all have
+    # the same dtype.
+    if (objective.dtype.base_dtype != constraints.dtype.base_dtype or
+        objective.dtype.base_dtype != proxy_constraints.dtype.base_dtype):
+      raise ValueError("objective, constraints and proxy_constraints must "
+                       "have the same dtype")
+
     # Flatten both constraints tensors to 1d.
     num_constraints = minimization_problem.num_constraints
     constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
@@ -241,8 +254,10 @@ class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
 
     multipliers = self._lagrange_multipliers(state)
     loss = (
-        objective + standard_ops.tensordot(multipliers, proxy_constraints, 1))
-    multipliers_gradient = constraints
+        objective + standard_ops.tensordot(
+            standard_ops.cast(multipliers, proxy_constraints.dtype),
+            proxy_constraints, 1))
+    multipliers_gradient = standard_ops.cast(constraints, multipliers.dtype)
 
     update_ops = []
     if self.constraint_optimizer is None:
@@ -356,6 +371,8 @@ class AdditiveExternalRegretOptimizer(_ExternalRegretOptimizer):
     # For an AdditiveExternalRegretOptimizer, the internal state is simply a
     # tensor of Lagrange multipliers with shape (m,), where m is the number of
     # constraints.
+    #
+    # FUTURE WORK: make the dtype a parameter.
     return standard_ops.zeros((num_constraints,), dtype=dtypes.float32)
 
   def _lagrange_multipliers(self, state):
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
index ff846b191a..2c673d9347 100644
--- a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
@@ -79,9 +79,11 @@ def _maximal_eigenvector_power_method(matrix,
     The maximal right-eigenvector of `matrix`.
 
   Raises:
-    ValueError: If the epsilon or maximum_iterations parameters violate their
-      bounds.
+    ValueError: If the `matrix` tensor is not floating-point, or if the
+      `epsilon` or `maximum_iterations` parameters violate their bounds.
   """
+  if not matrix.dtype.is_floating:
+    raise ValueError("multipliers must have a floating-point dtype")
   if epsilon <= 0.0:
     raise ValueError("epsilon must be strictly positive")
   if maximum_iterations <= 0:
@@ -139,11 +141,13 @@ def _project_stochastic_matrix_wrt_euclidean_norm(matrix):
       (i.e. the Frobenius norm).
 
   Raises:
-    ValueError: if the `matrix` tensor does not have a fully-known shape, or is
-      not two-dimensional and square.
+    ValueError: if the `matrix` tensor is not floating-point, does not have a
+      fully-known shape, or is not two-dimensional and square.
   """
+  if not matrix.dtype.is_floating:
+    raise ValueError("multipliers must have a floating-point dtype")
   matrix_shape = matrix.get_shape()
-  if matrix_shape is None:
+  if matrix_shape.ndims is None:
     raise ValueError("matrix must have known shape")
   if matrix_shape.ndims != 2:
     raise ValueError(
@@ -172,12 +176,12 @@ def _project_stochastic_matrix_wrt_euclidean_norm(matrix):
         matrix, axis=0, keepdims=True)) / standard_ops.maximum(
             1.0, standard_ops.reduce_sum(inactive, axis=0, keepdims=True))
     matrix += scale * inactive
-    new_inactive = standard_ops.to_float(matrix > 0)
+    new_inactive = standard_ops.cast(matrix > 0, matrix.dtype)
     matrix *= new_inactive
     return (iteration, matrix, new_inactive, inactive)
 
   iteration = standard_ops.constant(0)
-  inactive = standard_ops.ones_like(matrix)
+  inactive = standard_ops.ones_like(matrix, dtype=matrix.dtype)
 
   # We actually want a do-while loop, so we explicitly call while_loop_body()
   # once before tf.while_loop().
@@ -218,7 +222,7 @@ class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
   """Base class representing a `_SwapRegretOptimizer`.
 
   This class contains most of the logic for performing constrained optimization,
-  minimizing external regret for the constraints player. What it *doesn't* do is
+  minimizing swap regret for the constraints player. What it *doesn't* do is
   keep track of the internal state (the stochastic matrix).  Instead, the state
   is accessed via the _initial_state(), _stochastic_matrix(),
   _constraint_grad_and_var() and _projection_op() methods.
@@ -291,16 +295,16 @@ class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
   def _projection_op(self, state, name=None):
     pass
 
-  def minimize_constrained(self,
-                           minimization_problem,
-                           global_step=None,
-                           var_list=None,
-                           gate_gradients=train_optimizer.Optimizer.GATE_OP,
-                           aggregation_method=None,
-                           colocate_gradients_with_ops=False,
-                           name=None,
-                           grad_loss=None):
-    """Returns an `Op` for minimizing the constrained problem.
+  def _minimize_constrained(self,
+                            minimization_problem,
+                            global_step=None,
+                            var_list=None,
+                            gate_gradients=train_optimizer.Optimizer.GATE_OP,
+                            aggregation_method=None,
+                            colocate_gradients_with_ops=False,
+                            name=None,
+                            grad_loss=None):
+    """Returns an `Operation` for minimizing the constrained problem.
 
     The `optimizer` constructor parameter will be used to update the model
     parameters, while the constraint/objective weight matrix (the analogue of
@@ -320,8 +324,11 @@ class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
       name: as in `tf.train.Optimizer`'s `minimize` method.
       grad_loss: as in `tf.train.Optimizer`'s `minimize` method.
 
+    Raises:
+      ValueError: If the minimization_problem tensors have different dtypes.
+
     Returns:
-      TensorFlow Op.
+      `Operation`, the train_op.
     """
     objective = minimization_problem.objective
 
@@ -329,6 +336,14 @@ class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
     proxy_constraints = minimization_problem.proxy_constraints
     if proxy_constraints is None:
       proxy_constraints = constraints
+
+    # Make sure that the objective, constraints and proxy constraints all have
+    # the same dtype.
+    if (objective.dtype.base_dtype != constraints.dtype.base_dtype or
+        objective.dtype.base_dtype != proxy_constraints.dtype.base_dtype):
+      raise ValueError("objective, constraints and proxy_constraints must "
+                       "have the same dtype")
+
     # Flatten both constraints tensors to 1d.
     num_constraints = minimization_problem.num_constraints
     constraints = standard_ops.reshape(constraints, shape=(num_constraints,))
@@ -344,15 +359,18 @@ class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer):
         name="swap_regret_optimizer_state")
 
     zero_and_constraints = standard_ops.concat(
-        (standard_ops.zeros((1,)), constraints), axis=0)
+        (standard_ops.zeros((1,), dtype=constraints.dtype), constraints),
+        axis=0)
     objective_and_proxy_constraints = standard_ops.concat(
         (standard_ops.expand_dims(objective, 0), proxy_constraints), axis=0)
 
     distribution = self._distribution(state)
-    loss = standard_ops.tensordot(distribution, objective_and_proxy_constraints,
-                                  1)
+    loss = standard_ops.tensordot(
+        standard_ops.cast(distribution, objective_and_proxy_constraints.dtype),
+        objective_and_proxy_constraints, 1)
     matrix_gradient = standard_ops.matmul(
-        standard_ops.expand_dims(zero_and_constraints, 1),
+        standard_ops.expand_dims(
+            standard_ops.cast(zero_and_constraints, distribution.dtype), 1),
         standard_ops.expand_dims(distribution, 0))
 
     update_ops = []
@@ -555,6 +573,7 @@ class MultiplicativeSwapRegretOptimizer(_SwapRegretOptimizer):
     log_initial_one = math.log(1.0 - (self._initial_multiplier_radius *
                                       (dimension - 1) / (dimension)))
     log_initial_zero = math.log(self._initial_multiplier_radius / dimension)
+    # FUTURE WORK: make the dtype a parameter.
     return standard_ops.concat(
         (standard_ops.constant(
             log_initial_one, dtype=dtypes.float32, shape=(1, dimension)),
diff --git a/tensorflow/contrib/crf/__init__.py b/tensorflow/contrib/crf/__init__.py
index 615e62b16f..fe5e34d258 100644
--- a/tensorflow/contrib/crf/__init__.py
+++ b/tensorflow/contrib/crf/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Linear-chain CRF layer.
 
-See the @{$python/contrib.crf} guide.
+See the [CRF](https://tensorflow.org/api_guides/python/contrib.crf) guide.
 
 @@crf_binary_score
 @@crf_decode
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 252ea1560d..fda1b9f1b3 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -802,7 +802,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
           [single_cell_fn() for _ in range(num_layers)])
     input_size = 3
     save_graph = ops.Graph()
-    with save_graph.as_default(), self.test_session(graph=save_graph):
+    with save_graph.as_default(), self.session(graph=save_graph):
       save_layer = _MultiCellFn()
       save_layer(inputs=array_ops.ones([1, input_size]),
                  state=save_layer.zero_state(1, dtypes.float32))
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 8bdbba83ef..9f710613dd 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -33,14 +33,22 @@ cc_library(
 
 tf_custom_op_library(
     name = "_dataset_ops.so",
-    srcs = ["ops/dataset_ops.cc"],
-    deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"] +
-           if_static(
-               extra_deps = [":lib_proto_parsing_for_dataset_ops"],
-               otherwise = [],
-           ),
+    srcs = [
+        "ops/dataset_ops.cc",
+        "ops/indexed_dataset_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/data/kernels:dataset_kernels",
+        "//tensorflow/contrib/data/kernels:indexed_dataset",
+    ] + if_static(
+        extra_deps = [":lib_proto_parsing_for_dataset_ops"],
+        otherwise = [],
+    ),
 )
 
 tf_gen_op_libs(
-    op_lib_names = ["dataset_ops"],
+    op_lib_names = [
+        "dataset_ops",
+        "indexed_dataset_ops",
+    ],
 )
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index dbfff9b4f8..5e6c1520a2 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -20,11 +20,12 @@ be used in conjunction with the `tf.data.Dataset` API. Note that the
 guarantees as `tf.data`, but we will provide deprecation advice in advance of
 removing existing functionality.
 
-See @{$guide/datasets$Importing Data} for an overview.
+See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 
 @@Counter
 @@CheckpointInputPipelineHook
 @@CsvDataset
+@@LMDBDataset
 @@RandomDataset
 @@Reducer
 @@SqlDataset
@@ -49,6 +50,7 @@ See @{$guide/datasets$Importing Data} for an overview.
 @@map_and_batch
 @@padded_batch_and_drop_remainder
 @@parallel_interleave
+@@parse_example_dataset
 @@prefetch_to_device
 @@read_batch_features
 @@rejection_resample
@@ -89,10 +91,12 @@ from tensorflow.contrib.data.python.ops.interleave_ops import sample_from_datase
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import CheckpointInputPipelineHook
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
+from tensorflow.contrib.data.python.ops.parsing_ops import parse_example_dataset
 from tensorflow.contrib.data.python.ops.prefetching_ops import copy_to_device
 from tensorflow.contrib.data.python.ops.prefetching_ops import prefetch_to_device
 from tensorflow.contrib.data.python.ops.random_ops import RandomDataset
 from tensorflow.contrib.data.python.ops.readers import CsvDataset
+from tensorflow.contrib.data.python.ops.readers import LMDBDataset
 from tensorflow.contrib.data.python.ops.readers import make_batched_features_dataset
 from tensorflow.contrib.data.python.ops.readers import make_csv_dataset
 from tensorflow.contrib.data.python.ops.readers import read_batch_features
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD
index 2e249f5c14..ec6cb37193 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/contrib/data/kernels/BUILD
@@ -7,6 +7,31 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 cc_library(
+    name = "indexed_dataset_headers",
+    hdrs = ["indexed_dataset.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+cc_library(
+    name = "indexed_dataset",
+    srcs = [
+        "identity_indexed_dataset.cc",
+        "indexed_dataset.cc",
+    ],
+    deps = [
+        ":indexed_dataset_headers",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
     name = "prefetching_kernels",
     srcs = ["prefetching_kernels.cc"],
     deps = [
@@ -52,6 +77,17 @@ cc_library(
 )
 
 cc_library(
+    name = "lmdb_dataset_op",
+    srcs = ["lmdb_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@lmdb",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+cc_library(
     name = "threadpool_dataset_op",
     srcs = ["threadpool_dataset_op.cc"],
     deps = [
@@ -91,6 +127,8 @@ cc_library(
         ":csv_dataset_op",
         ":directed_interleave_dataset_op",
         ":ignore_errors_dataset_op",
+        ":indexed_dataset",
+        ":lmdb_dataset_op",
         ":prefetching_kernels",
         ":threadpool_dataset_op",
         ":unique_dataset_op",
diff --git a/tensorflow/contrib/data/kernels/identity_indexed_dataset.cc b/tensorflow/contrib/data/kernels/identity_indexed_dataset.cc
new file mode 100644
index 0000000000..4718c1c8b9
--- /dev/null
+++ b/tensorflow/contrib/data/kernels/identity_indexed_dataset.cc
@@ -0,0 +1,153 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/data/kernels/indexed_dataset.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace {
+
+class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
+ public:
+  using IndexedDatasetOpKernel::IndexedDatasetOpKernel;
+
+  void MakeIndexedDataset(OpKernelContext* ctx,
+                          IndexedDataset** output) override {
+    uint64 size = -1;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<uint64>(ctx, "size", &size));
+    OP_REQUIRES(ctx, size > 0, errors::InvalidArgument("`size` must be > 0"));
+    *output = new Dataset(ctx, size);
+  }
+
+  class Dataset : public IndexedDataset {
+   public:
+    Dataset(OpKernelContext* ctx, uint64 size)
+        : IndexedDataset(DatasetContext(ctx)), size_(size) {}
+
+    Status MaterializeDataset(
+        std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
+      materialized->reset(new Materialized(this));
+      return Status::OK();
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_UINT64});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
+    }
+
+    string DebugString() const override {
+      return "IdentityIndexedDataset::Dataset";
+    }
+
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** node) const override {
+      return errors::Unimplemented(
+          "identity_indexed_dataset.AsGraphDefInternal");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (cur_ < dataset()->size_) {
+          Tensor result_tensor(ctx->allocator({}), DT_UINT64, {});
+          result_tensor.scalar<uint64>()() = cur_++;
+          out_tensors->emplace_back(std::move(result_tensor));
+          *end_of_sequence = false;
+          return Status::OK();
+        }
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      uint64 cur_ GUARDED_BY(mu_);
+    };
+
+    class Materialized : public MaterializedIndexedDataset {
+     public:
+      explicit Materialized(Dataset* dataset) : dataset_(dataset) {
+        dataset->Ref();
+      }
+
+      ~Materialized() override {
+        // TODO(saeta): Pull this into MaterializedIndexedDataset
+        dataset_->Unref();
+      }
+
+      const DataTypeVector& output_dtypes() const override {
+        return dataset_->output_dtypes();
+      }
+
+      const std::vector<PartialTensorShape>& output_shapes() const override {
+        return dataset_->output_shapes();
+      }
+
+      Status Get(IteratorContext&& ctx, uint64 index,
+                 std::vector<Tensor>* out_tensors) const override {
+        LOG(INFO) << "Materialized(" << dataset_->size_ << ")::Get(" << index
+                  << ")";
+        if (index >= dataset_->size_) {
+          // Note: use InvalidArgument instead of OutOfRange error because many
+          // things consider OutOfRange to be a "clean termination" error.
+          return errors::InvalidArgument(
+              "Index ", index,
+              " is out of range for this dataset. (Size is: ", dataset_->size_,
+              ".)");
+        }
+        Tensor result_tensor(ctx.allocator({}), DT_UINT64, {});
+        result_tensor.scalar<uint64>()() = index;
+        out_tensors->emplace_back(std::move(result_tensor));
+        return Status::OK();
+      }
+
+      Status Size(uint64* size) const override {
+        *size = dataset_->size_;
+        return Status::OK();
+      }
+
+     private:
+      const Dataset* const dataset_;  // Not owned.
+    };
+
+    const uint64 size_;
+    std::shared_ptr<Materialized> materialized_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("IdentityIndexedDataset").Device(DEVICE_CPU),
+                        IdentityIndexedDatasetOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/data/kernels/indexed_dataset.cc b/tensorflow/contrib/data/kernels/indexed_dataset.cc
new file mode 100644
index 0000000000..c69564a31b
--- /dev/null
+++ b/tensorflow/contrib/data/kernels/indexed_dataset.cc
@@ -0,0 +1,372 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/data/kernels/indexed_dataset.h"
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+
+namespace tensorflow {
+
+namespace {
+
+Status VerifyTypesMatch(const DataTypeVector& expected,
+                        const DataTypeVector& received) {
+  if (expected.size() != received.size()) {
+    return errors::InvalidArgument(
+        "Number of components does not match: expected ", expected.size(),
+        " types but got ", received.size(), ".");
+  }
+  for (size_t i = 0; i < expected.size(); ++i) {
+    if (expected[i] != received[i]) {
+      return errors::InvalidArgument("Data type mismatch at component ", i,
+                                     ": expected ", DataTypeString(expected[i]),
+                                     " but got ", DataTypeString(received[i]),
+                                     ".");
+    }
+  }
+  return Status::OK();
+}
+
+Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
+                              const std::vector<PartialTensorShape>& received) {
+  if (expected.size() != received.size()) {
+    return errors::InvalidArgument(
+        "Number of components does not match: expected ", expected.size(),
+        " shapes but got ", received.size(), ".");
+  }
+  for (size_t i = 0; i < expected.size(); ++i) {
+    if (!expected[i].IsCompatibleWith(received[i])) {
+      return errors::InvalidArgument("Incompatible shapes at component ", i,
+                                     ": expected ", expected[i].DebugString(),
+                                     " but got ", received[i].DebugString(),
+                                     ".");
+    }
+  }
+
+  return Status::OK();
+}
+
+class MaterializedDatasetResource : public ResourceBase {
+ public:
+  MaterializedDatasetResource(
+      const DataTypeVector& output_dtypes,
+      const std::vector<PartialTensorShape>& output_shapes)
+      : output_dtypes_(output_dtypes), output_shapes_(output_shapes) {}
+
+  string DebugString() override {
+    return "Materialized IndexedDataset resource";
+  }
+
+  Status Get(IteratorContext&& ctx, uint64 index,
+             std::vector<Tensor>* out_tensors) {
+    std::shared_ptr<MaterializedIndexedDataset> captured(materialized_);
+    if (captured) {
+      return captured->Get(std::move(ctx), index, out_tensors);
+    } else {
+      return errors::FailedPrecondition(
+          "Get() failed because the MaterializedIndexedDataset has not been "
+          "initialized. Ensure that you have run the materialization operation "
+          "for this MaterializedIndexedDataset before retrieving elements.");
+    }
+  }
+
+  // TODO(saeta): Implement Save and Restore
+
+  const DataTypeVector& output_dtypes() const { return output_dtypes_; }
+  const std::vector<PartialTensorShape>& output_shapes() const {
+    return output_shapes_;
+  }
+
+  Status set_materialized_dataset(
+      const std::shared_ptr<MaterializedIndexedDataset>& dataset) {
+    if (dataset) {
+      TF_RETURN_IF_ERROR(
+          VerifyTypesMatch(output_dtypes_, dataset->output_dtypes()));
+      TF_RETURN_IF_ERROR(
+          VerifyShapesCompatible(output_shapes_, dataset->output_shapes()));
+    }
+    materialized_ = dataset;
+    return Status::OK();
+  }
+
+ private:
+  std::shared_ptr<MaterializedIndexedDataset> materialized_;
+  const DataTypeVector output_dtypes_;
+  const std::vector<PartialTensorShape> output_shapes_;
+};
+
+// A wrapper class for storing an `IndexedDataset` instance in a DT_VARIANT
+// tensor. Objects of the wrapper class own a reference on an instance of an
+// `IndexedTensor` and the wrapper's copy constructor and desctructor take care
+// of managing the reference count.
+//
+// NOTE: This is not a feature-complete implementation of the DT_VARIANT
+// specification. In particular, we cannot currently serialize an arbitrary
+// `IndexedDataset` object, so the `Encode()` and `Decode()` methods are not
+// implemented.
+//
+// NOTE(saeta): When `IndexedDataset`s get merged into core, we can instead just
+// use `tensorflow::DatasetVariantWrapper`.
+class IndexedDatasetVariantWrapper {
+ public:
+  IndexedDatasetVariantWrapper() : dataset_(nullptr) {}
+
+  // Transfers ownership of `dataset` to `*this`.
+  explicit IndexedDatasetVariantWrapper(IndexedDataset* dataset)
+      : dataset_(dataset) {}
+
+  IndexedDatasetVariantWrapper(const IndexedDatasetVariantWrapper& other)
+      : dataset_(other.dataset_) {
+    if (dataset_) dataset_->Ref();
+  }
+
+  ~IndexedDatasetVariantWrapper() {
+    if (dataset_) dataset_->Unref();
+  }
+
+  IndexedDataset* get() const { return dataset_; }
+
+  string TypeName() const { return "tensorflow::IndexedDatasetVariantWrapper"; }
+  string DebugString() const {
+    if (dataset_) {
+      return dataset_->DebugString();
+    } else {
+      return "<Uninitialized IndexedDatasetVariantWrapper>";
+    }
+  }
+
+  void Encode(VariantTensorData* data) const {
+    LOG(ERROR) << "The Encode() method is not implemented for "
+                  "IndexedDatasetVariantWrapper objects.";
+  }
+
+  bool Decode(const VariantTensorData& data) {
+    LOG(ERROR) << "The Decode() method is not implemented for "
+                  "IndexedDatasetVariantWrapper objects.";
+    return false;
+  }
+
+ private:
+  IndexedDataset* const dataset_;  // Owns one reference.
+};
+
+}  // namespace
+
+Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
+                                          IndexedDataset** out_dataset) {
+  if (!(tensor.dtype() == DT_VARIANT ||
+        TensorShapeUtils::IsScalar(tensor.shape()))) {
+    return errors::InvalidArgument(
+        "IndexedDataset tensor must be a scalar of dtype DT_VARIANT.");
+  }
+  const Variant& variant = tensor.scalar<Variant>()();
+  const IndexedDatasetVariantWrapper* wrapper =
+      variant.get<IndexedDatasetVariantWrapper>();
+  if (wrapper == nullptr) {
+    return errors::InvalidArgument("Tensor must be an IndexedDataset object.");
+  }
+  *out_dataset = wrapper->get();
+  if (*out_dataset == nullptr) {
+    return errors::Internal("Read uninitialized IndexedDataset variant.");
+  }
+  return Status::OK();
+}
+
+Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
+                                          Tensor* tensor) {
+  if (!(tensor->dtype() == DT_VARIANT ||
+        TensorShapeUtils::IsScalar(tensor->shape()))) {
+    return errors::InvalidArgument(
+        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
+  }
+  tensor->scalar<Variant>()() = IndexedDatasetVariantWrapper(dataset);
+  return Status::OK();
+}
+
+void IndexedDatasetOpKernel::Compute(OpKernelContext* ctx) {
+  IndexedDataset* dataset = nullptr;
+  MakeIndexedDataset(ctx, &dataset);
+
+  if (ctx->status().ok()) {
+    OP_REQUIRES(ctx, dataset != nullptr,
+                errors::Internal("MakeIndexedDataset did not correctly "
+                                 "construct the IndexedDataset"));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    OP_REQUIRES_OK(ctx, StoreIndexedDatasetInVariantTensor(dataset, output));
+  }
+}
+
+namespace {
+
+class MaterializedHandleOp : public OpKernel {
+ public:
+  explicit MaterializedHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  ~MaterializedHandleOp() override {
+    if (resource_ != nullptr) {
+      resource_->Unref();
+      if (cinfo_.resource_is_private_to_kernel()) {
+        if (!cinfo_.resource_manager()
+                 ->template Delete<MaterializedDatasetResource>(
+                     cinfo_.container(), cinfo_.name())
+                 .ok()) {
+          // Do nothing; the resource can have been deleted by session resets.
+          // Note: cargo-culted from $tf/core/framework/resource_op_kernel.h
+        }
+      }
+    }
+  }
+
+  void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
+    {
+      mutex_lock l(mu_);
+      if (resource_ == nullptr) {
+        ResourceMgr* mgr = context->resource_manager();
+        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+        MaterializedDatasetResource* resource;
+        OP_REQUIRES_OK(context,
+                       mgr->LookupOrCreate<MaterializedDatasetResource>(
+                           cinfo_.container(), cinfo_.name(), &resource,
+                           [this](MaterializedDatasetResource** ret)
+                               EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                                 *ret = new MaterializedDatasetResource(
+                                     output_dtypes_, output_shapes_);
+                                 return Status::OK();
+                               }));
+        Status s = VerifyResource(resource);
+        if (TF_PREDICT_FALSE(!s.ok())) {
+          resource->Unref();
+          context->SetStatus(s);
+          return;
+        }
+
+        resource_ = resource;
+      }
+    }
+    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                                context, 0, cinfo_.container(), cinfo_.name(),
+                                MakeTypeIndex<MaterializedDatasetResource>()));
+  }
+
+ private:
+  // During the first Compute(), resource is either created or looked up using
+  // shared_name. In the latter case, the resource found should be verified if
+  // it is compatible with this op's configuration. The verification may fail in
+  // cases such as two graphs asking queues of the same shared name to have
+  // inconsistent capacities.
+  Status VerifyResource(MaterializedDatasetResource* resource) {
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_dtypes_, resource->output_dtypes()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
+    return Status::OK();
+  }
+
+  mutex mu_;
+  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
+  MaterializedDatasetResource* resource_ GUARDED_BY(mu_) = nullptr;
+  DataTypeVector output_dtypes_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+// TODO(saeta): Make async.
+class MaterializeDatasetOp : public OpKernel {
+ public:
+  explicit MaterializeDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    IndexedDataset* dataset;
+    OP_REQUIRES_OK(ctx,
+                   GetIndexedDatasetFromVariantTensor(ctx->input(0), &dataset));
+
+    MaterializedDatasetResource* materialized_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &materialized_resource));
+    core::ScopedUnref unref(materialized_resource);
+    std::shared_ptr<MaterializedIndexedDataset> materialized;
+    OP_REQUIRES_OK(ctx, dataset->MaterializeDataset(&materialized));
+    OP_REQUIRES_OK(
+        ctx, materialized_resource->set_materialized_dataset(materialized));
+  }
+};
+
+// TODO(saeta): Make async
+class IndexedDatasetGet : public OpKernel {
+ public:
+  explicit IndexedDatasetGet(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    MaterializedDatasetResource* materialized_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0),
+                                       &materialized_resource));
+    auto cleanup = gtl::MakeCleanup([materialized_resource] {
+      materialized_resource->Unref();  // Note: can't use core::ScopedUnref.
+    });
+
+    const Tensor* index_t;
+    OP_REQUIRES_OK(ctx, ctx->input("index", &index_t));
+    // TODO(saeta): Support batch reads (indexes should be non-scalar!)
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(index_t->shape()),
+                errors::InvalidArgument("index must be a scalar"));
+    const uint64 index = index_t->scalar<uint64>()();
+
+    std::vector<Tensor> out_tensors;
+    Status s =
+        materialized_resource->Get(IteratorContext(ctx), index, &out_tensors);
+
+    // Note: Unref materialized_resource to avoid destruction races. (Important
+    // in a [future] async op implementation.)
+    cleanup.release()();
+
+    if (!s.ok()) {
+      ctx->SetStatus(s);
+    } else {
+      auto expected_shapes = materialized_resource->output_shapes();
+      auto expected_types = materialized_resource->output_dtypes();
+      for (size_t i = 0; i < out_tensors.size(); ++i) {
+        OP_REQUIRES(
+            ctx, expected_shapes[i].IsCompatibleWith(out_tensors[i].shape()),
+            errors::Internal(
+                "Materialized dataset output at index ", i,
+                " is incompatible with the expected shape. (Expected: ",
+                expected_shapes[i], ", got: ", out_tensors[i].shape(), ")"));
+        OP_REQUIRES(ctx, out_tensors[i].dtype() == expected_types[i],
+                    errors::Internal("Materialized dataset output at index ", i,
+                                     " was not the expected dtype. (Expected: ",
+                                     expected_types[i],
+                                     ", got: ", out_tensors[i].dtype(), ")"));
+        ctx->set_output(i, out_tensors[i]);
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("MaterializedIndexDatasetHandle").Device(DEVICE_CPU),
+    MaterializedHandleOp);
+REGISTER_KERNEL_BUILDER(Name("IndexedDatasetMaterialize").Device(DEVICE_CPU),
+                        MaterializeDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("IndexedDatasetGet").Device(DEVICE_CPU),
+                        IndexedDatasetGet);
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/data/kernels/indexed_dataset.h b/tensorflow/contrib/data/kernels/indexed_dataset.h
new file mode 100644
index 0000000000..6149de888c
--- /dev/null
+++ b/tensorflow/contrib/data/kernels/indexed_dataset.h
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_DATA_KERNELS_INDEXED_DATASET_H_
+#define TENSORFLOW_CONTRIB_DATA_KERNELS_INDEXED_DATASET_H_
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+// TODO(saeta): Urgh, this is ugly.
+class MaterializedIndexedDataset {
+ public:
+  virtual ~MaterializedIndexedDataset() = default;
+
+  // Retrieve the element at a given index. The output tensors are stored in
+  // out_tensors.
+  //
+  // If `index` is greater than `Size()`, tensorflow::errors::OutOfRangeError is
+  // returned.
+  //
+  // Get is thread-safe.
+  virtual Status Get(IteratorContext&& ctx, uint64 index,
+                     std::vector<Tensor>* out_tensors) const = 0;
+
+  // Size determines the number of elements in this IndexedDataset.
+  //
+  // Size is thread-safe.
+  virtual Status Size(uint64* size) const = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this dataset.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this dataset.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+};
+
+// IndexedDataset represents a dataset that supports random access in addition
+// to iterator-based sequential access.
+//
+// Note: IndexedDatasets are HIGHLY experimental at this time. Expect
+// significant (backwards incompatible) changes!
+class IndexedDataset : public DatasetBase {
+ public:
+  IndexedDataset(DatasetContext&& ctx) : DatasetBase(std::move(ctx)) {}
+
+  // Materialize (if necessary) the dataset, and return a pointer.
+  // TODO(saeta): Add in `IteratorContext* ctx` when materializing.
+  virtual Status MaterializeDataset(
+      std::shared_ptr<MaterializedIndexedDataset>* materialized) = 0;
+};
+
+// IndexedDatasetOpKernel abstracts away interfacing IndexedDatasets with the
+// rest of the TensorFlow runtime.
+//
+// Most IndexedDataset's will be private members of classes inheriting from this
+// class.
+class IndexedDatasetOpKernel : public OpKernel {
+ public:
+  IndexedDatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) final;
+
+ protected:
+  // Subclasses should implement this method. It will be called during Compute
+  // execution.
+  virtual void MakeIndexedDataset(OpKernelContext* ctx,
+                                  IndexedDataset** output) = 0;
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+};
+
+// Validates and extracts an `IndexedDataset` object from `tensor`.
+//
+// `tensor` must have been written by a call to
+// `StoreIndexedDatasetInVariantTensor`
+//
+// The retrieved pointer isa  borrowed reference to the dataset, which is owned
+// by the tensor. The consumer must either acquire its own reference to the
+// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
+// destroyed or mutated while the retrieved pointer is in use.
+Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
+                                          IndexedDataset** out_dataset);
+
+// Stores an `IndexedDataset` object in `tensor.`
+//
+// The ownership of `dataset` is transferred to `tensor`.
+Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
+                                          Tensor* tensor);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_DATA_KERNELS_INDEXED_DATASET_H_
diff --git a/tensorflow/contrib/data/kernels/lmdb_dataset_op.cc b/tensorflow/contrib/data/kernels/lmdb_dataset_op.cc
new file mode 100644
index 0000000000..80f39992fb
--- /dev/null
+++ b/tensorflow/contrib/data/kernels/lmdb_dataset_op.cc
@@ -0,0 +1,215 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <sys/stat.h>
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/platform/file_system.h"
+
+#include "lmdb.h"  // NOLINT(build/include)
+
+namespace tensorflow {
+namespace {
+
+class LMDBDatasetOp : public DatasetOpKernel {
+ public:
+  using DatasetOpKernel::DatasetOpKernel;
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    const Tensor* filenames_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor));
+    OP_REQUIRES(
+        ctx, filenames_tensor->dims() <= 1,
+        errors::InvalidArgument("`filenames` must be a scalar or a vector."));
+
+    std::vector<string> filenames;
+    filenames.reserve(filenames_tensor->NumElements());
+    for (int i = 0; i < filenames_tensor->NumElements(); ++i) {
+      filenames.push_back(filenames_tensor->flat<string>()(i));
+    }
+
+    *output = new Dataset(ctx, filenames);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const std::vector<string>& filenames)
+        : DatasetBase(DatasetContext(ctx)), filenames_(filenames) {}
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::LMDB")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes =
+          new DataTypeVector({DT_STRING, DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}, {}});
+      return *shapes;
+    }
+
+    string DebugString() const override { return "LMDBDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* filenames = nullptr;
+      TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {filenames}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          if (mdb_cursor_) {
+            Tensor key_tensor(ctx->allocator({}), DT_STRING, {});
+            key_tensor.scalar<string>()() = string(
+                static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
+            out_tensors->emplace_back(std::move(key_tensor));
+
+            Tensor value_tensor(ctx->allocator({}), DT_STRING, {});
+            value_tensor.scalar<string>()() =
+                string(static_cast<const char*>(mdb_value_.mv_data),
+                       mdb_value_.mv_size);
+            out_tensors->emplace_back(std::move(value_tensor));
+
+            int val;
+            val = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT);
+            if (val != MDB_SUCCESS && val != MDB_NOTFOUND) {
+              return errors::InvalidArgument(mdb_strerror(val));
+            }
+            if (val == MDB_NOTFOUND) {
+              ResetStreamsLocked();
+              ++current_file_index_;
+            }
+            *end_of_sequence = false;
+            return Status::OK();
+          }
+          if (current_file_index_ == dataset()->filenames_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+        } while (true);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        return errors::Unimplemented(
+            "Checkpointing is currently not supported for LMDBDataset.");
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        return errors::Unimplemented(
+            "Checkpointing is currently not supported for LMDBDataset.");
+      }
+
+     private:
+      Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (current_file_index_ >= dataset()->filenames_.size()) {
+          return errors::InvalidArgument(
+              "current_file_index_:", current_file_index_,
+              " >= filenames_.size():", dataset()->filenames_.size());
+        }
+        const string& filename = dataset()->filenames_[current_file_index_];
+
+        int val = mdb_env_create(&mdb_env_);
+        if (val != MDB_SUCCESS) {
+          return errors::InvalidArgument(mdb_strerror(val));
+        }
+        int flags = MDB_RDONLY | MDB_NOTLS | MDB_NOLOCK;
+
+        struct stat source_stat;
+        if (stat(filename.c_str(), &source_stat) == 0 &&
+            (source_stat.st_mode & S_IFREG)) {
+          flags |= MDB_NOSUBDIR;
+        }
+        val = mdb_env_open(mdb_env_, filename.c_str(), flags, 0664);
+        if (val != MDB_SUCCESS) {
+          return errors::InvalidArgument(mdb_strerror(val));
+        }
+        val = mdb_txn_begin(mdb_env_, nullptr, MDB_RDONLY, &mdb_txn_);
+        if (val != MDB_SUCCESS) {
+          return errors::InvalidArgument(mdb_strerror(val));
+        }
+        val = mdb_dbi_open(mdb_txn_, nullptr, 0, &mdb_dbi_);
+        if (val != MDB_SUCCESS) {
+          return errors::InvalidArgument(mdb_strerror(val));
+        }
+        val = mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_);
+        if (val != MDB_SUCCESS) {
+          return errors::InvalidArgument(mdb_strerror(val));
+        }
+        val = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_FIRST);
+        if (val != MDB_SUCCESS && val != MDB_NOTFOUND) {
+          return errors::InvalidArgument(mdb_strerror(val));
+        }
+        if (val == MDB_NOTFOUND) {
+          ResetStreamsLocked();
+        }
+        return Status::OK();
+      }
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (mdb_env_ != nullptr) {
+          if (mdb_cursor_) {
+            mdb_cursor_close(mdb_cursor_);
+            mdb_cursor_ = nullptr;
+          }
+          mdb_dbi_close(mdb_env_, mdb_dbi_);
+          mdb_txn_abort(mdb_txn_);
+          mdb_env_close(mdb_env_);
+          mdb_txn_ = nullptr;
+          mdb_dbi_ = 0;
+          mdb_env_ = nullptr;
+        }
+      }
+      mutex mu_;
+      size_t current_file_index_ GUARDED_BY(mu_) = 0;
+      MDB_env* mdb_env_ GUARDED_BY(mu_) = nullptr;
+      MDB_txn* mdb_txn_ GUARDED_BY(mu_) = nullptr;
+      MDB_dbi mdb_dbi_ GUARDED_BY(mu_) = 0;
+      MDB_cursor* mdb_cursor_ GUARDED_BY(mu_) = nullptr;
+
+      MDB_val mdb_key_ GUARDED_BY(mu_);
+      MDB_val mdb_value_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> filenames_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("LMDBDataset").Device(DEVICE_CPU), LMDBDatasetOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
index 74df1e42a8..725f8933c9 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
@@ -548,7 +548,9 @@ class MultiDeviceIterator : public ResourceBase {
         devices_(devices),
         flib_def_(std::move(flib_def)),
         pflr_(std::move(pflr)),
-        lib_(lib) {}
+        lib_(lib) {
+    CHECK_NOTNULL(lib_);
+  }
 
   string DebugString() override {
     return strings::StrCat("MultiDeviceIterator for ", devices_.size(),
@@ -600,6 +602,11 @@ class MultiDeviceIterator : public ResourceBase {
     return lib_def_;
   }
 
+  FunctionLibraryRuntime* const lib() {
+    tf_shared_lock l(mu_);
+    return lib_;
+  }
+
  private:
   // A private class that uses a background thread to keep a per device buffer
   // full.
@@ -930,8 +937,10 @@ class MultiDeviceIteratorInitOp : public OpKernel {
     core::ScopedUnref unref(resource);
 
     std::unique_ptr<IteratorBase> iterator;
-    OP_REQUIRES_OK(ctx, dataset->MakeIterator(IteratorContext(ctx), "Iterator",
-                                              &iterator));
+    IteratorContext iter_ctx(ctx);
+    iter_ctx.set_lib(resource->lib());
+    OP_REQUIRES_OK(
+        ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
     int64 incarnation_id;
     OP_REQUIRES_OK(ctx, resource->Init(std::move(iterator), max_buffer_size,
                                        &incarnation_id));
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index cc5e250ea1..ae104d55bd 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -266,4 +266,13 @@ REGISTER_OP("AssertNextDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("LMDBDataset")
+    .Input("filenames: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn(shape_inference::ScalarShape);
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/data/ops/indexed_dataset_ops.cc b/tensorflow/contrib/data/ops/indexed_dataset_ops.cc
new file mode 100644
index 0000000000..cd9b7c68a0
--- /dev/null
+++ b/tensorflow/contrib/data/ops/indexed_dataset_ops.cc
@@ -0,0 +1,80 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("IdentityIndexedDataset")
+    .Input("size: uint64")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(
+        shape_inference::ScalarShape);  // TODO(saeta): check input shapes.
+
+///////////////////////////////////////////////////////////////////////////////
+//     IndexedDataset Internals
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates the handle.
+REGISTER_OP("MaterializedIndexDatasetHandle")
+    .Output("handle: resource")
+    .Attr("container: string")
+    .Attr("shared_name: string")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+// Actually materialize the materialize handle.
+REGISTER_OP("IndexedDatasetMaterialize")
+    .Input("dataset: variant")
+    .Input("materialized: resource")
+    .SetShapeFn(shape_inference::NoOutputs);
+
+namespace {
+
+Status GetShapeFn(shape_inference::InferenceContext* c) {
+  shape_inference::ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+  std::vector<PartialTensorShape> output_shapes;
+  TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+  if (output_shapes.size() != c->num_outputs()) {
+    return errors::InvalidArgument(
+        "`output_shapes` must be the same length as `output_types` (",
+        output_shapes.size(), " vs. ", c->num_outputs());
+  }
+  for (size_t i = 0; i < output_shapes.size(); ++i) {
+    shape_inference::ShapeHandle output_shape_handle;
+    TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+        output_shapes[i], &output_shape_handle));
+    c->set_output(static_cast<int>(i), output_shape_handle);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+REGISTER_OP("IndexedDatasetGet")
+    .Input("materialized: resource")
+    .Input("index: uint64")
+    .Output("components: output_types")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(GetShapeFn)
+    .Doc(R"doc(
+Gets the element at `index` from `materialized` IndexedDataset.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 2b75aa2ca5..b86a543fc3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,7 +4,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_test(
     name = "batch_dataset_op_test",
@@ -134,12 +135,26 @@ py_test(
 )
 
 py_test(
+    name = "indexed_dataset_ops_test",
+    srcs = ["indexed_dataset_ops_test.py"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:contrib_op_loader",
+        "//tensorflow/contrib/data/python/ops:gen_dataset_ops",
+        "//tensorflow/contrib/data/python/ops:indexed_dataset_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
     name = "interleave_dataset_op_test",
     size = "medium",
     srcs = ["interleave_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "manual",
         "no_oss",
         "no_pip",
         "notap",
@@ -180,6 +195,31 @@ py_test(
 )
 
 py_test(
+    name = "lmdb_dataset_op_test",
+    size = "medium",
+    srcs = ["lmdb_dataset_op_test.py"],
+    data = ["//tensorflow/core:lmdb_testdata"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:readers",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
     name = "map_dataset_op_test",
     size = "medium",
     srcs = ["map_dataset_op_test.py"],
@@ -206,6 +246,25 @@ py_test(
 )
 
 py_test(
+    name = "filter_dataset_op_test",
+    size = "medium",
+    srcs = ["filter_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:optimization",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
     name = "map_defun_op_test",
     size = "small",
     srcs = ["map_defun_op_test.py"],
@@ -230,19 +289,35 @@ py_test(
     srcs = ["optimize_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":stats_dataset_test_base",
         "//tensorflow/contrib/data/python/ops:optimization",
-        "//tensorflow/contrib/data/python/ops:stats_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
     ],
 )
 
+py_test(
+    name = "parsing_ops_test",
+    size = "small",
+    srcs = ["parsing_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:parsing_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
+    ],
+)
+
 cuda_py_test(
     name = "prefetching_ops_test",
     size = "small",
@@ -329,6 +404,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
 )
@@ -549,3 +625,13 @@ py_test(
         "//tensorflow/python/data/ops:readers",
     ],
 )
+
+py_library(
+    name = "test_utils",
+    srcs = ["test_utils.py"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
index 2a0e64caeb..63bffd023f 100644
--- a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
@@ -51,7 +51,7 @@ class CsvDatasetOpTest(test.TestCase):
     assert ds1.output_classes == ds2.output_classes
     next1 = ds1.make_one_shot_iterator().get_next()
     next2 = ds2.make_one_shot_iterator().get_next()
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       # Run through datasets and check that outputs match, or errors match.
       while True:
         try:
@@ -138,7 +138,7 @@ class CsvDatasetOpTest(test.TestCase):
     filenames = self._setup_files(inputs, linebreak, compression_type)
     kwargs['compression_type'] = compression_type
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         dataset = readers.CsvDataset(filenames, **kwargs)
         self._verify_output_or_err(sess, dataset, expected_output,
                                    expected_err_re)
@@ -192,7 +192,7 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['1,"2"3",4', '1,"2"3",4",5,5', 'a,b,"c"d"', 'e,f,g']]
     filenames = self._setup_files(inputs)
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         dataset = readers.CsvDataset(filenames, record_defaults=record_defaults)
         dataset = dataset.apply(error_ops.ignore_errors())
         self._verify_output_or_err(sess, dataset, [['e', 'f', 'g']])
@@ -202,7 +202,7 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['1,2"3,4', 'a,b,c"d', '9,8"7,6,5', 'e,f,g']]
     filenames = self._setup_files(inputs)
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         dataset = readers.CsvDataset(filenames, record_defaults=record_defaults)
         dataset = dataset.apply(error_ops.ignore_errors())
         self._verify_output_or_err(sess, dataset, [['e', 'f', 'g']])
@@ -378,7 +378,7 @@ class CsvDatasetOpTest(test.TestCase):
           file_path, batch_size=1, shuffle=False, num_epochs=1)
       next_batch = ds.make_one_shot_iterator().get_next()
 
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       result = list(sess.run(next_batch).values())
 
     self.assertEqual(result, sorted(result))
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
new file mode 100644
index 0000000000..6d01bf585c
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -0,0 +1,76 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks FilterDataset input pipeline op."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class FilterBenchmark(test.Benchmark):
+
+  # This benchmark compares the performance of pipeline with multiple chained
+  # filter with and without filter fusion.
+  def benchmarkFilters(self):
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      self._benchmarkFilters(chain_length, False)
+      self._benchmarkFilters(chain_length, True)
+
+  def _benchmarkFilters(self, chain_length, optimize_dataset):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors(5).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset.filter(lambda x: math_ops.greater_equal(x - 5, 0))
+      if optimize_dataset:
+        dataset = dataset.apply(optimization.optimize(["filter_fusion"]))
+
+      iterator = dataset.make_one_shot_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(10):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        opt_mark = "opt" if optimize_dataset else "no-opt"
+        print("Filter dataset {} chain length: {} Median wall time: {}".format(
+            opt_mark, chain_length, median_wall_time))
+        self.report_benchmark(
+            iters=1000,
+            wall_time=median_wall_time,
+            name="benchmark_filter_dataset_chain_latency_{}_{}".format(
+                opt_mark, chain_length))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
new file mode 100644
index 0000000000..db2ab815ee
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/indexed_dataset_ops_test.py
@@ -0,0 +1,78 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for experimental indexed dataset ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.contrib.data.python.ops import indexed_dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class IndexedDatasetOpsTest(test.TestCase):
+
+  def testLowLevelIndexedDatasetOps(self):
+    identity = gen_dataset_ops.identity_indexed_dataset(
+        ops.convert_to_tensor(16, dtype=dtypes.uint64))
+    handle = gen_dataset_ops.materialized_index_dataset_handle(
+        container="",
+        shared_name="",
+        output_types=[dtypes.uint64],
+        output_shapes=[[]])
+    materialize = gen_dataset_ops.indexed_dataset_materialize(identity, handle)
+    index = array_ops.placeholder(dtypes.uint64)
+    get_op = gen_dataset_ops.indexed_dataset_get(
+        handle, index, output_types=[dtypes.uint64], output_shapes=[[]])
+
+    with self.test_session() as sess:
+      sess.run(materialize)
+      self.assertEqual([3], sess.run(get_op, feed_dict={index: 3}))
+
+  def testIdentityIndexedDataset(self):
+    ds = indexed_dataset_ops.IdentityIndexedDataset(16)
+    materialized = ds.materialize()
+    with self.test_session() as sess:
+      sess.run(materialized.initializer)
+      placeholder = array_ops.placeholder(dtypes.uint64, shape=[])
+      for i in range(16):
+        output = sess.run(
+            materialized.get(placeholder), feed_dict={placeholder: i})
+        self.assertEqual([i], output)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(materialized.get(placeholder), feed_dict={placeholder: 16})
+
+  @unittest.skip("Requisite functionality currently unimplemented.")
+  def testIdentityIndexedDatasetIterator(self):
+    ds = indexed_dataset_ops.IdentityIndexedDataset(16)
+    itr = ds.make_initializable_iterator()
+    n = itr.get_next()
+    with self.test_session() as sess:
+      sess.run(itr.initializer)
+      for i in range(16):
+        output = sess.run(n)
+        self.assertEqual(i, output)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(n)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index 44c3325a3d..7a3215f6cc 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -777,6 +777,34 @@ class ParallelInterleaveDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(self.next_element)
 
+  def testShutdownRace(self):
+    dataset = dataset_ops.Dataset.range(20)
+    map_fn = lambda x: dataset_ops.Dataset.range(20 * x, 20 * (x + 1))
+    dataset = dataset.apply(
+        interleave_ops.parallel_interleave(
+            map_fn,
+            cycle_length=3,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    dataset = dataset.batch(32)
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    results = []
+    with self.test_session() as sess:
+      for _ in range(2):
+        elements = []
+        sess.run(iterator.initializer)
+        try:
+          while True:
+            elements.extend(sess.run(next_element))
+        except errors.OutOfRangeError:
+          pass
+        results.append(elements)
+
+    self.assertAllEqual(results[0], results[1])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
index 77148aceec..704c0d1eb2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
@@ -60,7 +60,7 @@ class CheckpointInputPipelineHookTest(test.TestCase):
       meta_filename = ckpt_path + '.meta'
       saver_lib.import_meta_graph(meta_filename)
       saver = saver_lib.Saver()
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         saver.restore(sess, ckpt_path)
         return sess.run(ops.get_collection('my_vars'))
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
new file mode 100644
index 0000000000..7bc582ebaa
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
@@ -0,0 +1,66 @@
+#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for LMDBDatasetOp."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from tensorflow.contrib.data.python.ops import readers
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+prefix_path = "tensorflow/core/lib"
+
+
+class LMDBDatasetTest(test.TestCase):
+
+  def setUp(self):
+    super(LMDBDatasetTest, self).setUp()
+    # Copy database out because we need the path to be writable to use locks.
+    path = os.path.join(prefix_path, "lmdb", "testdata", "data.mdb")
+    self.db_path = os.path.join(self.get_temp_dir(), "data.mdb")
+    shutil.copy(path, self.db_path)
+
+  def testReadFromFile(self):
+    filename = self.db_path
+
+    filenames = constant_op.constant([filename], dtypes.string)
+    num_repeats = 2
+
+    dataset = readers.LMDBDataset(filenames).repeat(num_repeats)
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(num_repeats):  # Dataset is repeated.
+        for i in range(10):  # 10 records.
+          k = compat.as_bytes(str(i))
+          v = compat.as_bytes(str(chr(ord("a") + i)))
+          self.assertEqual((k, v), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index 009e21a34c..dc9d56dd53 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -139,7 +139,7 @@ class MapDatasetTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       captured_init_op, init_op, get_next = _build_graph()
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(captured_init_op)
         sess.run(init_op)
         for i in range(10):
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
index a711325dae..73cde40305 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py
@@ -31,47 +31,57 @@ from tensorflow.python.platform import test
 
 class MapDefunTest(test.TestCase):
 
-  def testMapDefun_Simple(self):
+  def testMapDefunSimple(self):
 
     @function.Defun(dtypes.int32)
     def simple_fn(x):
       return x * 2 + 3
 
-    with self.test_session():
-      nums = [[1, 2], [3, 4], [5, 6]]
-      elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
-      r = map_defun.map_defun(simple_fn, [elems], [dtypes.int32], [(2,)])[0]
-      expected = elems * 2 + 3
-      self.assertAllEqual(self.evaluate(r), self.evaluate(expected))
+    nums = [[1, 2], [3, 4], [5, 6]]
+    elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+    r = map_defun.map_defun(simple_fn, [elems], [dtypes.int32], [(2,)])[0]
+    expected = elems * 2 + 3
+    self.assertAllEqual(self.evaluate(r), self.evaluate(expected))
 
-  def testMapDefun_MismatchedTypes(self):
+  def testMapDefunMismatchedTypes(self):
 
     @function.Defun(dtypes.int32)
     def fn(x):
       return math_ops.cast(x, dtypes.float64)
 
-    with self.test_session():
-      nums = [1, 2, 3, 4, 5, 6]
-      elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
-      r = map_defun.map_defun(fn, [elems], [dtypes.int32], [()])[0]
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.evaluate(r)
+    nums = [1, 2, 3, 4, 5, 6]
+    elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+    r = map_defun.map_defun(fn, [elems], [dtypes.int32], [()])[0]
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(r)
+
+  def testMapDefunReduceDim(self):
+    # Tests where the output has a different rank from the input
+
+    @function.Defun(dtypes.int32)
+    def fn(x):
+      return array_ops.gather(x, 0)
+
+    nums = [[1, 2], [3, 4], [5, 6]]
+    elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+    r = map_defun.map_defun(fn, [elems], [dtypes.int32], [()])[0]
+    expected = constant_op.constant([1, 3, 5])
+    self.assertAllEqual(self.evaluate(r), self.evaluate(expected))
 
-  def testMapDefun_MultipleOutputs(self):
+  def testMapDefunMultipleOutputs(self):
 
     @function.Defun(dtypes.int32)
     def fn(x):
       return (x, math_ops.cast(x * 2 + 3, dtypes.float64))
 
-    with self.test_session():
-      nums = [[1, 2], [3, 4], [5, 6]]
-      elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
-      r = map_defun.map_defun(fn, [elems], [dtypes.int32, dtypes.float64],
-                              [(2,), (2,)])
-      expected = [elems, elems * 2 + 3]
-      self.assertAllEqual(self.evaluate(r), self.evaluate(expected))
+    nums = [[1, 2], [3, 4], [5, 6]]
+    elems = constant_op.constant(nums, dtype=dtypes.int32, name="data")
+    r = map_defun.map_defun(fn, [elems], [dtypes.int32, dtypes.float64], [(2,),
+                                                                          (2,)])
+    expected = [elems, elems * 2 + 3]
+    self.assertAllEqual(self.evaluate(r), self.evaluate(expected))
 
-  def testMapDefun_ShapeInference(self):
+  def testMapDefunShapeInference(self):
 
     @function.Defun(dtypes.int32)
     def fn(x):
@@ -82,7 +92,7 @@ class MapDefunTest(test.TestCase):
     result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)])[0]
     self.assertEqual(result.get_shape(), (3, 2))
 
-  def testMapDefun_PartialShapeInference(self):
+  def testMapDefunPartialShapeInference(self):
 
     @function.Defun(dtypes.int32)
     def fn(x):
@@ -92,7 +102,7 @@ class MapDefunTest(test.TestCase):
     result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)])
     self.assertEqual(result[0].get_shape().as_list(), [None, 2])
 
-  def testMapDefun_RaisesErrorOnRuntimeShapeMismatch(self):
+  def testMapDefunRaisesErrorOnRuntimeShapeMismatch(self):
 
     @function.Defun(dtypes.int32, dtypes.int32)
     def fn(x, y):
@@ -108,7 +118,7 @@ class MapDefunTest(test.TestCase):
           "All inputs must have the same dimension 0."):
         sess.run(result, feed_dict={elems1: [1, 2, 3, 4, 5], elems2: [1, 2, 3]})
 
-  def testMapDefun_RaisesDefunError(self):
+  def testMapDefunRaisesDefunError(self):
 
     @function.Defun(dtypes.int32)
     def fn(x):
@@ -117,9 +127,8 @@ class MapDefunTest(test.TestCase):
 
     elems = constant_op.constant([0, 0, 0, 37, 0])
     result = map_defun.map_defun(fn, [elems], [dtypes.int32], [()])
-    with self.test_session():
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.evaluate(result)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(result)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD b/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD
new file mode 100644
index 0000000000..b299e0736f
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/BUILD
@@ -0,0 +1,61 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_test(
+    name = "map_vectorization_test",
+    size = "small",
+    srcs = ["map_vectorization_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/kernel_tests:test_utils",
+        "//tensorflow/contrib/data/python/ops:optimization",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "map_and_filter_fusion_test",
+    size = "medium",
+    srcs = ["map_and_filter_fusion_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:optimization",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "latency_all_edges_test",
+    size = "small",
+    srcs = ["latency_all_edges_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/kernel_tests:stats_dataset_test_base",
+        "//tensorflow/contrib/data/python/ops:optimization",
+        "//tensorflow/contrib/data/python/ops:stats_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/latency_all_edges_test.py
new file mode 100644
index 0000000000..1850b6921a
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/latency_all_edges_test.py
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the LatencyAllEdges optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.kernel_tests import stats_dataset_test_base
+from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.contrib.data.python.ops import stats_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class OptimizeStatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
+
+  def testLatencyStatsOptimization(self):
+
+    stats_aggregator = stats_ops.StatsAggregator()
+    dataset = dataset_ops.Dataset.from_tensors(1).apply(
+        optimization.assert_next(
+            ["LatencyStats", "Map", "LatencyStats", "Prefetch",
+             "LatencyStats"])).map(lambda x: x * x).prefetch(1).apply(
+                 optimization.optimize(["latency_all_edges"])).apply(
+                     stats_ops.set_stats_aggregator(stats_aggregator))
+    iterator = dataset.make_initializable_iterator()
+    get_next = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      self.assertEqual(1 * 1, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      summary_str = sess.run(summary_t)
+      self._assertSummaryHasCount(summary_str,
+                                  "record_latency_TensorDataset/_1", 1)
+      self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4",
+                                  1)
+      self._assertSummaryHasCount(summary_str,
+                                  "record_latency_PrefetchDataset/_6", 1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py
new file mode 100644
index 0000000000..586b4bee5f
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -0,0 +1,224 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the MapAndFilterFusion optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class MapAndFilterFusionTest(test.TestCase, parameterized.TestCase):
+
+  @staticmethod
+  def map_functions():
+    identity = lambda x: x
+    increment = lambda x: x + 1
+
+    def increment_and_square(x):
+      y = x + 1
+      return y * y
+
+    functions = [identity, increment, increment_and_square]
+    tests = []
+    for i, fun1 in enumerate(functions):
+      for j, fun2 in enumerate(functions):
+        tests.append((
+            "test_{}_{}".format(i, j),
+            [fun1, fun2],
+        ))
+        for k, fun3 in enumerate(functions):
+          tests.append((
+              "test_{}_{}_{}".format(i, j, k),
+              [fun1, fun2, fun3],
+          ))
+
+    swap = lambda x, n: (n, x)
+    tests.append((
+        "swap1",
+        [lambda x: (x, 42), swap],
+    ))
+    tests.append((
+        "swap2",
+        [lambda x: (x, 42), swap, swap],
+    ))
+    return tuple(tests)
+
+  @parameterized.named_parameters(*map_functions.__func__())
+  def testMapFusion(self, functions):
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["Map", "Prefetch"]))
+    for function in functions:
+      dataset = dataset.map(function)
+
+    dataset = dataset.prefetch(0).apply(optimization.optimize(["map_fusion"]))
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      for x in range(5):
+        result = sess.run(get_next)
+        r = x
+        for function in functions:
+          if isinstance(r, tuple):
+            r = function(*r)  # Pass tuple as multiple arguments.
+          else:
+            r = function(r)
+        self.assertAllEqual(r, result)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @staticmethod
+  def map_and_filter_functions():
+    identity = lambda x: x
+    increment = lambda x: x + 1
+    minus_five = lambda x: x - 5
+
+    def increment_and_square(x):
+      y = x + 1
+      return y * y
+
+    take_all = lambda x: constant_op.constant(True)
+    is_zero = lambda x: math_ops.equal(x, 0)
+    is_odd = lambda x: math_ops.equal(x % 2, 0)
+    greater = lambda x: math_ops.greater(x + 5, 0)
+
+    functions = [identity, increment, minus_five, increment_and_square]
+    filters = [take_all, is_zero, is_odd, greater]
+    tests = []
+
+    for x, fun in enumerate(functions):
+      for y, predicate in enumerate(filters):
+        tests.append(("mixed_{}_{}".format(x, y), fun, predicate))
+
+    # Multi output
+    tests.append(("multiOne", lambda x: (x, x),
+                  lambda x, y: constant_op.constant(True)))
+    tests.append(
+        ("multiTwo", lambda x: (x, 2),
+         lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0)))
+    return tuple(tests)
+
+  @parameterized.named_parameters(*map_and_filter_functions.__func__())
+  def testMapFilterFusion(self, function, predicate):
+    dataset = dataset_ops.Dataset.range(10).apply(
+        optimization.assert_next(
+            ["Map",
+             "FilterByLastComponent"])).map(function).filter(predicate).apply(
+                 optimization.optimize(["map_and_filter_fusion"]))
+    self._testMapAndFilter(dataset, function, predicate)
+
+  def _testMapAndFilter(self, dataset, function, predicate):
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      for x in range(10):
+        r = function(x)
+        if isinstance(r, tuple):
+          b = predicate(*r)  # Pass tuple as multiple arguments.
+        else:
+          b = predicate(r)
+        if sess.run(b):
+          result = sess.run(get_next)
+          self.assertAllEqual(r, result)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testAdditionalInputs(self):
+    a = constant_op.constant(3, dtype=dtypes.int64)
+    b = constant_op.constant(4, dtype=dtypes.int64)
+    some_tensor = math_ops.mul(a, b)
+    function = lambda x: x * x
+
+    def predicate(y):
+      return math_ops.less(math_ops.cast(y, dtypes.int64), some_tensor)
+
+    # We are currently not supporting functions with additional inputs.
+    dataset = dataset_ops.Dataset.range(10).apply(
+        optimization.assert_next(
+            ["Map", "Filter"])).map(function).filter(predicate).apply(
+                optimization.optimize(["map_and_filter_fusion"]))
+
+    self._testMapAndFilter(dataset, function, predicate)
+
+  @staticmethod
+  def filter_functions():
+    take_all = lambda x: constant_op.constant(True)
+    is_zero = lambda x: math_ops.equal(x, 0)
+    greater = lambda x: math_ops.greater(x + 5, 0)
+
+    tests = []
+    filters = [take_all, is_zero, greater]
+    identity = lambda x: x
+    for x, predicate_1 in enumerate(filters):
+      for y, predicate_2 in enumerate(filters):
+        tests.append(("mixed_{}_{}".format(x, y), identity,
+                      [predicate_1, predicate_2]))
+        for z, predicate_3 in enumerate(filters):
+          tests.append(("mixed_{}_{}_{}".format(x, y, z), identity,
+                        [predicate_1, predicate_2, predicate_3]))
+
+    take_all_multiple = lambda x, y: constant_op.constant(True)
+    # Multi output
+    tests.append(("multiOne", lambda x: (x, x),
+                  [take_all_multiple, take_all_multiple]))
+    tests.append(("multiTwo", lambda x: (x, 2), [
+        take_all_multiple,
+        lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0)
+    ]))
+    return tuple(tests)
+
+  @parameterized.named_parameters(*filter_functions.__func__())
+  def testFilterFusion(self, map_function, predicates):
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["Map", "Filter",
+                                  "Prefetch"])).map(map_function)
+    for predicate in predicates:
+      dataset = dataset.filter(predicate)
+
+    dataset = dataset.prefetch(0).apply(
+        optimization.optimize(["filter_fusion"]))
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      for x in range(5):
+        r = map_function(x)
+        filtered = False
+        for predicate in predicates:
+          if isinstance(r, tuple):
+            b = predicate(*r)  # Pass tuple as multiple arguments.
+          else:
+            b = predicate(r)
+          if not sess.run(b):
+            filtered = True
+            break
+
+        if not filtered:
+          result = sess.run(get_next)
+          self.assertAllEqual(r, result)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py
new file mode 100644
index 0000000000..57bf22591a
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/optimization/map_vectorization_test.py
@@ -0,0 +1,220 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the MapVectorization optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.data.python.kernel_tests import test_utils
+from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class MapVectorizationTest(test_utils.DatasetTestBase, parameterized.TestCase):
+
+  def _get_test_datasets(self,
+                         base_dataset,
+                         map_fn,
+                         num_parallel_calls=None,
+                         expect_optimized=True):
+    """Given base dataset and map fn, creates test datasets.
+
+    Returns a tuple of (unoptimized, dataset, optimized dataset). The
+    unoptimized dataset has the assertion that Batch follows Map. The optimized
+    dataset has the assertion that Map follows Batch, and has the
+    "map_vectorization" optimization applied.
+
+    Args:
+      base_dataset: Input dataset to map->batch
+      map_fn: Map function to use
+      num_parallel_calls: (Optional.) num_parallel_calls argument for map
+      expect_optimized: (Optional.) Whether we expect the optimization to take
+        place, in which case we will assert that Batch is followed by Map,
+        otherwise Map followed by Batch. Defaults to True.
+
+    Returns:
+      Tuple of (unoptimized dataset, optimized dataset).
+    """
+    map_node_name = "Map" if num_parallel_calls is None else "ParallelMap"
+    batch_size = 100
+
+    def _make_dataset(node_names):
+      return base_dataset.apply(optimization.assert_next(node_names)).map(
+          map_fn, num_parallel_calls=num_parallel_calls).batch(batch_size)
+
+    unoptimized = _make_dataset([map_node_name, "Batch"])
+    optimized = _make_dataset(["Batch", map_node_name] if expect_optimized else
+                              [map_node_name, "Batch"]).apply(
+                                  optimization.optimize(["map_vectorization"]))
+
+    return unoptimized, optimized
+
+  @parameterized.named_parameters(
+      ("Basic", lambda x: (x, x + 1), None),
+      ("Parallel", lambda x: (x, x + 1), 12),
+      ("Gather", lambda x: array_ops.gather(x, 0), 12),
+  )
+  def testOptimization(self, map_fn, num_parallel_calls):
+    base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2],
+                                                           [3, 4]]).repeat(5)
+    unoptimized, optimized = self._get_test_datasets(base_dataset, map_fn,
+                                                     num_parallel_calls)
+    self._assert_datasets_equal(unoptimized, optimized)
+
+  def testOptimizationBadMapFn(self):
+    # Test map functions that give an error
+    def map_fn(x):
+      # x has leading dimension 5, this will raise an error
+      return array_ops.gather(x, 10)
+
+    base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
+        5, drop_remainder=True)
+    _, optimized = self._get_test_datasets(base_dataset, map_fn)
+    nxt = optimized.make_one_shot_iterator().get_next()
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r"indices = 10 is not in \[0, 5\)"):
+      self.evaluate(nxt)
+
+  def testOptimizationWithCapturedInputs(self):
+    # Tests that vectorization works with captured inputs
+    def map_fn(x):
+      return x + y
+
+    y = constant_op.constant(1, shape=(2,))
+    base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2],
+                                                           [3, 4]]).repeat(5)
+    # TODO(rachelim): when this optimization works, turn on expect_optimized
+    unoptimized, optimized = self._get_test_datasets(
+        base_dataset, map_fn, expect_optimized=False)
+    self._assert_datasets_equal(optimized, unoptimized)
+
+  def testOptimizationIgnoreStateful(self):
+
+    def map_fn(x):
+      with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
+        return array_ops.identity(x)
+
+    base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2],
+                                                           [3, 4]]).repeat(5)
+    _, optimized = self._get_test_datasets(
+        base_dataset, map_fn, expect_optimized=False)
+    nxt = optimized.make_one_shot_iterator().get_next()
+
+    # NOTE: Right now, it raises an error because we can't save datasets that
+    # are stateful, and we rely on this saving mechanism to optimize datasets,
+    # so stateful functions can't be optimized.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "[Ss]tateful"):
+      self.evaluate(nxt)
+
+  def testOptimizationIgnoreRagged(self):
+    # Make sure we ignore inputs that might not be uniformly sized
+    def map_fn(x):
+      return array_ops.gather(x, 0)
+
+    # output_shape = (?,)
+    base_dataset = dataset_ops.Dataset.range(20).batch(3, drop_remainder=False)
+    unoptimized, optimized = self._get_test_datasets(
+        base_dataset, map_fn, expect_optimized=False)
+    self._assert_datasets_equal(unoptimized, optimized)
+
+  def testOptimizationIgnoreRaggedMap(self):
+    # Don't optimize when the output of the map fn shapes are unknown.
+    def map_fn(x):
+      return array_ops.tile(x, x)
+
+    base_dataset = dataset_ops.Dataset.range(20).batch(1, drop_remainder=True)
+    unoptimized, optimized = self._get_test_datasets(
+        base_dataset, map_fn, expect_optimized=False)
+    self._assert_datasets_raise_same_error(unoptimized, optimized,
+                                           errors.InvalidArgumentError)
+
+
+class MapVectorizationBenchmark(test.Benchmark):
+  # TODO(rachelim): Add a benchmark for more expensive transformations, such as
+  # vgg_preprocessing.
+
+  def _run(self, x, num_iters=100, name=None):
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        # Warm up session...
+        sess.run(x)
+      for _ in range(num_iters):
+        start = time.time()
+        sess.run(x)
+        end = time.time()
+        deltas.append(end - start)
+    median_time = np.median(deltas)
+    self.report_benchmark(iters=num_iters, wall_time=median_time, name=name)
+    return median_time
+
+  def benchmark_CheapFns(self):
+
+    input_sizes = [(10, 10, 3), (10, 100, 300)]
+    batch_size = 1000
+    for input_size in input_sizes:
+      input_dataset = dataset_ops.Dataset.from_tensor_slices(
+          (np.random.rand(*input_size), np.random.rand(*input_size))).repeat()
+      for map_fn, str_id in self._get_known_cheap_fns():
+        self._compare(input_dataset, map_fn, batch_size, input_size, str_id)
+
+  def _compare(self, input_dataset, map_fn, batch_size, input_size, str_id):
+    num_elems = np.prod(input_size)
+    name_template = "{}__batch_size_{}_input_size_{}_{}"
+    unoptimized = input_dataset.map(map_fn).batch(batch_size)
+    unoptimized_op = unoptimized.make_one_shot_iterator().get_next()
+
+    optimized = unoptimized.apply(optimization.optimize(["map_vectorization"]))
+    optimized_op = optimized.make_one_shot_iterator().get_next()
+
+    unoptimized_time = self._run(
+        unoptimized_op,
+        name=name_template.format(str_id, batch_size, num_elems, "unoptimized"))
+    optimized_time = self._run(
+        optimized_op,
+        name=name_template.format(str_id, batch_size, num_elems, "optimized"))
+
+    print("Batch size: {}\n"
+          "Input size: {}\n"
+          "Transformation: {}\n"
+          "Speedup: {}\n".format(batch_size, input_size, str_id,
+                                 (unoptimized_time / optimized_time)))
+
+  def _get_known_cheap_fns(self):
+    return [
+        (lambda *args: [array_ops.identity(x) for x in args], "identity"),
+        (lambda *args: [x + 1 for x in args], "add_const"),
+        (lambda *args: args[0], "select"),
+        (lambda *args: [math_ops.cast(x, dtypes.float64) for x in args],
+         "cast"),
+    ]
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
index ae147b4fa7..ca38f8e2f9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
@@ -19,14 +19,9 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
-from tensorflow.contrib.data.python.kernel_tests import stats_dataset_test_base
 from tensorflow.contrib.data.python.ops import optimization
-from tensorflow.contrib.data.python.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -105,7 +100,10 @@ class OptimizeDatasetTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testFunctionLibraryDefinitionModification(self):
+  # TODO(b/112914454): Remove the test or figure out way to copy only new
+  # functions in optimize_dataset_op instead of taking union of old and new
+  # functions.
+  def _testFunctionLibraryDefinitionModification(self):
     dataset = dataset_ops.Dataset.from_tensors(0).map(lambda x: x).apply(
         optimization.optimize(["_test_only_function_rename"]))
     iterator = dataset.make_one_shot_iterator()
@@ -116,166 +114,6 @@ class OptimizeDatasetTest(test.TestCase, parameterized.TestCase):
                                    "Function .* is not defined."):
         sess.run(get_next)
 
-  @staticmethod
-  def map_functions():
-    identity = lambda x: x
-    increment = lambda x: x + 1
-
-    def increment_and_square(x):
-      y = x + 1
-      return y * y
-
-    functions = [identity, increment, increment_and_square]
-    tests = []
-    for i, fun1 in enumerate(functions):
-      for j, fun2 in enumerate(functions):
-        tests.append((
-            "test_{}_{}".format(i, j),
-            [fun1, fun2],
-        ))
-        for k, fun3 in enumerate(functions):
-          tests.append((
-              "test_{}_{}_{}".format(i, j, k),
-              [fun1, fun2, fun3],
-          ))
-
-    swap = lambda x, n: (n, x)
-    tests.append((
-        "swap1",
-        [lambda x: (x, 42), swap],
-    ))
-    tests.append((
-        "swap2",
-        [lambda x: (x, 42), swap, swap],
-    ))
-    return tuple(tests)
-
-  @parameterized.named_parameters(*map_functions.__func__())
-  def testMapFusion(self, functions):
-    dataset = dataset_ops.Dataset.range(5).apply(
-        optimization.assert_next(["Map", "Prefetch"]))
-    for function in functions:
-      dataset = dataset.map(function)
-
-    dataset = dataset.prefetch(0).apply(optimization.optimize(["map_fusion"]))
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.test_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        r = x
-        for function in functions:
-          if isinstance(r, tuple):
-            r = function(*r)  # Pass tuple as multiple arguments.
-          else:
-            r = function(r)
-        self.assertAllEqual(r, result)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @staticmethod
-  def map_and_filter_functions():
-    identity = lambda x: x
-    increment = lambda x: x + 1
-    minus_five = lambda x: x - 5
-
-    def increment_and_square(x):
-      y = x + 1
-      return y * y
-
-    take_all = lambda x: constant_op.constant(True)
-    is_zero = lambda x: math_ops.equal(x, 0)
-    is_odd = lambda x: math_ops.equal(x % 2, 0)
-    greater = lambda x: math_ops.greater(x + 5, 0)
-
-    functions = [identity, increment, minus_five, increment_and_square]
-    filters = [take_all, is_zero, is_odd, greater]
-    tests = []
-
-    for x, fun in enumerate(functions):
-      for y, predicate in enumerate(filters):
-        tests.append(("mixed_{}_{}".format(x, y), fun, predicate))
-
-    # Multi output
-    tests.append(("multiOne", lambda x: (x, x),
-                  lambda x, y: constant_op.constant(True)))
-    tests.append(
-        ("multiTwo", lambda x: (x, 2),
-         lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0)))
-    return tuple(tests)
-
-  @parameterized.named_parameters(*map_and_filter_functions.__func__())
-  def testMapFilterFusion(self, function, predicate):
-    dataset = dataset_ops.Dataset.range(10).apply(
-        optimization.assert_next(
-            ["Map",
-             "FilterByLastComponent"])).map(function).filter(predicate).apply(
-                 optimization.optimize(["map_and_filter_fusion"]))
-    self._testMapAndFilter(dataset, function, predicate)
-
-  def _testMapAndFilter(self, dataset, function, predicate):
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.test_session() as sess:
-      for x in range(10):
-        r = function(x)
-        if isinstance(r, tuple):
-          b = predicate(*r)  # Pass tuple as multiple arguments.
-        else:
-          b = predicate(r)
-        if sess.run(b):
-          result = sess.run(get_next)
-          self.assertAllEqual(r, result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testAdditionalInputs(self):
-    a = constant_op.constant(3, dtype=dtypes.int64)
-    b = constant_op.constant(4, dtype=dtypes.int64)
-    some_tensor = math_ops.mul(a, b)
-    function = lambda x: x * x
-
-    def predicate(y):
-      return math_ops.less(math_ops.cast(y, dtypes.int64), some_tensor)
-
-    # We are currently not supporting functions with additional inputs.
-    dataset = dataset_ops.Dataset.range(10).apply(
-        optimization.assert_next(
-            ["Map", "Filter"])).map(function).filter(predicate).apply(
-                optimization.optimize(["map_and_filter_fusion"]))
-
-    self._testMapAndFilter(dataset, function, predicate)
-
-
-class OptimizeStatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
-
-  def testLatencyStatsOptimization(self):
-
-    stats_aggregator = stats_ops.StatsAggregator()
-    dataset = dataset_ops.Dataset.from_tensors(1).apply(
-        optimization.assert_next(
-            ["LatencyStats", "Map", "LatencyStats", "Prefetch",
-             "LatencyStats"])).map(lambda x: x * x).prefetch(1).apply(
-                 optimization.optimize(["latency_all_edges"])).apply(
-                     stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
-
-    with self.test_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertEqual(1 * 1, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      summary_str = sess.run(summary_t)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_TensorDataset/_1", 1)
-      self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4",
-                                  1)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_PrefetchDataset/_6", 1)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py
new file mode 100644
index 0000000000..f6c4a984b8
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/parsing_ops_test.py
@@ -0,0 +1,850 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.parsing_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import parsing_ops as contrib_parsing_ops
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+# Helpers for creating Example objects
+example = example_pb2.Example
+feature = feature_pb2.Feature
+features = lambda d: feature_pb2.Features(feature=d)
+bytes_feature = lambda v: feature(bytes_list=feature_pb2.BytesList(value=v))
+int64_feature = lambda v: feature(int64_list=feature_pb2.Int64List(value=v))
+float_feature = lambda v: feature(float_list=feature_pb2.FloatList(value=v))
+# Helpers for creating SequenceExample objects
+feature_list = lambda l: feature_pb2.FeatureList(feature=l)
+feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d)
+sequence_example = example_pb2.SequenceExample
+
+
+def _compare_output_to_expected(tester, dict_tensors, expected_tensors,
+                                flat_output):
+  tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys()))
+
+  i = 0  # Index into the flattened output of session.run()
+  for k, v in sorted(dict_tensors.items()):
+    # TODO(shivaniagrawal): flat_output is same as v.
+    expected_v = expected_tensors[k]
+    tf_logging.info("Comparing key: %s", k)
+    print("i", i, "flat_output", flat_output[i], "expected_v", expected_v)
+    if sparse_tensor.is_sparse(v):
+      # Three outputs for SparseTensor : indices, values, shape.
+      tester.assertEqual([k, len(expected_v)], [k, 3])
+      print("i", i, "flat_output", flat_output[i].indices, "expected_v",
+            expected_v[0])
+      tester.assertAllEqual(expected_v[0], flat_output[i].indices)
+      tester.assertAllEqual(expected_v[1], flat_output[i].values)
+      tester.assertAllEqual(expected_v[2], flat_output[i].dense_shape)
+    else:
+      # One output for standard Tensor.
+      tester.assertAllEqual(expected_v, flat_output[i])
+    i += 1
+
+
+class ParseExampleTest(test.TestCase):
+
+  def _test(self,
+            input_tensor,
+            feature_val,
+            expected_values=None,
+            expected_err=None):
+
+    with self.test_session() as sess:
+      if expected_err:
+        with self.assertRaisesWithPredicateMatch(expected_err[0],
+                                                 expected_err[1]):
+          dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
+              contrib_parsing_ops.parse_example_dataset(feature_val))
+          get_next = dataset.make_one_shot_iterator().get_next()
+          sess.run(get_next)
+        return
+      else:
+        # Returns dict w/ Tensors and SparseTensors.
+        # Check values.
+        dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
+            contrib_parsing_ops.parse_example_dataset(feature_val))
+        get_next = dataset.make_one_shot_iterator().get_next()
+        result = sess.run(get_next)
+        flattened = nest.flatten(result)
+        print("result", result, "expected_values", expected_values)
+        _compare_output_to_expected(self, result, expected_values, flattened)
+
+      # Check shapes; if serialized is a Tensor we need its size to
+      # properly check.
+      batch_size = (
+          input_tensor.eval().size if isinstance(input_tensor, ops.Tensor) else
+          np.asarray(input_tensor).size)
+      for k, f in feature_val.items():
+        print("output_shapes as list ",
+              tuple(dataset.output_shapes[k].as_list()))
+        if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
+          self.assertEqual(dataset.output_shapes[k].as_list()[0], batch_size)
+        elif isinstance(f, parsing_ops.VarLenFeature):
+          self.assertEqual(dataset.output_shapes[k].as_list()[1], None)
+
+  def testEmptySerializedWithAllDefaults(self):
+    sparse_name = "st_a"
+    a_name = "a"
+    b_name = "b"
+    c_name = "c:has_a_tricky_name"
+    a_default = [0, 42, 0]
+    b_default = np.random.rand(3, 3).astype(bytes)
+    c_default = np.random.rand(2).astype(np.float32)
+
+    expected_st_a = (  # indices, values, shape
+        np.empty(
+            (0, 2), dtype=np.int64),  # indices
+        np.empty(
+            (0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array(
+            [2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
+
+    expected_output = {
+        sparse_name: expected_st_a,
+        a_name: np.array(2 * [[a_default]]),
+        b_name: np.array(2 * [b_default]),
+        c_name: np.array(2 * [c_default]),
+    }
+
+    self._test(
+        ops.convert_to_tensor(["", ""]), {
+            sparse_name:
+                parsing_ops.VarLenFeature(dtypes.int64),
+            a_name:
+                parsing_ops.FixedLenFeature(
+                    (1, 3), dtypes.int64, default_value=a_default),
+            b_name:
+                parsing_ops.FixedLenFeature(
+                    (3, 3), dtypes.string, default_value=b_default),
+            c_name:
+                parsing_ops.FixedLenFeature(
+                    (2,), dtypes.float32, default_value=c_default),
+        },
+        expected_values=expected_output)
+
+  def testEmptySerializedWithoutDefaultsShouldFail(self):
+    input_features = {
+        "st_a":
+            parsing_ops.VarLenFeature(dtypes.int64),
+        "a":
+            parsing_ops.FixedLenFeature(
+                (1, 3), dtypes.int64, default_value=[0, 42, 0]),
+        "b":
+            parsing_ops.FixedLenFeature(
+                (3, 3),
+                dtypes.string,
+                default_value=np.random.rand(3, 3).astype(bytes)),
+        # Feature "c" is missing a default, this gap will cause failure.
+        "c":
+            parsing_ops.FixedLenFeature(
+                (2,), dtype=dtypes.float32),
+    }
+
+    # Edge case where the key is there but the feature value is empty
+    original = example(features=features({"c": feature()}))
+    self._test(
+        [original.SerializeToString()],
+        input_features,
+        expected_err=(errors_impl.InvalidArgumentError,
+                      "Feature: c \\(data type: float\\) is required"))
+
+    # Standard case of missing key and value.
+    self._test(
+        ["", ""],
+        input_features,
+        expected_err=(errors_impl.InvalidArgumentError,
+                      "Feature: c \\(data type: float\\) is required"))
+
+  def testDenseNotMatchingShapeShouldFail(self):
+    original = [
+        example(features=features({
+            "a": float_feature([1, 1, 3]),
+        })), example(features=features({
+            "a": float_feature([-1, -1]),
+        }))
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    self._test(
+        ops.convert_to_tensor(serialized),
+        {"a": parsing_ops.FixedLenFeature((1, 3), dtypes.float32)},
+        expected_err=(errors_impl.InvalidArgumentError,
+                      "Key: a, Index: 1.  Number of float values"))
+
+  def testDenseDefaultNoShapeShouldFail(self):
+    original = [example(features=features({"a": float_feature([1, 1, 3]),})),]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    self._test(
+        ops.convert_to_tensor(serialized),
+        {"a": parsing_ops.FixedLenFeature(None, dtypes.float32)},
+        expected_err=(ValueError, "Missing shape for feature a"))
+
+  def testSerializedContainingSparse(self):
+    original = [
+        example(features=features({
+            "st_c": float_feature([3, 4])
+        })),
+        example(features=features({
+            "st_c": float_feature([]),  # empty float list
+        })),
+        example(features=features({
+            "st_d": feature(),  # feature with nothing in it
+        })),
+        example(features=features({
+            "st_c": float_feature([1, 2, -1]),
+            "st_d": bytes_feature([b"hi"])
+        }))
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_st_c = (  # indices, values, shape
+        np.array(
+            [[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), np.array(
+                [3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), np.array(
+                    [4, 3], dtype=np.int64))  # batch == 2, max_elems = 3
+
+    expected_st_d = (  # indices, values, shape
+        np.array(
+            [[3, 0]], dtype=np.int64), np.array(
+                ["hi"], dtype=bytes), np.array(
+                    [4, 1], dtype=np.int64))  # batch == 2, max_elems = 1
+
+    expected_output = {
+        "st_c": expected_st_c,
+        "st_d": expected_st_d,
+    }
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            "st_c": parsing_ops.VarLenFeature(dtypes.float32),
+            "st_d": parsing_ops.VarLenFeature(dtypes.string)
+        },
+        expected_values=expected_output)
+
+  def testSerializedContainingSparseFeature(self):
+    original = [
+        example(features=features({
+            "val": float_feature([3, 4]),
+            "idx": int64_feature([5, 10])
+        })),
+        example(features=features({
+            "val": float_feature([]),  # empty float list
+            "idx": int64_feature([])
+        })),
+        example(features=features({
+            "val": feature(),  # feature with nothing in it
+            # missing idx feature
+        })),
+        example(features=features({
+            "val": float_feature([1, 2, -1]),
+            "idx":
+                int64_feature([0, 9, 3])  # unsorted
+        }))
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_sp = (  # indices, values, shape
+        np.array(
+            [[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64),
+        np.array(
+            [3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32), np.array(
+                [4, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+
+    expected_output = {"sp": expected_sp,}
+
+    self._test(
+        ops.convert_to_tensor(serialized),
+        {"sp": parsing_ops.SparseFeature(["idx"], "val", dtypes.float32, [13])},
+        expected_values=expected_output)
+
+  def testSerializedContainingSparseFeatureReuse(self):
+    original = [
+        example(features=features({
+            "val1": float_feature([3, 4]),
+            "val2": float_feature([5, 6]),
+            "idx": int64_feature([5, 10])
+        })),
+        example(features=features({
+            "val1": float_feature([]),  # empty float list
+            "idx": int64_feature([])
+        })),
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_sp1 = (  # indices, values, shape
+        np.array(
+            [[0, 5], [0, 10]], dtype=np.int64), np.array(
+                [3.0, 4.0], dtype=np.float32), np.array(
+                    [2, 13], dtype=np.int64))  # batch == 2, max_elems = 13
+
+    expected_sp2 = (  # indices, values, shape
+        np.array(
+            [[0, 5], [0, 10]], dtype=np.int64), np.array(
+                [5.0, 6.0], dtype=np.float32), np.array(
+                    [2, 7], dtype=np.int64))  # batch == 2, max_elems = 13
+
+    expected_output = {
+        "sp1": expected_sp1,
+        "sp2": expected_sp2,
+    }
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            "sp1":
+                parsing_ops.SparseFeature("idx", "val1", dtypes.float32, 13),
+            "sp2":
+                parsing_ops.SparseFeature(
+                    "idx", "val2", dtypes.float32, size=7, already_sorted=True)
+        },
+        expected_values=expected_output)
+
+  def testSerializedContaining3DSparseFeature(self):
+    original = [
+        example(features=features({
+            "val": float_feature([3, 4]),
+            "idx0": int64_feature([5, 10]),
+            "idx1": int64_feature([0, 2]),
+        })),
+        example(features=features({
+            "val": float_feature([]),  # empty float list
+            "idx0": int64_feature([]),
+            "idx1": int64_feature([]),
+        })),
+        example(features=features({
+            "val": feature(),  # feature with nothing in it
+            # missing idx feature
+        })),
+        example(features=features({
+            "val": float_feature([1, 2, -1]),
+            "idx0": int64_feature([0, 9, 3]),  # unsorted
+            "idx1": int64_feature([1, 0, 2]),
+        }))
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_sp = (
+        # indices
+        np.array(
+            [[0, 5, 0], [0, 10, 2], [3, 0, 1], [3, 3, 2], [3, 9, 0]],
+            dtype=np.int64),
+        # values
+        np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32),
+        # shape batch == 4, max_elems = 13
+        np.array([4, 13, 3], dtype=np.int64))
+
+    expected_output = {"sp": expected_sp,}
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            "sp":
+                parsing_ops.SparseFeature(["idx0", "idx1"], "val",
+                                          dtypes.float32, [13, 3])
+        },
+        expected_values=expected_output)
+
+  def testSerializedContainingDense(self):
+    aname = "a"
+    bname = "b*has+a:tricky_name"
+    original = [
+        example(features=features({
+            aname: float_feature([1, 1]),
+            bname: bytes_feature([b"b0_str"]),
+        })), example(features=features({
+            aname: float_feature([-1, -1]),
+            bname: bytes_feature([b""]),
+        }))
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_output = {
+        aname:
+            np.array(
+                [[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1),
+        bname:
+            np.array(
+                ["b0_str", ""], dtype=bytes).reshape(2, 1, 1, 1, 1),
+    }
+
+    # No defaults, values required
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32),
+            bname:
+                parsing_ops.FixedLenFeature((1, 1, 1, 1), dtype=dtypes.string),
+        },
+        expected_values=expected_output)
+
+  # This test is identical as the previous one except
+  # for the creation of 'serialized'.
+  def testSerializedContainingDenseWithConcat(self):
+    aname = "a"
+    bname = "b*has+a:tricky_name"
+    # TODO(lew): Feature appearing twice should be an error in future.
+    original = [
+        (example(features=features({
+            aname: float_feature([10, 10]),
+        })), example(features=features({
+            aname: float_feature([1, 1]),
+            bname: bytes_feature([b"b0_str"]),
+        }))),
+        (
+            example(features=features({
+                bname: bytes_feature([b"b100"]),
+            })),
+            example(features=features({
+                aname: float_feature([-1, -1]),
+                bname: bytes_feature([b"b1"]),
+            })),),
+    ]
+
+    serialized = [
+        m.SerializeToString() + n.SerializeToString() for (m, n) in original
+    ]
+
+    expected_output = {
+        aname:
+            np.array(
+                [[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1),
+        bname:
+            np.array(
+                ["b0_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1),
+    }
+
+    # No defaults, values required
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32),
+            bname:
+                parsing_ops.FixedLenFeature((1, 1, 1, 1), dtype=dtypes.string),
+        },
+        expected_values=expected_output)
+
+  def testSerializedContainingDenseScalar(self):
+    original = [
+        example(features=features({
+            "a": float_feature([1]),
+        })), example(features=features({}))
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_output = {
+        "a":
+            np.array(
+                [[1], [-1]], dtype=np.float32)  # 2x1 (column vector)
+    }
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            "a":
+                parsing_ops.FixedLenFeature(
+                    (1,), dtype=dtypes.float32, default_value=-1),
+        },
+        expected_values=expected_output)
+
+  def testSerializedContainingDenseWithDefaults(self):
+    original = [
+        example(features=features({
+            "a": float_feature([1, 1]),
+        })),
+        example(features=features({
+            "b": bytes_feature([b"b1"]),
+        })),
+        example(features=features({
+            "b": feature()
+        })),
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_output = {
+        "a":
+            np.array(
+                [[1, 1], [3, -3], [3, -3]], dtype=np.float32).reshape(3, 1, 2,
+                                                                      1),
+        "b":
+            np.array(
+                ["tmp_str", "b1", "tmp_str"], dtype=bytes).reshape(3, 1, 1, 1,
+                                                                   1),
+    }
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            "a":
+                parsing_ops.FixedLenFeature(
+                    (1, 2, 1), dtype=dtypes.float32, default_value=[3.0, -3.0]),
+            "b":
+                parsing_ops.FixedLenFeature(
+                    (1, 1, 1, 1), dtype=dtypes.string, default_value="tmp_str"),
+        },
+        expected_values=expected_output)
+
+  def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
+    expected_st_a = (  # indices, values, shape
+        np.empty(
+            (0, 2), dtype=np.int64),  # indices
+        np.empty(
+            (0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array(
+            [2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
+    expected_sp = (  # indices, values, shape
+        np.array(
+            [[0, 0], [0, 3], [1, 7]], dtype=np.int64), np.array(
+                ["a", "b", "c"], dtype="|S"), np.array(
+                    [2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+
+    original = [
+        example(features=features({
+            "c": float_feature([3, 4]),
+            "val": bytes_feature([b"a", b"b"]),
+            "idx": int64_feature([0, 3])
+        })), example(features=features({
+            "c": float_feature([1, 2]),
+            "val": bytes_feature([b"c"]),
+            "idx": int64_feature([7])
+        }))
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    a_default = [1, 2, 3]
+    b_default = np.random.rand(3, 3).astype(bytes)
+    expected_output = {
+        "st_a": expected_st_a,
+        "sp": expected_sp,
+        "a": np.array(2 * [[a_default]]),
+        "b": np.array(2 * [b_default]),
+        "c": np.array(
+            [[3, 4], [1, 2]], dtype=np.float32),
+    }
+
+    self._test(
+        ops.convert_to_tensor(serialized),
+        {
+            "st_a":
+                parsing_ops.VarLenFeature(dtypes.int64),
+            "sp":
+                parsing_ops.SparseFeature("idx", "val", dtypes.string, 13),
+            "a":
+                parsing_ops.FixedLenFeature(
+                    (1, 3), dtypes.int64, default_value=a_default),
+            "b":
+                parsing_ops.FixedLenFeature(
+                    (3, 3), dtypes.string, default_value=b_default),
+            # Feature "c" must be provided, since it has no default_value.
+            "c":
+                parsing_ops.FixedLenFeature((2,), dtypes.float32),
+        },
+        expected_values=expected_output)
+
+  def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
+    expected_idx = (  # indices, values, shape
+        np.array(
+            [[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
+        np.array([0, 3, 7, 1]), np.array(
+            [2, 2], dtype=np.int64))  # batch == 4, max_elems = 2
+
+    expected_sp = (  # indices, values, shape
+        np.array(
+            [[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64), np.array(
+                ["a", "b", "d", "c"], dtype="|S"), np.array(
+                    [2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+
+    original = [
+        example(features=features({
+            "val": bytes_feature([b"a", b"b"]),
+            "idx": int64_feature([0, 3])
+        })), example(features=features({
+            "val": bytes_feature([b"c", b"d"]),
+            "idx": int64_feature([7, 1])
+        }))
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_output = {
+        "idx": expected_idx,
+        "sp": expected_sp,
+    }
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            "idx":
+                parsing_ops.VarLenFeature(dtypes.int64),
+            "sp":
+                parsing_ops.SparseFeature(["idx"], "val", dtypes.string, [13]),
+        },
+        expected_values=expected_output)
+
+  def _testSerializedContainingVarLenDenseLargerBatch(self, batch_size):
+    # During parsing, data read from the serialized proto is stored in buffers.
+    # For small batch sizes, a buffer will contain one minibatch entry.
+    # For larger batch sizes, a buffer may contain several minibatch
+    # entries.  This test identified a bug where the code that copied
+    # data out of the buffers and into the output tensors assumed each
+    # buffer only contained one minibatch entry.  The bug has since been fixed.
+    truth_int = [i for i in range(batch_size)]
+    truth_str = [[("foo%d" % i).encode(), ("bar%d" % i).encode()]
+                 for i in range(batch_size)]
+
+    expected_str = copy.deepcopy(truth_str)
+
+    # Delete some intermediate entries
+    for i in range(batch_size):
+      col = 1
+      if np.random.rand() < 0.25:
+        # w.p. 25%, drop out the second entry
+        expected_str[i][col] = b"default"
+        col -= 1
+        truth_str[i].pop()
+      if np.random.rand() < 0.25:
+        # w.p. 25%, drop out the second entry (possibly again)
+        expected_str[i][col] = b"default"
+        truth_str[i].pop()
+
+    expected_output = {
+        # Batch size batch_size, 1 time step.
+        "a": np.array(truth_int, dtype=np.int64).reshape(batch_size, 1),
+        # Batch size batch_size, 2 time steps.
+        "b": np.array(expected_str, dtype="|S").reshape(batch_size, 2),
+    }
+
+    original = [
+        example(features=features(
+            {"a": int64_feature([truth_int[i]]),
+             "b": bytes_feature(truth_str[i])}))
+        for i in range(batch_size)
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    self._test(
+        ops.convert_to_tensor(serialized, dtype=dtypes.string), {
+            "a":
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=(),
+                    dtype=dtypes.int64,
+                    allow_missing=True,
+                    default_value=-1),
+            "b":
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[],
+                    dtype=dtypes.string,
+                    allow_missing=True,
+                    default_value="default"),
+        },
+        expected_values=expected_output)
+
+  def testSerializedContainingVarLenDenseLargerBatch(self):
+    np.random.seed(3456)
+    for batch_size in (1, 10, 20, 100, 256):
+      self._testSerializedContainingVarLenDenseLargerBatch(batch_size)
+
+  def testSerializedContainingVarLenDense(self):
+    aname = "a"
+    bname = "b"
+    cname = "c"
+    dname = "d"
+    original = [
+        example(features=features({
+            cname: int64_feature([2]),
+        })),
+        example(features=features({
+            aname: float_feature([1, 1]),
+            bname: bytes_feature([b"b0_str", b"b1_str"]),
+        })),
+        example(features=features({
+            aname: float_feature([-1, -1, 2, 2]),
+            bname: bytes_feature([b"b1"]),
+        })),
+        example(features=features({
+            aname: float_feature([]),
+            cname: int64_feature([3]),
+        })),
+    ]
+
+    serialized = [m.SerializeToString() for m in original]
+
+    expected_output = {
+        aname:
+            np.array(
+                [
+                    [0, 0, 0, 0],
+                    [1, 1, 0, 0],
+                    [-1, -1, 2, 2],
+                    [0, 0, 0, 0],
+                ],
+                dtype=np.float32).reshape(4, 2, 2, 1),
+        bname:
+            np.array(
+                [["", ""], ["b0_str", "b1_str"], ["b1", ""], ["", ""]],
+                dtype=bytes).reshape(4, 2, 1, 1, 1),
+        cname:
+            np.array([2, 0, 0, 3], dtype=np.int64).reshape(4, 1),
+        dname:
+            np.empty(shape=(4, 0), dtype=bytes),
+    }
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1), dtype=dtypes.float32, allow_missing=True),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+            cname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.int64, allow_missing=True),
+            dname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.string, allow_missing=True),
+        },
+        expected_values=expected_output)
+
+    # Test with padding values.
+    expected_output_custom_padding = dict(expected_output)
+    expected_output_custom_padding[aname] = np.array(
+        [
+            [-2, -2, -2, -2],
+            [1, 1, -2, -2],
+            [-1, -1, 2, 2],
+            [-2, -2, -2, -2],
+        ],
+        dtype=np.float32).reshape(4, 2, 2, 1)
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1),
+                    dtype=dtypes.float32,
+                    allow_missing=True,
+                    default_value=-2.0),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+            cname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.int64, allow_missing=True),
+            dname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.string, allow_missing=True),
+        }, expected_output_custom_padding)
+
+    # Change number of required values so the inputs are not a
+    # multiple of this size.
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1), dtype=dtypes.float32, allow_missing=True),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+        },
+        expected_err=(
+            errors_impl.OpError, "Key: b, Index: 2.  "
+            "Number of bytes values is not a multiple of stride length."))
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1),
+                    dtype=dtypes.float32,
+                    allow_missing=True,
+                    default_value=[]),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+        },
+        expected_err=(ValueError,
+                      "Cannot reshape a tensor with 0 elements to shape"))
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenFeature((None, 2, 1), dtype=dtypes.float32),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+        },
+        expected_err=(ValueError,
+                      "First dimension of shape for feature a unknown. "
+                      "Consider using FixedLenSequenceFeature."))
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            cname:
+                parsing_ops.FixedLenFeature(
+                    (1, None), dtype=dtypes.int64, default_value=[[1]]),
+        },
+        expected_err=(ValueError,
+                      "All dimensions of shape for feature c need to be known "
+                      r"but received \(1, None\)."))
+
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1), dtype=dtypes.float32, allow_missing=True),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+            cname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.int64, allow_missing=False),
+            dname:
+                parsing_ops.FixedLenSequenceFeature(
+                    shape=[], dtype=dtypes.string, allow_missing=True),
+        },
+        expected_err=(ValueError,
+                      "Unsupported: FixedLenSequenceFeature requires "
+                      "allow_missing to be True."))
+
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 15b342d30f..64fe6dae24 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -43,7 +43,7 @@ class ReadBatchFeaturesTest(
     for batch_size in [1, 2]:
       for num_epochs in [1, 10]:
         with ops.Graph().as_default() as g:
-          with self.test_session(graph=g) as sess:
+          with self.session(graph=g) as sess:
             # Basic test: read from file 0.
             self.outputs = self.make_batch_feature(
                 filenames=self.test_filenames[0],
@@ -54,7 +54,7 @@ class ReadBatchFeaturesTest(
               self._next_actual_batch(sess)
 
         with ops.Graph().as_default() as g:
-          with self.test_session(graph=g) as sess:
+          with self.session(graph=g) as sess:
             # Basic test: read from file 1.
             self.outputs = self.make_batch_feature(
                 filenames=self.test_filenames[1],
@@ -65,7 +65,7 @@ class ReadBatchFeaturesTest(
               self._next_actual_batch(sess)
 
         with ops.Graph().as_default() as g:
-          with self.test_session(graph=g) as sess:
+          with self.session(graph=g) as sess:
             # Basic test: read from both files.
             self.outputs = self.make_batch_feature(
                 filenames=self.test_filenames,
@@ -104,7 +104,7 @@ class ReadBatchFeaturesTest(
     for batch_size in [1, 2]:
       # Test that shuffling with same seed produces the same result.
       with ops.Graph().as_default() as g:
-        with self.test_session(graph=g) as sess:
+        with self.session(graph=g) as sess:
           outputs1 = self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
@@ -125,7 +125,7 @@ class ReadBatchFeaturesTest(
 
       # Test that shuffling with different seeds produces a different order.
       with ops.Graph().as_default() as g:
-        with self.test_session(graph=g) as sess:
+        with self.session(graph=g) as sess:
           outputs1 = self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
@@ -152,7 +152,7 @@ class ReadBatchFeaturesTest(
       for reader_num_threads in [2, 4]:
         for parser_num_threads in [2, 4]:
           with ops.Graph().as_default() as g:
-            with self.test_session(graph=g) as sess:
+            with self.session(graph=g) as sess:
               self.outputs = self.make_batch_feature(
                   filenames=self.test_filenames,
                   num_epochs=num_epochs,
@@ -275,7 +275,7 @@ class MakeCsvDatasetTest(test.TestCase):
     filenames = self._setup_files(
         inputs, compression_type=kwargs.get("compression_type", None))
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         dataset = self._make_csv_dataset(
             filenames,
             batch_size=batch_size,
@@ -740,7 +740,7 @@ class MakeCsvDatasetTest(test.TestCase):
     total_records = 20
     for batch_size in [1, 2]:
       with ops.Graph().as_default() as g:
-        with self.test_session(graph=g) as sess:
+        with self.session(graph=g) as sess:
           # Test that shuffling with the same seed produces the same result
           dataset1 = self._make_csv_dataset(
               filenames,
@@ -771,7 +771,7 @@ class MakeCsvDatasetTest(test.TestCase):
               self.assertAllEqual(batch1[i], batch2[i])
 
       with ops.Graph().as_default() as g:
-        with self.test_session(graph=g) as sess:
+        with self.session(graph=g) as sess:
           # Test that shuffling with a different seed produces different results
           dataset1 = self._make_csv_dataset(
               filenames,
@@ -909,7 +909,7 @@ class MakeTFRecordDatasetTest(
       fn = None
 
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         outputs = readers.make_tf_record_dataset(
             file_pattern=file_pattern,
             num_epochs=num_epochs,
@@ -965,7 +965,7 @@ class MakeTFRecordDatasetTest(
   def _shuffle_test(self, batch_size, num_epochs, num_parallel_reads=1,
                     seed=None):
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         dataset = readers.make_tf_record_dataset(
             file_pattern=self.test_filenames,
             num_epochs=num_epochs,
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD b/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD
index 7b9ea191a4..4881f63ab9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD
@@ -318,6 +318,19 @@ py_test(
 )
 
 py_test(
+    name = "parse_example_dataset_serialization_test",
+    size = "medium",
+    srcs = ["parse_example_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/contrib/data/python/kernel_tests:reader_dataset_ops_test_base",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
     name = "prefetch_dataset_serialization_test",
     size = "small",
     srcs = ["prefetch_dataset_serialization_test.py"],
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py
index 3ed4dfb729..595cecef4d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py
@@ -252,7 +252,7 @@ class DatasetSerializationTestBase(test.TestCase):
       init_op, get_next_op = self._get_iterator_ops_from_collection(
           ds_fn, sparse_tensors=sparse_tensors)
       get_next_op = remove_variants(get_next_op)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         self._restore(saver, sess)
         self._initialize(init_op, sess)
         for _ in range(num_outputs):
@@ -315,7 +315,7 @@ class DatasetSerializationTestBase(test.TestCase):
       _, get_next_op, saver = self._build_graph(
           ds_fn2, sparse_tensors=sparse_tensors)
       get_next_op = remove_variants(get_next_op)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         self._restore(saver, sess)
         for _ in range(num_outputs - break_point):
           actual.append(sess.run(get_next_op))
@@ -376,7 +376,7 @@ class DatasetSerializationTestBase(test.TestCase):
       get_next_op, saver = self._build_empty_graph(
           ds_fn, sparse_tensors=sparse_tensors)
       get_next_op = remove_variants(get_next_op)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         self._restore(saver, sess)
         for _ in range(num_outputs - break_point):
           actual.append(sess.run(get_next_op))
@@ -410,7 +410,7 @@ class DatasetSerializationTestBase(test.TestCase):
       init_op, get_next_op, saver = self._build_graph(
           ds_fn, sparse_tensors=sparse_tensors)
       get_next_op = remove_variants(get_next_op)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         self._initialize(init_op, sess)
         for _ in range(break_point):
           sess.run(get_next_op)
@@ -510,14 +510,13 @@ class DatasetSerializationTestBase(test.TestCase):
       else:
         init_op, get_next_op, saver = self._build_graph(
             ds_fn, sparse_tensors=sparse_tensors)
-      get_next_op = remove_variants(get_next_op)
       return init_op, get_next_op, saver
 
     for i in range(len(break_points) + 1):
       with ops.Graph().as_default() as g:
         init_op, get_next_op, saver = get_ops()
         get_next_op = remove_variants(get_next_op)
-        with self.test_session(graph=g) as sess:
+        with self.session(graph=g) as sess:
           if ckpt_saved:
             if init_before_restore:
               self._initialize(init_op, sess)
@@ -616,29 +615,40 @@ class DatasetSerializationTestBase(test.TestCase):
     # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections
     # do not support tuples we flatten the tensors and restore the shape in
     # `_get_iterator_ops_from_collection`.
-
-    # TODO(shivaniagrwal): `output_classes` is a nested structure of classes,
-    # this base class is specific to current test cases. Update when tests are
-    # added with `output_classes` as a nested structure with at least one of the
-    # component being `tf.SparseTensor`.
-    if (sparse_tensors or
-        self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor):
+    if sparse_tensors:  # specific for deprecated `from_sparse_tensor_slices`.
       ops.add_to_collection("iterator_ops", get_next.indices)
       ops.add_to_collection("iterator_ops", get_next.values)
       ops.add_to_collection("iterator_ops", get_next.dense_shape)
-    else:
-      for el in nest.flatten(get_next):
-        ops.add_to_collection("iterator_ops", el)
+      return
+
+    get_next_list = nest.flatten(get_next)
+    for i, output_class in enumerate(
+        nest.flatten(self._get_output_classes(ds_fn))):
+      if output_class is sparse_tensor.SparseTensor:
+        ops.add_to_collection("iterator_ops", get_next_list[i].indices)
+        ops.add_to_collection("iterator_ops", get_next_list[i].values)
+        ops.add_to_collection("iterator_ops", get_next_list[i].dense_shape)
+      else:
+        ops.add_to_collection("iterator_ops", get_next_list[i])
 
   def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False):
     all_ops = ops.get_collection("iterator_ops")
-    if (sparse_tensors or
-        self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor):
+    if sparse_tensors:  # specific for deprecated `from_sparse_tensor_slices`.
       init_op, indices, values, dense_shape = all_ops
       return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape)
-    else:
-      return all_ops[0], nest.pack_sequence_as(
-          self._get_output_types(ds_fn), all_ops[1:])
+    get_next_list = []
+    i = 1
+    for output_class in nest.flatten(self._get_output_classes(ds_fn)):
+      if output_class is sparse_tensor.SparseTensor:
+        indices, values, dense_shape = all_ops[i:i + 3]
+        i += 3
+        get_next_list.append(
+            sparse_tensor.SparseTensor(indices, values, dense_shape))
+      else:
+        get_next_list.append(all_ops[i])
+        i += 1
+    return all_ops[0], nest.pack_sequence_as(
+        self._get_output_types(ds_fn), get_next_list)
 
   def _get_output_types(self, ds_fn):
     with ops.Graph().as_default():
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py
new file mode 100644
index 0000000000..d3fa84e74c
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization/parse_example_dataset_serialization_test.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ParseExampleDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.platform import test
+
+
+class ParseExampleDatasetSerializationTest(
+    reader_dataset_ops_test_base.ReadBatchFeaturesTestBase,
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def ParseExampleDataset(self, num_repeat, batch_size):
+    return self.make_batch_feature(
+        filenames=self.test_filenames,
+        num_epochs=num_repeat,
+        batch_size=batch_size,
+        reader_num_threads=5,
+        parser_num_threads=10)
+
+  def testSerializationCore(self):
+    num_repeat = 5
+    batch_size = 2
+    num_outputs = self._num_records * self._num_files * num_repeat // batch_size
+    # pylint: disable=g-long-lambda
+    self.run_core_tests(
+        lambda: self.ParseExampleDataset(
+            num_repeat=num_repeat, batch_size=batch_size),
+        lambda: self.ParseExampleDataset(num_repeat=10, batch_size=4),
+        num_outputs)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/range_dataset_serialization_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization/range_dataset_serialization_test.py
index e4f5b6cf5d..6341190847 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/range_dataset_serialization_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization/range_dataset_serialization_test.py
@@ -70,7 +70,7 @@ class RangeDatasetSerializationTest(
     break_point = 5
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for i in range(start, break_point):
@@ -79,7 +79,7 @@ class RangeDatasetSerializationTest(
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         sess.run(restore_op)
         for i in range(break_point, stop):
@@ -90,7 +90,7 @@ class RangeDatasetSerializationTest(
     # Saving and restoring in same session.
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for i in range(start, break_point):
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/serialization_integration_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization/serialization_integration_test.py
index 992d996a48..6aac50ecd9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/serialization_integration_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization/serialization_integration_test.py
@@ -59,7 +59,7 @@ class SerializationIntegrationTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
                                                         num_outputs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_ops)
         for _ in range(break_point):
           output = sess.run(get_next_ops)
@@ -70,7 +70,7 @@ class SerializationIntegrationTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
                                                         num_outputs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         saver.restore(sess, self._ckpt_path())
         for _ in range(num_outputs - break_point):
           output = sess.run(get_next_ops)
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_dataset_serialization_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_dataset_serialization_test.py
index d46c762aaa..a59fa94d66 100644
--- a/tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_dataset_serialization_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization/shuffle_dataset_serialization_test.py
@@ -136,7 +136,7 @@ class ShuffleDatasetSerializationTest(
           for saveable in saveables:
             ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
           saver = saver_lib.Saver(allow_empty=True)
-          with self.test_session(graph=g) as sess:
+          with self.session(graph=g) as sess:
             self._save(sess, saver)
             expected = [sess.run(get_next_ops) for _ in range(num_outputs)]
             self._restore(saver, sess)
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
index 3c11d7a97f..077abd6b30 100644
--- a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
@@ -106,7 +106,7 @@ class ShuffleAndRepeatTest(test.TestCase):
       ds = dataset_ops.Dataset.range(20).apply(
           shuffle_ops.shuffle_and_repeat(buffer_size=21))
       get_next_op = ds.make_one_shot_iterator().get_next()
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(get_next_op)
 
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index a41d21f8c1..53c22628c7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -190,7 +190,7 @@ class FeatureStatsDatasetTest(
         batch_size=batch_size,
         shuffle=True,
         shuffle_seed=5,
-        drop_final_batch=True).apply(
+        drop_final_batch=False).apply(
             stats_ops.set_stats_aggregator(stats_aggregator))
     iterator = dataset.make_initializable_iterator()
     next_element = iterator.get_next()
@@ -198,7 +198,8 @@ class FeatureStatsDatasetTest(
 
     with self.test_session() as sess:
       sess.run(iterator.initializer)
-      for _ in range(total_records // batch_size):
+      for _ in range(total_records // batch_size + 1 if total_records %
+                     batch_size else total_records // batch_size):
         sess.run(next_element)
 
       with self.assertRaises(errors.OutOfRangeError):
diff --git a/tensorflow/contrib/data/python/kernel_tests/test_utils.py b/tensorflow/contrib/data/python/kernel_tests/test_utils.py
new file mode 100644
index 0000000000..1b962b3418
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/test_utils.py
@@ -0,0 +1,60 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utilities for tf.data functionality."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class DatasetTestBase(test.TestCase):
+  """Base class for dataset tests."""
+
+  def _assert_datasets_equal(self, dataset1, dataset2):
+    # TODO(rachelim): support sparse tensor outputs
+    next1 = dataset1.make_one_shot_iterator().get_next()
+    next2 = dataset2.make_one_shot_iterator().get_next()
+    with self.test_session() as sess:
+      while True:
+        try:
+          op1 = sess.run(next1)
+        except errors.OutOfRangeError:
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(next2)
+          break
+        op2 = sess.run(next2)
+
+        op1 = nest.flatten(op1)
+        op2 = nest.flatten(op2)
+        assert len(op1) == len(op2)
+        for i in range(len(op1)):
+          self.assertAllEqual(op1[i], op2[i])
+
+  def _assert_datasets_raise_same_error(self, dataset1, dataset2, exc_class):
+    next1 = dataset1.make_one_shot_iterator().get_next()
+    next2 = dataset2.make_one_shot_iterator().get_next()
+    with self.test_session() as sess:
+      try:
+        sess.run(next1)
+        raise ValueError(
+            "Expected dataset to raise an error of type %s, but it did not." %
+            repr(exc_class))
+      except exc_class as e:
+        # Check that the first segment of the error messages are the same.
+        with self.assertRaisesRegexp(exc_class, e.message.split(". ")[0]):
+          sess.run(next2)
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index ad9378dfb9..4b45cc7e36 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -80,17 +80,14 @@ py_library(
         ":batching",
         ":gen_dataset_ops",
         ":interleave_ops",
+        ":parsing_ops",
         ":shuffle_ops",
-        ":stats_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
@@ -211,6 +208,22 @@ py_library(
 )
 
 py_library(
+    name = "parsing_ops",
+    srcs = ["parsing_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+py_library(
     name = "map_defun",
     srcs = ["map_defun.py"],
     srcs_version = "PY2AND3",
@@ -331,7 +344,10 @@ py_library(
 tf_gen_op_wrapper_py(
     name = "gen_dataset_ops",
     out = "gen_dataset_ops.py",
-    deps = ["//tensorflow/contrib/data:dataset_ops_op_lib"],
+    deps = [
+        "//tensorflow/contrib/data:dataset_ops_op_lib",
+        "//tensorflow/contrib/data:indexed_dataset_ops_op_lib",
+    ],
 )
 
 tf_kernel_library(
@@ -349,6 +365,7 @@ tf_custom_op_py_library(
     dso = ["//tensorflow/contrib/data:_dataset_ops.so"],
     kernels = [
         ":dataset_ops_kernels",
+        "//tensorflow/contrib/data:indexed_dataset_ops_op_lib",
         "//tensorflow/contrib/data:dataset_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
@@ -360,6 +377,19 @@ tf_custom_op_py_library(
 )
 
 py_library(
+    name = "indexed_dataset_ops",
+    srcs = ["indexed_dataset_ops.py"],
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+py_library(
     name = "prefetching_ops",
     srcs = ["prefetching_ops.py"],
     deps = [
@@ -380,6 +410,7 @@ py_library(
         ":error_ops",
         ":get_single_element",
         ":grouping",
+        ":indexed_dataset_ops",
         ":interleave_ops",
         ":map_defun",
         ":optimization",
diff --git a/tensorflow/contrib/data/python/ops/indexed_dataset_ops.py b/tensorflow/contrib/data/python/ops/indexed_dataset_ops.py
new file mode 100644
index 0000000000..a0932b4081
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/indexed_dataset_ops.py
@@ -0,0 +1,173 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for indexed datasets."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+
+
+class MaterializedIndexedDataset(object):
+  """MaterializedIndexedDataset is highly experimental!
+  """
+
+  def __init__(self, materialized_resource, materializer, output_classes,
+               output_types, output_shapes):
+    self._materialized_resource = materialized_resource
+    self._materializer = materializer
+    self._output_classes = output_classes
+    self._output_types = output_types
+    self._output_shapes = output_shapes
+
+  @property
+  def initializer(self):
+    if self._materializer is not None:
+      return self._materializer
+    raise ValueError("MaterializedDataset does not have a materializer")
+
+  def get(self, index):
+    """Get retrieves a value (or set of values) from the IndexedDataset.
+
+    Args:
+      index: A uint64 scalar or vector tensor with the indices to retrieve.
+
+    Returns:
+      A tensor containing the values corresponding to `index`.
+    """
+    # TODO(saeta): nest.pack_sequence_as(...)
+    return gen_dataset_ops.indexed_dataset_get(
+        self._materialized_resource,
+        index,
+        output_types=nest.flatten(
+            sparse.as_dense_types(self._output_types, self._output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_types(self._output_shapes, self._output_classes)))
+
+
+class IndexedDataset(dataset_ops.Dataset):
+  """IndexedDataset is highly experimental!
+  """
+
+  def __init__(self):
+    pass
+
+  def materialize(self, shared_name=None, container=None):
+    """Materialize creates a MaterializedIndexedDataset.
+
+    IndexedDatasets can be combined through operations such as TBD. Therefore,
+    they are only materialized when absolutely required.
+
+    Args:
+      shared_name: a string for the shared name to use for the resource.
+      container: a string for the container to store the resource.
+
+    Returns:
+      A MaterializedIndexedDataset.
+    """
+    if container is None:
+      container = ""
+    if shared_name is None:
+      shared_name = ""
+    materialized_resource = gen_dataset_ops.materialized_index_dataset_handle(
+        container=container,
+        shared_name=shared_name,
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_types(self.output_shapes, self.output_classes)))
+
+    with ops.colocate_with(materialized_resource):
+      materializer = gen_dataset_ops.indexed_dataset_materialize(
+          self._as_variant_tensor(), materialized_resource)
+    return MaterializedIndexedDataset(materialized_resource, materializer,
+                                      self.output_classes, self.output_types,
+                                      self.output_shapes)
+
+  @abc.abstractproperty
+  def output_types(self):
+    """Returns the type of each component of an element of this IndexedDataset.
+
+    Returns:
+      A nested structure of `tf.DType` objects corresponding to each component
+      of an element of this IndexedDataset.
+    """
+    raise NotImplementedError("IndexedDataset.output_types")
+
+  @abc.abstractproperty
+  def output_classes(self):
+    """Returns the class of each component of an element of this IndexedDataset.
+
+    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+
+    Returns:
+      A nested structure of Python `type` objects corresponding to each
+      component of an element of this IndexedDataset.
+    """
+    raise NotImplementedError("IndexedDataset.output_classes")
+
+  @abc.abstractproperty
+  def output_shapes(self):
+    """Returns the shape of each component of an element of this IndexedDataset.
+
+    Returns:
+      A nested structure of `tf.TensorShape` objects corresponding to each
+      component of an element of this IndexedDataset.
+    """
+    raise NotImplementedError("IndexedDataset.output_shapes")
+
+  @abc.abstractmethod
+  def _as_variant_tensor(self):
+    """Creates a `tf.variant` `tf.Tensor` representing this IndexedDataset.
+
+    Returns:
+      A scalar `tf.Tensor` of `tf.variant` type, which represents this
+      IndexedDataset.
+    """
+    raise NotImplementedError("IndexedDataset._as_variant_tensor")
+
+
+class IdentityIndexedDataset(IndexedDataset):
+  """IdentityIndexedDataset is a trivial indexed dataset used for testing.
+  """
+
+  def __init__(self, size):
+    super(IdentityIndexedDataset, self).__init__()
+    # TODO(saeta): Verify _size is a scalar!
+    self._size = ops.convert_to_tensor(size, dtype=dtypes.uint64, name="size")
+
+  @property
+  def output_types(self):
+    return dtypes.uint64
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.identity_indexed_dataset(self._size)
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 5a1a35199a..54a92ab185 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -163,7 +163,7 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
     for data_input in data_inputs[1:]:
       if (data_input.output_types != data_inputs[0].output_types or
           data_input.output_classes != data_inputs[0].output_classes):
-        raise TypeError("All datasets must have the same type.")
+        raise TypeError("All datasets must have the same type and class.")
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -216,25 +216,46 @@ def sample_from_datasets(datasets, weights=None, seed=None):
       length of the `datasets` element.
   """
   num_datasets = len(datasets)
-  if weights is None:
-    weights = dataset_ops.Dataset.from_tensors([1.0] * num_datasets).repeat()
-  elif not isinstance(weights, dataset_ops.Dataset):
-    weights = ops.convert_to_tensor(weights, name="weights")
-    if weights.dtype not in (dtypes.float32, dtypes.float64):
-      raise TypeError("`weights` must be convertible to a tensor of "
-                      "`tf.float32` or `tf.float64` elements.")
-    if not weights.shape.is_compatible_with([num_datasets]):
-      raise ValueError("`weights` must be a vector of length `len(datasets)`.")
-    weights = dataset_ops.Dataset.from_tensors(weights).repeat()
-
-  # The `stateless_multinomial()` op expects log-probabilities, as opposed to
-  # weights.
-  logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))
-  def select_dataset(logits, seed):
-    return array_ops.squeeze(
-        stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
-  selector_input = dataset_ops.Dataset.zip(
-      (logits_ds, random_ops.RandomDataset(seed).batch(2))).map(select_dataset)
+  if not isinstance(weights, dataset_ops.Dataset):
+    if weights is None:
+      # Select inputs with uniform probability.
+      logits = [[1.0] * num_datasets]
+    else:
+      # Use the given `weights` as the probability of choosing the respective
+      # input.
+      weights = ops.convert_to_tensor(weights, name="weights")
+      if weights.dtype not in (dtypes.float32, dtypes.float64):
+        raise TypeError("`weights` must be convertible to a tensor of "
+                        "`tf.float32` or `tf.float64` elements.")
+      if not weights.shape.is_compatible_with([num_datasets]):
+        raise ValueError(
+            "`weights` must be a vector of length `len(datasets)`.")
+
+      # The `stateless_multinomial()` op expects log-probabilities, as opposed
+      # to weights.
+      logits = array_ops.expand_dims(math_ops.log(weights, name="logits"), 0)
+
+    def select_dataset_constant_logits(seed):
+      return array_ops.squeeze(
+          stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
+
+    selector_input = random_ops.RandomDataset(seed).batch(2).map(
+        select_dataset_constant_logits)
+  else:
+    # Use each element of the given `weights` dataset as the probability of
+    # choosing the respective input.
+
+    # The `stateless_multinomial()` op expects log-probabilities, as opposed to
+    # weights.
+    logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))
+
+    def select_dataset_varying_logits(logits, seed):
+      return array_ops.squeeze(
+          stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])
+
+    selector_input = dataset_ops.Dataset.zip(
+        (logits_ds, random_ops.RandomDataset(seed).batch(2)
+        )).map(select_dataset_varying_logits)
 
   return _DirectedInterleaveDataset(selector_input, datasets)
 
diff --git a/tensorflow/contrib/data/python/ops/parsing_ops.py b/tensorflow/contrib/data/python/ops/parsing_ops.py
new file mode 100644
index 0000000000..2701605e64
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/parsing_ops.py
@@ -0,0 +1,150 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental `dataset` API for parsing example."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import parsing_ops
+
+
+class _ParseExampleDataset(dataset_ops.Dataset):
+  """A `Dataset` that parses `example` dataset into a `dict` dataset."""
+
+  def __init__(self, input_dataset, features, num_parallel_calls):
+    super(_ParseExampleDataset, self).__init__()
+    self._input_dataset = input_dataset
+    if not all(types == dtypes.string
+               for types in nest.flatten(input_dataset.output_types)):
+      raise TypeError("Input dataset should be a dataset of vectors of strings")
+    self._num_parallel_calls = num_parallel_calls
+    # pylint: disable=protected-access
+    self._features = parsing_ops._prepend_none_dimension(features)
+    # sparse_keys and dense_keys come back sorted here.
+    (sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults,
+     dense_shapes) = parsing_ops._features_to_raw_params(
+         self._features, [
+             parsing_ops.VarLenFeature, parsing_ops.SparseFeature,
+             parsing_ops.FixedLenFeature, parsing_ops.FixedLenSequenceFeature
+         ])
+    # TODO(b/112859642): Pass sparse_index and sparse_values for SparseFeature.
+    (_, dense_defaults_vec, sparse_keys, sparse_types, dense_keys, dense_shapes,
+     dense_shape_as_shape) = parsing_ops._process_raw_parameters(
+         None, dense_defaults, sparse_keys, sparse_types, dense_keys,
+         dense_types, dense_shapes)
+    # pylint: enable=protected-access
+    self._sparse_keys = sparse_keys
+    self._sparse_types = sparse_types
+    self._dense_keys = dense_keys
+    self._dense_defaults = dense_defaults_vec
+    self._dense_shapes = dense_shapes
+    self._dense_types = dense_types
+    dense_output_shapes = [
+        self._input_dataset.output_shapes.concatenate(shape)
+        for shape in dense_shape_as_shape
+    ]
+    sparse_output_shapes = [
+        self._input_dataset.output_shapes.concatenate([None])
+        for _ in range(len(sparse_keys))
+    ]
+
+    self._output_shapes = dict(
+        zip(self._dense_keys + self._sparse_keys,
+            dense_output_shapes + sparse_output_shapes))
+    self._output_types = dict(
+        zip(self._dense_keys + self._sparse_keys,
+            self._dense_types + self._sparse_types))
+    self._output_classes = dict(
+        zip(self._dense_keys + self._sparse_keys,
+            [ops.Tensor for _ in range(len(self._dense_defaults))] +
+            [sparse_tensor.SparseTensor for _ in range(len(self._sparse_keys))
+            ]))
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.parse_example_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._num_parallel_calls,
+        self._dense_defaults,
+        self._sparse_keys,
+        self._dense_keys,
+        self._sparse_types,
+        self._dense_shapes,
+        **dataset_ops.flat_structure(self))
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+
+# TODO(b/111553342): add arguments names and example names as well.
+def parse_example_dataset(features, num_parallel_calls=1):
+  """A transformation that parses `Example` protos into a `dict` of tensors.
+
+  Parses a number of serialized `Example` protos given in `serialized`. We refer
+  to `serialized` as a batch with `batch_size` many entries of individual
+  `Example` protos.
+
+  This op parses serialized examples into a dictionary mapping keys to `Tensor`
+  and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
+  `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
+  and `SparseFeature` is mapped to a `SparseTensor`, and each
+  `FixedLenFeature` is mapped to a `Tensor`. See `tf.parse_example` for more
+  details about feature dictionaries.
+
+  Args:
+   features: A `dict` mapping feature keys to `FixedLenFeature`,
+     `VarLenFeature`, and `SparseFeature` values.
+   num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+      representing the number of parsing processes to call in parallel.
+
+  Returns:
+    A dataset transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: if features argument is None.
+  """
+  if features is None:
+    raise ValueError("Missing: features was %s." % features)
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    out_dataset = _ParseExampleDataset(dataset, features, num_parallel_calls)
+    if any([
+        isinstance(feature, parsing_ops.SparseFeature)
+        for _, feature in features.items()
+    ]):
+      # pylint: disable=protected-access
+      # pylint: disable=g-long-lambda
+      out_dataset = out_dataset.map(
+          lambda x: parsing_ops._construct_sparse_tensors_for_sparse_features(
+              features, x), num_parallel_calls=num_parallel_calls)
+    return out_dataset
+
+  return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 3882d4bfdb..29005859d7 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -25,8 +25,8 @@ import numpy as np
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import gen_dataset_ops as contrib_gen_dataset_ops
 from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.contrib.data.python.ops import parsing_ops
 from tensorflow.contrib.data.python.ops import shuffle_ops
-from tensorflow.contrib.data.python.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import convert
@@ -37,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.util import deprecation
 
@@ -326,7 +325,6 @@ def make_csv_dataset(
     shuffle_seed=None,
     prefetch_buffer_size=1,
     num_parallel_reads=1,
-    num_parallel_parser_calls=2,
     sloppy=False,
     num_rows_for_inference=100,
     compression_type=None,
@@ -393,8 +391,6 @@ def make_csv_dataset(
       batches consumed per training step.
     num_parallel_reads: Number of threads used to read CSV records from files.
       If >1, the results will be interleaved.
-    num_parallel_parser_calls: Number of parallel invocations of the CSV parsing
-      function on CSV records.
     sloppy: If `True`, reading performance will be improved at
       the cost of non-deterministic ordering. If `False`, the order of elements
       produced is deterministic prior to shuffling (elements are still
@@ -503,7 +499,7 @@ def make_csv_dataset(
   # indefinitely, and all batches will be full-sized.
   dataset = dataset.batch(batch_size=batch_size,
                           drop_remainder=num_epochs is None)
-  dataset = dataset.map(map_fn, num_parallel_calls=num_parallel_parser_calls)
+  dataset = dataset.map(map_fn)
   dataset = dataset.prefetch(prefetch_buffer_size)
 
   return dataset
@@ -778,8 +774,6 @@ def make_batched_features_dataset(file_pattern,
   dataset = _maybe_shuffle_and_repeat(
       dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
 
-  dataset = dataset.apply(stats_ops.feature_stats("record_stats"))
-
   # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to
   # improve the shape inference, because it makes the batch dimension static.
   # It is safe to do this because in that case we are repeating the input
@@ -788,9 +782,9 @@ def make_batched_features_dataset(file_pattern,
       batch_size, drop_remainder=drop_final_batch or num_epochs is None)
 
   # Parse `Example` tensors to a dictionary of `Feature` tensors.
-  dataset = dataset.map(
-      lambda x: parsing_ops.parse_example(x, features),
-      num_parallel_calls=parser_num_threads)
+  dataset = dataset.apply(
+      parsing_ops.parse_example_dataset(
+          features, num_parallel_calls=parser_num_threads))
 
   # TODO(rachelim): Add an optional label_name argument for extracting the label
   # from the features dictionary, to comply with the type expected by the
@@ -974,3 +968,49 @@ class SqlDataset(dataset_ops.Dataset):
   @property
   def output_types(self):
     return self._output_types
+
+
+class LMDBDataset(dataset_ops.Dataset):
+  """A LMDB Dataset that reads the lmdb file."""
+
+  def __init__(self, filenames):
+    """Create a `LMDBDataset`.
+
+    `LMDBDataset` allows a user to read data from a mdb file as
+    (key value) pairs sequentially.
+    For example:
+    ```python
+    dataset = tf.contrib.lmdb.LMDBDataset("/foo/bar.mdb")
+    iterator = dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+    # Prints the (key, value) pairs inside a lmdb file.
+    while True:
+      try:
+        print(sess.run(next_element))
+      except tf.errors.OutOfRangeError:
+        break
+    ```
+    Args:
+      filenames: A `tf.string` tensor containing one or more filenames.
+    """
+    super(LMDBDataset, self).__init__()
+    self._filenames = ops.convert_to_tensor(
+        filenames, dtype=dtypes.string, name="filenames")
+
+  def _as_variant_tensor(self):
+    return contrib_gen_dataset_ops.lmdb_dataset(
+        self._filenames,
+        output_types=nest.flatten(self.output_types),
+        output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_classes(self):
+    return ops.Tensor, ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+
+  @property
+  def output_types(self):
+    return dtypes.string, dtypes.string
diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD
index d3628d480d..02feeafb60 100644
--- a/tensorflow/contrib/distribute/BUILD
+++ b/tensorflow/contrib/distribute/BUILD
@@ -29,12 +29,12 @@ py_library(
         "//tensorflow/contrib/distribute/python:cross_tower_ops",
         "//tensorflow/contrib/distribute/python:mirrored_strategy",
         "//tensorflow/contrib/distribute/python:monitor",
-        "//tensorflow/contrib/distribute/python:multi_worker_strategy",
         "//tensorflow/contrib/distribute/python:one_device_strategy",
         "//tensorflow/contrib/distribute/python:parameter_server_strategy",
         "//tensorflow/contrib/distribute/python:step_fn",
         "//tensorflow/contrib/distribute/python:tpu_strategy",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:distribute_config",
     ],
 )
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
index 9123ca749b..bf763215ba 100644
--- a/tensorflow/contrib/distribute/__init__.py
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -22,13 +22,14 @@ from __future__ import print_function
 from tensorflow.contrib.distribute.python.collective_all_reduce_strategy import CollectiveAllReduceStrategy
 from tensorflow.contrib.distribute.python.cross_tower_ops import *
 from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
-from tensorflow.contrib.distribute.python.multi_worker_strategy import MultiWorkerMirroredStrategy
 from tensorflow.contrib.distribute.python.monitor import Monitor
 from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
 from tensorflow.contrib.distribute.python.step_fn import *
 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
+from tensorflow.python.distribute.distribute_config import DistributeConfig
 from tensorflow.python.training.distribute import *
+from tensorflow.python.training.distribution_strategy_context import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 
@@ -37,9 +38,9 @@ _allowed_symbols = [
     'AllReduceCrossTowerOps',
     'CollectiveAllReduceStrategy',
     'CrossTowerOps',
+    'DistributeConfig',
     'DistributionStrategy',
     'MirroredStrategy',
-    'MultiWorkerMirroredStrategy',
     'Monitor',
     'OneDeviceStrategy',
     'ParameterServerStrategy',
@@ -55,6 +56,7 @@ _allowed_symbols = [
     'get_tower_context',
     'has_distribution_strategy',
     'require_tower_context',
+    'UpdateContext',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 40a1c1707c..f5b236e35f 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -72,48 +72,72 @@ py_library(
         ":cross_tower_ops",
         ":shared_variable_creator",
         ":values",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:device",
         "//tensorflow/python:device_util",
         "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:multi_worker_util",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:tape",
-        "@six_archive//:six",
     ],
 )
 
 py_library(
-    name = "multi_worker_strategy",
-    srcs = ["multi_worker_strategy.py"],
+    name = "parameter_server_strategy",
+    srcs = ["parameter_server_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
+        ":cross_tower_ops",
         ":mirrored_strategy",
         ":values",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/eager:context",
     ],
 )
 
-py_library(
-    name = "parameter_server_strategy",
-    srcs = ["parameter_server_strategy.py"],
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        ":cross_tower_ops",
-        ":mirrored_strategy",
+cuda_py_test(
+    name = "parameter_server_strategy_test",
+    srcs = ["parameter_server_strategy_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":multi_worker_test_base",
+        ":parameter_server_strategy",
         ":values",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
-        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
     ],
 )
 
@@ -147,6 +171,7 @@ py_library(
         "//tensorflow/python:collective_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python/distribute:multi_worker_util",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -184,7 +209,6 @@ py_library(
     ],
     deps = [
         ":mirrored_strategy",
-        ":multi_worker_strategy",
         ":one_device_strategy",
         ":tpu_strategy",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
@@ -219,9 +243,13 @@ py_test(
     ],
     deps = [
         ":mirrored_strategy",
+        ":multi_worker_test_base",
         ":strategy_test_lib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:distribute",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -243,40 +271,12 @@ py_test(
     ],
 )
 
-py_test(
-    name = "parameter_server_strategy_test",
-    srcs = ["parameter_server_strategy_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
-        ":combinations",
-        ":multi_worker_test_base",
-        ":parameter_server_strategy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:session",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:estimator_py",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
 cuda_py_test(
     name = "mirrored_strategy_multigpu_test",
     srcs = ["mirrored_strategy_multigpu_test.py"],
     additional_deps = [
         ":mirrored_strategy",
+        ":multi_worker_test_base",
         ":values",
         ":strategy_test_lib",
         "//tensorflow/python:distribute",
@@ -345,19 +345,17 @@ py_library(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "collective_all_reduce_strategy_test",
     srcs = ["collective_all_reduce_strategy_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
+    additional_deps = [
         ":collective_all_reduce_strategy",
         ":combinations",
         ":cross_tower_utils",
         ":multi_worker_test_base",
         ":strategy_test_lib",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -371,8 +369,10 @@ py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
     ],
 )
 
@@ -452,6 +452,32 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "estimator_training_test",
+    size = "large",
+    srcs = ["estimator_training_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":mirrored_strategy",
+        ":multi_worker_test_base",
+        ":parameter_server_strategy",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
 py_library(
     name = "single_loss_example",
     srcs = ["single_loss_example.py"],
@@ -607,6 +633,7 @@ cuda_py_test(
         ":combinations",
         ":cross_tower_ops",
         ":multi_worker_test_base",
+        ":mirrored_strategy",
         ":values",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index bcb977f640..865dba803f 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -48,7 +48,7 @@ class CheckpointUtilsWithDistributionStrategyTest(
       mode=["graph"]))
   def testInitFromCheckpoint(self, distribution, in_tower_mode):
     checkpoint_dir = self.get_temp_dir()
-    with self.test_session() as session:
+    with self.cached_session() as session:
       v1_value, v2_value, _, _ = checkpoint_utils_test._create_checkpoints(
           session, checkpoint_dir)
 
@@ -62,7 +62,7 @@ class CheckpointUtilsWithDistributionStrategyTest(
           "var1": "new_var1",
           "var2": "new_var2"
       })
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         session.run(variables.global_variables_initializer())
         self.assertAllEqual(v1_value, self.evaluate(v1))
         self.assertAllEqual(v2_value, self.evaluate(v2))
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index 9afcaecf78..2331444261 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -18,30 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import json
-import os
-
 from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
 from tensorflow.contrib.distribute.python import cross_tower_utils
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import values
-from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
-from tensorflow.python.training import server_lib
-
-
-# TODO(yuefengz): move this function to a common util file.
-def _normalize_cluster_spec(cluster_spec):
-  if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
-    return server_lib.ClusterSpec(cluster_spec)
-  elif not isinstance(cluster_spec, server_lib.ClusterSpec):
-    raise ValueError(
-        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
-        "`tf.train.ClusterDef` object")
-  return cluster_spec
 
 
 # TODO(yuefengz): shard the dataset.
@@ -52,51 +37,45 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
   """Distribution strategy that uses collective ops for all-reduce.
 
   It is similar to the MirroredStrategy but it uses collective ops for
-  reduction. It currently only works for between-graph replication and its
-  reduction will reduce across all workers.
+  reduction.
+
+  When `cluster_spec` is given by the `configure` method, it turns into the
+  mulit-worker version that works on multiple workers with between-graph
+  replication.
+
+  Note: `configure` will be called by higher-level APIs if running in
+  distributed environment.
   """
 
-  def __init__(self,
-               num_gpus_per_worker=0,
-               cluster_spec=None,
-               task_type="worker",
-               task_id=0):
+  def __init__(self, num_gpus_per_worker=0):
     """Initializes the object.
 
     Args:
       num_gpus_per_worker: number of local GPUs or GPUs per worker.
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type, such as "worker".
-      task_id: the current task id.
-
-    Raises:
-      ValueError: if `task_type` is not in the `cluster_spec`.
     """
     self._num_gpus_per_worker = num_gpus_per_worker
-    self._initialize(cluster_spec, task_type, task_id)
+    self._initialize(None, None, None)
 
   def _initialize(self, cluster_spec, task_type, task_id):
-    if task_type not in ["chief", "worker"]:
-      raise ValueError(
-          "Unrecognized task_type: %r, valid task types are: \"chief\", "
-          "\"worker\"." % task_type)
     if cluster_spec:
-      self._cluster_spec = _normalize_cluster_spec(cluster_spec)
+      if task_type is None or task_id is None:
+        raise ValueError("When `cluster_spec` is given, you must also specify "
+                         "`task_type` and `task_id`")
+      if task_type not in ["chief", "worker"]:
+        raise ValueError(
+            "Unrecognized task_type: %r, valid task types are: \"chief\", "
+            "\"worker\"." % task_type)
+      self._cluster_spec = multi_worker_util.normalize_cluster_spec(
+          cluster_spec)
       worker_device = "/job:%s/task:%d" % (task_type, task_id)
-      num_workers = len(self._cluster_spec.as_dict().get(task_type, []))
-      if "chief" in self._cluster_spec.as_dict():
-        num_workers += 1
+      num_workers = len(self._cluster_spec.as_dict().get("worker", [])) + len(
+          self._cluster_spec.as_dict().get("chief", []))
       if not num_workers:
-        raise ValueError("`task_type` shoud be in `cluster_spec`.")
+        raise ValueError("No `worker` or `chief` tasks can be found in "
+                         "`cluster_spec`.")
 
-      # TODO(yuefengz): create a utility to infer chief.
-      if "chief" in self._cluster_spec.as_dict() and task_type == "chief":
-        assert task_id == 0
-        self._is_chief = True
-      else:
-        assert task_type == "worker"
-        self._is_chief = task_id == 0
+      self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
+                                                  task_id)
     else:
       self._cluster_spec = None
       self._is_chief = True
@@ -187,19 +166,41 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     return mirrored_strategy._create_mirrored_variable(
         devices, _real_mirrored_creator, *args, **kwargs)
 
-  def configure(self, session_config=None):
-    # Use TF_CONFIG to get the cluster spec and the current job.
-    if not self._cluster_spec:
-      tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
-      cluster_spec = _normalize_cluster_spec(tf_config.get("cluster", {}))
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    """Configures the object.
 
-      task_env = tf_config.get("task", {})
-      if task_env:
-        task_type = task_env.get("type", "worker")
-        task_id = int(task_env.get("index", "0"))
-      else:
-        task_type = "worker"
-        task_id = 0
+    Args:
+      session_config: a @{tf.ConfigProto}
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type, such as "worker".
+      task_id: the current task id.
 
-      if cluster_spec:
-        self._initialize(cluster_spec, task_type, task_id)
+    Raises:
+      ValueError: if `task_type` is not in the `cluster_spec`.
+    """
+    # TODO(yuefengz): we'll need to mutate the session_config to add
+    # configurations for collective ops.
+    del session_config
+    if not self._cluster_spec and cluster_spec:
+      self._initialize(cluster_spec, task_type, task_id)
+
+  @property
+  def between_graph(self):
+    return True
+
+  @property
+  def should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return self._is_chief
+
+  @property
+  def should_save_summary(self):
+    return self._is_chief
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index b5e54e3b7d..e284969b1a 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -25,10 +25,8 @@ from tensorflow.contrib.distribute.python import collective_all_reduce_strategy
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import cross_tower_utils
 from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
-from tensorflow.python.estimator import run_config
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -41,53 +39,43 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-class DistributedCollectiveAllReduceStrategyTest(
-    multi_worker_test_base.MultiWorkerTestBase, parameterized.TestCase):
+class CollectiveAllReduceStrategyTestBase(
+    multi_worker_test_base.MultiWorkerTestBase):
 
   collective_key_base = 0
 
-  @classmethod
-  def setUpClass(cls):
-    """Create a local cluster with 2 workers."""
-    cls._workers, cls._ps = multi_worker_test_base.create_in_process_cluster(
-        num_workers=3, num_ps=0)
-    cls._cluster_spec = {
-        run_config.TaskType.WORKER: [
-            'fake_worker_0', 'fake_worker_1', 'fake_worker_2'
-        ]
-    }
-
   def setUp(self):
     self._run_options = config_pb2.RunOptions()
     self._run_options.experimental.collective_graph_key = 6
 
     self._sess_config = config_pb2.ConfigProto()
-    self._sess_config.experimental.collective_group_leader = (
-        '/job:worker/replica:0/task:0')
 
     # We use a different key_base for each test so that collective keys won't be
     # reused.
     # TODO(yuefengz, tucker): enable it to reuse collective keys in different
     # tests.
-    DistributedCollectiveAllReduceStrategyTest.collective_key_base += 100000
-    super(DistributedCollectiveAllReduceStrategyTest, self).setUp()
+    CollectiveAllReduceStrategyTestBase.collective_key_base += 100000
+    super(CollectiveAllReduceStrategyTestBase, self).setUp()
 
   def _get_test_object(self, task_type, task_id, num_gpus=0):
     distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
-        num_gpus_per_worker=num_gpus,
-        cluster_spec=self._cluster_spec,
-        task_type=task_type,
-        task_id=task_id)
+        num_gpus_per_worker=num_gpus)
+    if task_type and task_id is not None:
+      distribution.configure(
+          cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id)
     collective_keys = cross_tower_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
-        DistributedCollectiveAllReduceStrategyTest.collective_key_base,
+        CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_start=num_gpus * 100 +
-        DistributedCollectiveAllReduceStrategyTest.collective_key_base,
+        CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
-        DistributedCollectiveAllReduceStrategyTest.collective_key_base)
+        CollectiveAllReduceStrategyTestBase.collective_key_base)
     distribution._collective_keys = collective_keys
     distribution._cross_tower_ops._collective_keys = collective_keys
-    return distribution, self._workers[task_id].target
+    if task_type and task_id is not None:
+      return distribution, 'grpc://' + self._cluster_spec[task_type][task_id]
+    else:
+      return distribution, ''
 
   def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
     d, master_target = self._get_test_object(task_type, task_id, num_gpus)
@@ -155,12 +143,6 @@ class DistributedCollectiveAllReduceStrategyTest(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
-  @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraph(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
-
   def _test_variable_initialization(self, task_type, task_id, num_gpus):
     distribution, master_target = self._get_test_object(task_type, task_id,
                                                         num_gpus)
@@ -182,16 +164,74 @@ class DistributedCollectiveAllReduceStrategyTest(
           distribution.reduce(
               variable_scope.VariableAggregation.MEAN, x,
               destinations='/cpu:0'))[0]
+      x = distribution.unwrap(x)[0]
 
       sess.run(
           variables.global_variables_initializer(), options=self._run_options)
+
       x_value, reduced_x_value = sess.run(
           [x, reduced_x], options=self._run_options)
-      self.assertTrue(np.array_equal(x_value, reduced_x_value))
-    return np.array_equal(x_value, reduced_x_value)
+      self.assertTrue(
+          np.allclose(x_value, reduced_x_value, atol=1e-5),
+          msg=('x_value = %r, reduced_x_value = %r' % (x_value,
+                                                       reduced_x_value)))
+    return np.allclose(x_value, reduced_x_value, atol=1e-5)
+
+
+class DistributedCollectiveAllReduceStrategyTest(
+    CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    """Create a local cluster with 3 workers."""
+    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
+        num_workers=3, num_ps=0)
+
+  def setUp(self):
+    super(DistributedCollectiveAllReduceStrategyTest, self).setUp()
+    self._sess_config.experimental.collective_group_leader = (
+        '/job:worker/replica:0/task:0')
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
+  def testMinimizeLossGraph(self, num_gpus):
+    self._run_between_graph_clients(self._test_minimize_loss_graph,
+                                    self._cluster_spec, num_gpus)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
+  def testVariableInitialization(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      return
+    self._run_between_graph_clients(
+        self._test_variable_initialization,
+        self._cluster_spec,
+        num_gpus=num_gpus)
+
+
+class DistributedCollectiveAllReduceStrategyTestWithChief(
+    CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    """Create a local cluster with 3 workers and 1 chief."""
+    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
+        num_workers=3, num_ps=0, has_chief=True)
+
+  def setUp(self):
+    super(DistributedCollectiveAllReduceStrategyTestWithChief, self).setUp()
+    self._run_options.experimental.collective_graph_key = 7
+    self._sess_config.experimental.collective_group_leader = (
+        '/job:chief/replica:0/task:0')
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
+  def testMinimizeLossGraph(self, num_gpus):
+    self._run_between_graph_clients(self._test_minimize_loss_graph,
+                                    self._cluster_spec, num_gpus)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testVariableInitialization(self, num_gpus):
     if context.num_gpus() < num_gpus:
       return
@@ -201,16 +241,14 @@ class DistributedCollectiveAllReduceStrategyTest(
         num_gpus=num_gpus)
 
 
-class LocalCollectiveAllReduceStrategy(strategy_test_lib.DistributionTestBase,
-                                       parameterized.TestCase):
+class LocalCollectiveAllReduceStrategy(
+    CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
 
   def testMinimizeLossGraph(self, num_gpus=2):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
       return
-    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
-        num_gpus_per_worker=num_gpus)
-    self._test_minimize_loss_graph(distribution)
+    self._test_minimize_loss_graph(None, None, num_gpus)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index a1efbcaf9a..2301ba9233 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -48,7 +48,6 @@ import six
 
 from tensorflow.contrib.cluster_resolver import TPUClusterResolver
 from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib
-from tensorflow.contrib.distribute.python import multi_worker_strategy
 from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib
 from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
@@ -56,7 +55,7 @@ from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.training import adam
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.util import tf_inspect
 
@@ -320,7 +319,7 @@ class NamedDistribution(object):
 # pylint: disable=g-long-lambda
 default_strategy = NamedDistribution(
     "Default",
-    lambda: distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
+    distribution_strategy_context._get_default_distribution_strategy,  # pylint: disable=protected-access
     required_gpus=None)
 one_device_strategy = NamedDistribution(
     "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
@@ -342,33 +341,6 @@ mirrored_strategy_with_two_gpus = NamedDistribution(
         ["/gpu:0", "/gpu:1"], prefetch_on_device=False),
     required_gpus=2)
 
-multi_worker_strategy_with_cpu = NamedDistribution(
-    "MultiWorkerCPU",
-    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
-        cluster={
-            "worker": [
-                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
-            ]
-        },
-        num_gpus_per_worker=0), 0)
-multi_worker_strategy_with_one_gpu = NamedDistribution(
-    "MultiWorker1GPU",
-    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
-        cluster={
-            "worker": [
-                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
-            ]
-        },
-        num_gpus_per_worker=1), 1)
-multi_worker_strategy_with_two_gpus = NamedDistribution(
-    "MultiWorker2GPUs",
-    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
-        cluster={
-            "worker": [
-                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
-            ]
-        },
-        num_gpus_per_worker=2), 2)
 
 adam_optimizer_v1_fn = NamedObject(
     "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1))
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
index 3a7addf221..2a653b0f10 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -53,7 +53,7 @@ def validate_destinations(destinations):
   if not isinstance(
       destinations,
       (value_lib.DistributedValues, resource_variable_ops.ResourceVariable,
-       six.string_types, list)):
+       value_lib.AggregatingVariable, six.string_types, list)):
     raise ValueError("destinations must be one of a `DistributedValues` object,"
                      " a tf.Variable object, a device string, a list of device "
                      "strings or None")
@@ -62,7 +62,44 @@ def validate_destinations(destinations):
     raise ValueError("destinations can not be empty")
 
 
+def _make_tensor_into_per_device(input_tensor):
+  """Converts a single tensor into a PerDevice object."""
+  if isinstance(input_tensor, (tuple, list)):
+    raise ValueError("Cannot convert `input_tensor` to a `PerDevice` object, "
+                     "got %r but expected a object that is not a tuple or list."
+                     % (input_tensor,))
+  if isinstance(input_tensor, value_lib.PerDevice):
+    return input_tensor
+
+  try:
+    device = input_tensor.device
+  except AttributeError:
+    raise ValueError("Cannot convert `input_tensor` to a `PerDevice` object "
+                     "because it doesn't have device set.")
+
+  return value_lib.PerDevice({device: input_tensor})
+
+
+def _normalize_value_destination_pairs(value_destination_pairs):
+  """Converts each tensor into a PerDevice object in the input list."""
+  result = []
+  if not isinstance(value_destination_pairs, (list, tuple)):
+    raise ValueError("`value_destination_pairs` should be a list or tuple")
+  for pair in value_destination_pairs:
+    if not isinstance(pair, tuple):
+      raise ValueError(
+          "Each element of `value_destination_pairs` should be a tuple.")
+    if len(pair) != 2:
+      raise ValueError("Each element of `value_destination_pairs` should be a "
+                       "tuple of size 2.")
+
+    per_device = _make_tensor_into_per_device(pair[0])
+    result.append((per_device, pair[1]))
+  return result
+
+
 def _validate_value_destination_pairs(value_destination_pairs):
+  # TODO(yuefengz): raise exceptions instead of returning False.
   # pylint: disable=g-missing-docstring
   if not value_destination_pairs: return False
   if not isinstance(value_destination_pairs, (list, tuple)): return False
@@ -78,12 +115,15 @@ def _validate_value_destination_pairs(value_destination_pairs):
 def get_devices_from(destinations):
   if isinstance(destinations, value_lib.DistributedValues):
     return list(destinations.devices)
-  elif isinstance(destinations, resource_variable_ops.ResourceVariable):
+  elif isinstance(destinations, (resource_variable_ops.ResourceVariable,
+                                 value_lib.AggregatingVariable)):
     return [destinations.device]
   elif isinstance(destinations, six.string_types):
     return [device_util.resolve(destinations)]
-  else:
+  elif isinstance(destinations, (list, tuple)):
     return [device_util.resolve(destination) for destination in destinations]
+  else:
+    return [destinations.device]
 
 
 def _devices_match(left, right):
@@ -158,7 +198,7 @@ class CrossTowerOps(object):
     Args:
       aggregation: Indicates how a variable will be aggregated. Accepted values
         are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
-      per_device_value: a PerDevice object.
+      per_device_value: a PerDevice object or a tensor with device set.
       destinations: the reduction destinations.
 
     Returns:
@@ -168,7 +208,8 @@ class CrossTowerOps(object):
       ValueError: if per_device_value is not a PerDevice object.
     """
     if not isinstance(per_device_value, value_lib.PerDevice):
-      raise ValueError("`per_device_value` must be a `PerDevice` object.")
+      per_device_value = _make_tensor_into_per_device(per_device_value)
+
     if destinations is not None:
       validate_destinations(destinations)
     return self._reduce(aggregation, per_device_value, destinations)
@@ -183,8 +224,9 @@ class CrossTowerOps(object):
       aggregation: Indicates how a variable will be aggregated. Accepted values
         are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
       value_destination_pairs: a list or a tuple of tuples of PerDevice objects
-        and destinations. If a destination is None, then the destinations
-        are set to match the devices of the input PerDevice object.
+        (or tensors with device set if there is one tower) and destinations. If
+        a destination is None, then the destinations are set to match the
+        devices of the input PerDevice object.
 
     Returns:
       a list of Mirrored objects.
@@ -194,8 +236,11 @@ class CrossTowerOps(object):
         tuples of PerDevice objects and destinations
     """
     if not _validate_value_destination_pairs(value_destination_pairs):
-      raise ValueError("`value_destination_pairs` must be a list or a tuple of "
-                       "tuples of PerDevice objects and destinations")
+      # If the first element of each pair is a tensor, we try to turn it into a
+      # PerDevice object.
+      value_destination_pairs = _normalize_value_destination_pairs(
+          value_destination_pairs)
+
     for _, d in value_destination_pairs:
       if d is not None:
         validate_destinations(d)
@@ -756,7 +801,7 @@ class CollectiveAllReduce(CrossTowerOps):
     )
     super(CollectiveAllReduce, self).__init__()
 
-  # TODO(yuefengz, tucker): is index slices supported by collective ops?
+  # TODO(yuefengz, tucker): is indexed slices supported by collective ops?
   def _reduce(self, aggregation, per_device_value, destinations):
     all_reduced = self._batch_all_reduce(aggregation, [per_device_value])[0]
     if destinations is None or _devices_match(per_device_value, destinations):
@@ -768,8 +813,10 @@ class CollectiveAllReduce(CrossTowerOps):
         if d in all_reduced._index:
           index[d] = all_reduced._index[d]
         else:
-          with ops.device(d):
+          with ops.control_dependencies(list(
+              all_reduced._index.values())), ops.device(d):
             index[d] = array_ops.identity(list(all_reduced._index.values())[0])
+
       return value_lib.Mirrored(index)
 
   def _batch_reduce(self, aggregation, value_destination_pairs):
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
index aec53b01d7..2ad91d56e9 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
@@ -26,12 +26,12 @@ import numpy as np
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
 from tensorflow.contrib.distribute.python import cross_tower_utils
+from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import values as value_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
-from tensorflow.python.estimator import run_config
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -40,9 +40,17 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import device_util
 
 
-def _make_per_device(values, devices):
+def _make_per_device(values, devices, regroup=False):
   devices = cross_tower_ops_lib.get_devices_from(devices)
   assert len(values) == len(devices)
+
+  # We simulate the result of regroup called on PerDevice which strips the
+  # PerDevice wrapper if it has only one value.
+  if len(values) == 1 and regroup:
+    with ops.device(devices[0]):
+      placed_v = array_ops.identity(values[0])
+    return placed_v
+
   index = {}
   for d, v in zip(devices, values):
     with ops.device(d):
@@ -368,14 +376,27 @@ class MultiWorkerCrossTowerOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                       ("xring", 2, -1)], 0, 0, 0)),
       ],
       distribution=[
-          combinations.multi_worker_strategy_with_cpu,
-          combinations.multi_worker_strategy_with_one_gpu,
-          combinations.multi_worker_strategy_with_two_gpus
+          combinations.NamedDistribution(
+              "MirroredCPU",
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus=0),
+              required_gpus=0),
+          combinations.NamedDistribution(
+              "Mirrored1GPU",
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus=1),
+              required_gpus=1),
+          combinations.NamedDistribution(
+              "Mirrored2GPUs",
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus=2),
+              required_gpus=2),
       ],
       mode=["graph"])
 
   @combinations.generate(multi_worker_allreduce_combinations)
   def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+    distribution.configure(cluster_spec={
+        "worker":
+            ["/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"]
+    })
     with distribution.scope():
       self._testReductionAndBroadcast(cross_tower_ops, distribution)
 
@@ -388,13 +409,8 @@ class MultiWorkerCollectiveAllReduceTest(
   @classmethod
   def setUpClass(cls):
     """Create a local cluster with 2 workers."""
-    cls._workers, cls._ps = multi_worker_test_base.create_in_process_cluster(
+    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=0)
-    cls._cluster_spec = {
-        run_config.TaskType.WORKER: [
-            "fake_worker_0", "fake_worker_1", "fake_worker_2"
-        ]
-    }
 
   def setUp(self):
     super(MultiWorkerCollectiveAllReduceTest, self).setUp()
@@ -417,7 +433,7 @@ class MultiWorkerCollectiveAllReduceTest(
         devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
       else:
         devices = ["/device:CPU:0"]
-      return collective_all_reduce_ops, devices, "local"
+      return collective_all_reduce_ops, devices, ""
     else:
       collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
           3, num_gpus, collective_keys=collective_keys)
@@ -428,7 +444,8 @@ class MultiWorkerCollectiveAllReduceTest(
         ]
       else:
         devices = ["/job:%s/task:%d" % (task_type, task_id)]
-      return collective_all_reduce_ops, devices, self._workers[task_id].target
+      return (collective_all_reduce_ops, devices,
+              "grpc://" + self._cluster_spec[task_type][task_id])
 
   def _assert_values_equal(self, left, right, sess):
     if isinstance(left, list):
@@ -455,7 +472,8 @@ class MultiWorkerCollectiveAllReduceTest(
       num_workers = 1
       worker_device = None
     else:
-      num_workers = len(self._workers)
+      num_workers = len(self._cluster_spec.get("chief", [])) + len(
+          self._cluster_spec.get("worker", []))
       worker_device = "/job:%s/task:%d" % (task_type, task_id)
     with ops.Graph().as_default(), \
          ops.device(worker_device), \
@@ -463,7 +481,7 @@ class MultiWorkerCollectiveAllReduceTest(
       # Collective ops doesn't support scalar tensors, so we have to construct
       # 1-d tensors.
       values = [constant_op.constant([float(d)]) for d in range(len(devices))]
-      per_device = _make_per_device(values, devices)
+      per_device = _make_per_device(values, devices, regroup=True)
       mean = np.array([(len(devices) - 1.) / 2.])
 
       values_2 = [constant_op.constant([d + 1.0]) for d in range(len(devices))]
@@ -476,7 +494,7 @@ class MultiWorkerCollectiveAllReduceTest(
       destination_list = devices
 
       all_destinations = [
-          None, destination_mirrored, destination_different, destination_str,
+          destination_different, None, destination_mirrored, destination_str,
           destination_list
       ]
 
@@ -533,13 +551,19 @@ class MultiWorkerCollectiveAllReduceTest(
     return True
 
   @combinations.generate(
-      combinations.combine(mode=["graph"], num_gpus=[0, 1, 2]))
+      combinations.combine(mode=["graph"], num_gpus=[0, 1, 2], required_gpus=1))
   def testReductionDistributed(self, num_gpus):
     if context.num_gpus() < num_gpus:
       return
     self._run_between_graph_clients(self._test_reduction, self._cluster_spec,
                                     num_gpus)
 
+  # Collective ops doesn't support strategy with one device.
+  def testReductionLocal(self, num_gpus=2):
+    if context.num_gpus() < num_gpus:
+      return
+    self._test_reduction(None, None, num_gpus, local_mode=True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index 3e00cf4332..cc626c33bf 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.optimizer_v2 import adagrad
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator import training
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export
@@ -63,8 +64,9 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
               combinations.mirrored_strategy_with_two_gpus
-          ]))
-  def test_complete_flow_with_mode(self, distribution):
+          ],
+          use_train_and_evaluate=[True, False]))
+  def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
     label_dimension = 2
     input_dimension = label_dimension
     batch_size = 10
@@ -103,9 +105,15 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
             train_distribute=distribution, eval_distribute=distribution))
 
     num_steps = 10
-    estimator.train(train_input_fn, steps=num_steps)
+    if use_train_and_evaluate:
+      scores, _ = training.train_and_evaluate(
+          estimator,
+          training.TrainSpec(train_input_fn, max_steps=num_steps),
+          training.EvalSpec(eval_input_fn))
+    else:
+      estimator.train(train_input_fn, steps=num_steps)
+      scores = estimator.evaluate(eval_input_fn)
 
-    scores = estimator.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
     self.assertIn('loss', six.iterkeys(scores))
 
diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py
new file mode 100644
index 0000000000..5348512016
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/estimator_training_test.py
@@ -0,0 +1,659 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that show Distribute Coordinator works with Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import json
+import os
+import sys
+import tempfile
+import threading
+from absl.testing import parameterized
+import numpy as np
+import six
+
+_portpicker_import_error = None
+try:
+  import portpicker  # pylint: disable=g-import-not-at-top
+except ImportError as _error:  # pylint: disable=invalid-name
+  _portpicker_import_error = _error
+  portpicker = None
+
+# pylint: disable=g-import-not-at-top
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import parameter_server_strategy
+from tensorflow.contrib.optimizer_v2 import adagrad
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import estimator_training as dc_training
+from tensorflow.python.distribute.distribute_config import DistributeConfig
+from tensorflow.python.eager import context
+from tensorflow.python.estimator import exporter as exporter_lib
+from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.estimator import training as estimator_training
+from tensorflow.python.estimator.canned import dnn_linear_combined
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export as export_lib
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary import summary_iterator
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import server_lib
+
+BATCH_SIZE = 10
+LABEL_DIMENSION = 2
+DATA = np.linspace(
+    0., 2., BATCH_SIZE * LABEL_DIMENSION, dtype=np.float32).reshape(
+        BATCH_SIZE, LABEL_DIMENSION)
+EVAL_NAME = "foo"
+EXPORTER_NAME = "saved_model_exporter"
+MAX_STEPS = 10
+
+CHIEF = dc._TaskType.CHIEF
+EVALUATOR = dc._TaskType.EVALUATOR
+WORKER = dc._TaskType.WORKER
+PS = dc._TaskType.PS
+
+original_run_distribute_coordinator = dc.run_distribute_coordinator
+
+
+# TODO(yuefengz): merge this method back to test_util.
+def _create_local_cluster(num_workers,
+                          num_ps,
+                          has_eval=False,
+                          protocol="grpc",
+                          worker_config=None,
+                          ps_config=None):
+  if _portpicker_import_error:
+    raise _portpicker_import_error  # pylint: disable=raising-bad-type
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+
+  cluster_dict = {
+      "worker": ["localhost:%s" % port for port in worker_ports],
+      "ps": ["localhost:%s" % port for port in ps_ports]
+  }
+  if has_eval:
+    cluster_dict["evaluator"] = ["localhost:%s" % portpicker.pick_unused_port()]
+
+  cs = server_lib.ClusterSpec(cluster_dict)
+
+  workers = [
+      server_lib.Server(
+          cs,
+          job_name="worker",
+          protocol=protocol,
+          task_index=ix,
+          config=worker_config,
+          start=True) for ix in range(num_workers)
+  ]
+  ps_servers = [
+      server_lib.Server(
+          cs,
+          job_name="ps",
+          protocol=protocol,
+          task_index=ix,
+          config=ps_config,
+          start=True) for ix in range(num_ps)
+  ]
+  if has_eval:
+    evals = [
+        server_lib.Server(
+            cs,
+            job_name="evaluator",
+            protocol=protocol,
+            task_index=0,
+            config=worker_config,
+            start=True)
+    ]
+  else:
+    evals = []
+
+  return workers, ps_servers, evals
+
+
+def _create_in_process_cluster(num_workers, num_ps, has_eval=False):
+  """Create an in-process cluster that consists of only standard server."""
+  # Leave some memory for cuda runtime.
+  if has_eval:
+    gpu_mem_frac = 0.7 / (num_workers + 1)
+  else:
+    gpu_mem_frac = 0.7 / num_workers
+
+  worker_config = config_pb2.ConfigProto()
+  worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
+
+  # Enable collective ops which has no impact on non-collective ops.
+  # TODO(yuefengz, tucker): removing this after we move the initialization of
+  # collective mgr to the session level.
+  worker_config.experimental.collective_group_leader = (
+      "/job:worker/replica:0/task:0")
+
+  ps_config = config_pb2.ConfigProto()
+  ps_config.device_count["GPU"] = 0
+
+  return _create_local_cluster(
+      num_workers,
+      num_ps=num_ps,
+      has_eval=has_eval,
+      worker_config=worker_config,
+      ps_config=ps_config,
+      protocol="grpc")
+
+
+def _create_cluster_spec(has_chief=False,
+                         num_workers=1,
+                         num_ps=0,
+                         has_eval=False):
+  if _portpicker_import_error:
+    raise _portpicker_import_error  # pylint: disable=raising-bad-type
+
+  cluster_spec = {}
+  if has_chief:
+    cluster_spec[CHIEF] = ["localhost:%s" % portpicker.pick_unused_port()]
+  if num_workers:
+    cluster_spec[WORKER] = [
+        "localhost:%s" % portpicker.pick_unused_port()
+        for _ in range(num_workers)
+    ]
+  if num_ps:
+    cluster_spec[PS] = [
+        "localhost:%s" % portpicker.pick_unused_port() for _ in range(num_ps)
+    ]
+  if has_eval:
+    cluster_spec[EVALUATOR] = ["localhost:%s" % portpicker.pick_unused_port()]
+  return cluster_spec
+
+
+def _bytes_to_str(maybe_bytes):
+  if isinstance(maybe_bytes, six.string_types):
+    return maybe_bytes
+  else:
+    return str(maybe_bytes, "utf-8")
+
+
+def _strip_protocol(target):
+  # cluster_spec expects "host:port" strings.
+  if "//" in target:
+    return target.split("//")[1]
+  else:
+    return target
+
+
+class DistributeCoordinatorIntegrationTest(test.TestCase,
+                                           parameterized.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    """Create a local cluster with 2 workers."""
+    cls._workers, cls._ps, cls._evals = _create_in_process_cluster(
+        num_workers=3, num_ps=2, has_eval=True)
+    cls._cluster_spec = {
+        "worker": [
+            _strip_protocol(_bytes_to_str(w.target)) for w in cls._workers
+        ],
+        "ps": [_strip_protocol(_bytes_to_str(ps.target)) for ps in cls._ps],
+        "evaluator": [
+            _strip_protocol(_bytes_to_str(e.target)) for e in cls._evals
+        ]
+    }
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+    self._event = threading.Event()
+    super(DistributeCoordinatorIntegrationTest, self).setUp()
+
+  def dataset_input_fn(self, x, y, batch_size, shuffle):
+
+    def input_fn():
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      if shuffle:
+        dataset = dataset.shuffle(batch_size)
+      dataset = dataset.repeat(100).batch(batch_size)
+      return dataset
+
+    return input_fn
+
+  def _get_exporter(self, name, fc):
+    feature_spec = feature_column.make_parse_example_spec(fc)
+    serving_input_receiver_fn = (
+        export_lib.build_parsing_serving_input_receiver_fn(feature_spec))
+    return exporter_lib.LatestExporter(
+        name, serving_input_receiver_fn=serving_input_receiver_fn)
+
+  def _extract_loss_and_global_step(self, event_folder):
+    """Returns the loss and global step in last event."""
+    event_paths = glob.glob(os.path.join(event_folder, "events*"))
+
+    loss = None
+    global_step_count = None
+
+    for e in summary_iterator.summary_iterator(event_paths[-1]):
+      current_loss = None
+      for v in e.summary.value:
+        if v.tag == "loss":
+          current_loss = v.simple_value
+
+      # If loss is not found, global step is meaningless.
+      if current_loss is None:
+        continue
+
+      current_global_step = e.step
+      if global_step_count is None or current_global_step > global_step_count:
+        global_step_count = current_global_step
+        loss = current_loss
+
+    return (loss, global_step_count)
+
+  def _get_estimator(self,
+                     train_distribute,
+                     eval_distribute,
+                     remote_cluster=None):
+    input_dimension = LABEL_DIMENSION
+    linear_feature_columns = [
+        feature_column.numeric_column("x", shape=(input_dimension,))
+    ]
+    dnn_feature_columns = [
+        feature_column.numeric_column("x", shape=(input_dimension,))
+    ]
+
+    return dnn_linear_combined.DNNLinearCombinedRegressor(
+        linear_feature_columns=linear_feature_columns,
+        dnn_hidden_units=(2, 2),
+        dnn_feature_columns=dnn_feature_columns,
+        label_dimension=LABEL_DIMENSION,
+        model_dir=self._model_dir,
+        dnn_optimizer=adagrad.AdagradOptimizer(0.001),
+        linear_optimizer=adagrad.AdagradOptimizer(0.001),
+        config=run_config_lib.RunConfig(
+            experimental_distribute=DistributeConfig(
+                train_distribute=train_distribute,
+                eval_distribute=eval_distribute,
+                remote_cluster=remote_cluster)))
+
+  def _complete_flow(self,
+                     train_distribute,
+                     eval_distribute,
+                     remote_cluster=None):
+    estimator = self._get_estimator(train_distribute, eval_distribute,
+                                    remote_cluster)
+
+    input_dimension = LABEL_DIMENSION
+    train_input_fn = self.dataset_input_fn(
+        x={"x": DATA},
+        y=DATA,
+        batch_size=BATCH_SIZE // len(train_distribute.worker_devices),
+        shuffle=True)
+    if eval_distribute:
+      eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices)
+    else:
+      eval_batch_size = BATCH_SIZE
+    eval_input_fn = self.dataset_input_fn(
+        x={"x": DATA}, y=DATA, batch_size=eval_batch_size, shuffle=False)
+
+    linear_feature_columns = [
+        feature_column.numeric_column("x", shape=(input_dimension,))
+    ]
+    dnn_feature_columns = [
+        feature_column.numeric_column("x", shape=(input_dimension,))
+    ]
+    feature_columns = linear_feature_columns + dnn_feature_columns
+
+    estimator_training.train_and_evaluate(
+        estimator,
+        estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS),
+        estimator_training.EvalSpec(
+            name=EVAL_NAME,
+            input_fn=eval_input_fn,
+            steps=None,
+            exporters=self._get_exporter(EXPORTER_NAME, feature_columns),
+            start_delay_secs=0,
+            throttle_secs=1))
+    return estimator
+
+  def _inspect_train_and_eval_events(self, estimator):
+    # Make sure nothing is stuck in limbo.
+    writer_cache.FileWriterCache.clear()
+
+    # Examine the training events. Use a range to check global step to avoid
+    # flakyness due to global step race condition.
+    training_loss, _ = self._extract_loss_and_global_step(self._model_dir)
+    self.assertIsNotNone(training_loss)
+
+    # Examine the eval events. The global step should be accurate.
+    eval_dir = os.path.join(self._model_dir, "eval_" + EVAL_NAME)
+    eval_loss, eval_global_step = self._extract_loss_and_global_step(
+        event_folder=eval_dir)
+    self.assertIsNotNone(eval_loss)
+    self.assertGreaterEqual(eval_global_step, MAX_STEPS)
+
+    # Examine the export folder.
+    export_dir = os.path.join(
+        os.path.join(self._model_dir, "export"), EXPORTER_NAME)
+    self.assertTrue(gfile.Exists(export_dir))
+
+    # Examine the ckpt for predict.
+    def predict_input_fn():
+      return dataset_ops.Dataset.from_tensor_slices({
+          "x": DATA
+      }).batch(BATCH_SIZE)
+
+    predicted_proba = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in estimator.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((BATCH_SIZE, LABEL_DIMENSION), predicted_proba.shape)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"],
+          train_distribute_cls=[
+              mirrored_strategy.MirroredStrategy,
+              parameter_server_strategy.ParameterServerStrategy
+          ],
+          eval_distribute_cls=[
+              None, mirrored_strategy.MirroredStrategy,
+              parameter_server_strategy.ParameterServerStrategy
+          ],
+          required_gpus=1))
+  def test_complete_flow_standalone_client(self, train_distribute_cls,
+                                           eval_distribute_cls):
+    try:
+      train_distribute = train_distribute_cls(num_gpus=context.num_gpus())
+    except TypeError:
+      train_distribute = train_distribute_cls(num_gpus_per_worker=2)
+
+    if eval_distribute_cls:
+      eval_distribute = eval_distribute_cls()
+    else:
+      eval_distribute = None
+
+    estimator = self._complete_flow(
+        train_distribute, eval_distribute, remote_cluster=self._cluster_spec)
+    self._inspect_train_and_eval_events(estimator)
+
+  def _mock_run_distribute_coordinator(
+      self,
+      worker_fn,
+      strategy,
+      eval_fn,
+      eval_strategy,
+      mode=dc.CoordinatorMode.STANDALONE_CLIENT,
+      cluster_spec=None,
+      session_config=None):
+    # Calls the origial `run_distribute_coordinator` method but gets task config
+    # from environment variables and then signals the caller.
+    task_type = None
+    task_id = None
+    if not cluster_spec:
+      cluster_spec = None
+      tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
+      if not cluster_spec:
+        cluster_spec = tf_config.get("cluster", {})
+        task_env = tf_config.get("task", {})
+        if task_env:
+          task_type = task_env.get("type", task_type)
+          task_id = int(task_env.get("index", task_id))
+    self._event.set()
+    original_run_distribute_coordinator(
+        worker_fn,
+        strategy,
+        eval_fn,
+        eval_strategy,
+        mode=mode,
+        cluster_spec=cluster_spec,
+        task_type=task_type,
+        task_id=task_id,
+        session_config=session_config)
+
+  def _task_thread(self, train_distribute, eval_distribute):
+    with test.mock.patch.object(dc, "run_distribute_coordinator",
+                                self._mock_run_distribute_coordinator):
+      self._complete_flow(train_distribute, eval_distribute)
+
+  def _run_task_in_thread(self, cluster_spec, task_type, task_id,
+                          train_distribute, eval_distribute):
+    if task_type:
+      tf_config = {
+          "cluster": cluster_spec,
+          "task": {
+              "type": task_type,
+              "index": task_id
+          }
+      }
+    else:
+      tf_config = {
+          "cluster": cluster_spec,
+          "task": {
+              "type": task_type,
+              "index": task_id
+          }
+      }
+    self._event.clear()
+    t = threading.Thread(
+        target=self._task_thread, args=(train_distribute, eval_distribute))
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(tf_config)}):
+      t.start()
+      self._event.wait()
+    return t
+
+  def _run_multiple_tasks_in_threads(self, cluster_spec, train_distribute,
+                                     eval_distribute):
+    threads = {}
+    for task_type in cluster_spec.keys():
+      threads[task_type] = []
+      for task_id in range(len(cluster_spec[task_type])):
+        t = self._run_task_in_thread(cluster_spec, task_type, task_id,
+                                     train_distribute, eval_distribute)
+        threads[task_type].append(t)
+    return threads
+
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"],
+          train_distribute_cls=[
+              parameter_server_strategy.ParameterServerStrategy,
+          ],
+          eval_distribute_cls=[
+              None, mirrored_strategy.MirroredStrategy,
+              parameter_server_strategy.ParameterServerStrategy
+          ],
+          required_gpus=1))
+  def test_complete_flow_indepedent_worker_between_graph(
+      self, train_distribute_cls, eval_distribute_cls):
+    train_distribute = train_distribute_cls(
+        num_gpus_per_worker=context.num_gpus())
+
+    if eval_distribute_cls:
+      eval_distribute = eval_distribute_cls()
+    else:
+      eval_distribute = None
+
+    cluster_spec = _create_cluster_spec(num_workers=3, num_ps=2, has_eval=True)
+    threads = self._run_multiple_tasks_in_threads(
+        cluster_spec, train_distribute, eval_distribute)
+    for task_type, ts in threads.items():
+      if task_type == PS:
+        continue
+      for t in ts:
+        t.join()
+
+    estimator = self._get_estimator(train_distribute, eval_distribute)
+    self._inspect_train_and_eval_events(estimator)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"],
+          train_distribute_cls=[mirrored_strategy.MirroredStrategy],
+          eval_distribute_cls=[None, mirrored_strategy.MirroredStrategy],
+          required_gpus=1))
+  def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls,
+                                                    eval_distribute_cls):
+    train_distribute = train_distribute_cls(num_gpus=context.num_gpus())
+
+    if eval_distribute_cls:
+      eval_distribute = eval_distribute_cls()
+    else:
+      eval_distribute = None
+
+    cluster_spec = _create_cluster_spec(num_workers=3, num_ps=2, has_eval=True)
+    threads = self._run_multiple_tasks_in_threads(
+        cluster_spec, train_distribute, eval_distribute)
+    threads[WORKER][0].join()
+    threads[EVALUATOR][0].join()
+
+    estimator = self._get_estimator(train_distribute, eval_distribute)
+    self._inspect_train_and_eval_events(estimator)
+
+
+TF_CONFIG_WITH_CHIEF = {
+    "cluster": {
+        "chief": ["fake_chief"],
+    },
+    "task": {
+        "type": "chief",
+        "index": 0
+    }
+}
+
+TF_CONFIG_WITH_MASTER = {
+    "cluster": {
+        "master": ["fake_master"],
+    },
+    "task": {
+        "type": "master",
+        "index": 0
+    }
+}
+
+TF_CONFIG_WITHOUT_TASK = {"cluster": {"chief": ["fake_worker"]}}
+
+
+class RunConfigTest(test.TestCase):
+
+  def test_previously_unexpected_cluster_spec(self):
+    with test.mock.patch.dict(
+        "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}):
+      run_config_lib.RunConfig(
+          experimental_distribute=DistributeConfig(
+              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+
+  def test_should_run_distribute_coordinator(self):
+    """Tests that should_run_distribute_coordinator return a correct value."""
+    # We don't use distribute coordinator for local training.
+    self.assertFalse(
+        dc_training.should_run_distribute_coordinator(
+            run_config_lib.RunConfig()))
+
+    # When `train_distribute` is not specified, don't use distribute
+    # coordinator.
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
+      self.assertFalse(
+          dc_training.should_run_distribute_coordinator(
+              run_config_lib.RunConfig()))
+
+    # When `train_distribute` is specified and TF_CONFIG is detected, use
+    # distribute coordinator.
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
+      config_with_train_distribute = run_config_lib.RunConfig(
+          experimental_distribute=DistributeConfig(
+              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+      config_with_eval_distribute = run_config_lib.RunConfig(
+          experimental_distribute=DistributeConfig(
+              eval_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+    self.assertTrue(
+        dc_training.should_run_distribute_coordinator(
+            config_with_train_distribute))
+    self.assertFalse(
+        dc_training.should_run_distribute_coordinator(
+            config_with_eval_distribute))
+
+    # With a master in the cluster, don't run distribute coordinator.
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
+      config = run_config_lib.RunConfig(
+          experimental_distribute=DistributeConfig(
+              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+    self.assertFalse(dc_training.should_run_distribute_coordinator(config))
+
+  def test_init_run_config_duplicate_distribute(self):
+    with self.assertRaises(ValueError):
+      run_config_lib.RunConfig(
+          train_distribute=mirrored_strategy.MirroredStrategy(),
+          experimental_distribute=DistributeConfig(
+              train_distribute=mirrored_strategy.MirroredStrategy()))
+
+    with self.assertRaises(ValueError):
+      run_config_lib.RunConfig(
+          eval_distribute=mirrored_strategy.MirroredStrategy(),
+          experimental_distribute=DistributeConfig(
+              eval_distribute=mirrored_strategy.MirroredStrategy()))
+
+  def test_init_run_config_none_distribute_coordinator_mode(self):
+    # We don't use distribute coordinator for local training.
+    config = run_config_lib.RunConfig(
+        train_distribute=mirrored_strategy.MirroredStrategy())
+    dc_training.init_run_config(config, {})
+    self.assertIsNone(config._distribute_coordinator_mode)
+
+    # With a master in the cluster, don't run distribute coordinator.
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
+      config = run_config_lib.RunConfig(
+          train_distribute=mirrored_strategy.MirroredStrategy())
+      self.assertIsNone(config._distribute_coordinator_mode)
+
+    # When `train_distribute` is not specified, don't use distribute
+    # coordinator.
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
+      config = run_config_lib.RunConfig()
+      self.assertFalse(hasattr(config, "_distribute_coordinator_mode"))
+
+  def test_init_run_config_independent_worker(self):
+    # When `train_distribute` is specified and TF_CONFIG is detected, use
+    # distribute coordinator with INDEPENDENT_WORKER mode.
+    with test.mock.patch.dict("os.environ",
+                              {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
+      config = run_config_lib.RunConfig(
+          train_distribute=mirrored_strategy.MirroredStrategy())
+    self.assertEqual(config._distribute_coordinator_mode,
+                     dc.CoordinatorMode.INDEPENDENT_WORKER)
+
+  def test_init_run_config_standalone_client(self):
+    # When `train_distribute` is specified, TF_CONFIG is detected and
+    # `experimental.remote_cluster` is set use distribute coordinator with
+    # STANDALONE_CLIENT mode.
+    config = run_config_lib.RunConfig(
+        train_distribute=mirrored_strategy.MirroredStrategy(),
+        experimental_distribute=DistributeConfig(
+            remote_cluster={"chief": ["fake_worker"]}))
+    self.assertEqual(config._distribute_coordinator_mode,
+                     dc.CoordinatorMode.STANDALONE_CLIENT)
+
+
+if __name__ == "__main__":
+  with test.mock.patch.object(sys, "exit", os._exit):
+    test.main()
diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD
index cbfd178502..84b106545e 100644
--- a/tensorflow/contrib/distribute/python/examples/BUILD
+++ b/tensorflow/contrib/distribute/python/examples/BUILD
@@ -19,9 +19,20 @@ py_binary(
 )
 
 py_binary(
-    name = "simple_tfkeras_example",
+    name = "keras_model_with_estimator",
     srcs = [
-        "simple_tfkeras_example.py",
+        "keras_model_with_estimator.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_binary(
+    name = "keras_mnist",
+    srcs = [
+        "keras_mnist.py",
     ],
     deps = [
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
new file mode 100644
index 0000000000..a20069c4fe
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -0,0 +1,126 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An example training a Keras Model using MirroredStrategy and native APIs."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+NUM_CLASSES = 10
+
+
+def get_input_datasets():
+  """Downloads the MNIST dataset and creates train and eval dataset objects.
+
+  Returns:
+    Train dataset, eval dataset and input shape.
+
+  """
+  # input image dimensions
+  img_rows, img_cols = 28, 28
+
+  # the data, split between train and test sets
+  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+
+  if tf.keras.backend.image_data_format() == 'channels_first':
+    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
+    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
+    input_shape = (1, img_rows, img_cols)
+  else:
+    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
+    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
+    input_shape = (img_rows, img_cols, 1)
+
+  x_train = x_train.astype('float32')
+  x_test = x_test.astype('float32')
+  x_train /= 255
+  x_test /= 255
+
+  # convert class vectors to binary class matrices
+  y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES)
+  y_test = tf.keras.utils.to_categorical(y_test, NUM_CLASSES)
+
+  # train dataset
+  train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+  train_ds = train_ds.repeat()
+  train_ds = train_ds.shuffle(100)
+  train_ds = train_ds.batch(64)
+
+  # eval dataset
+  eval_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+  eval_ds = eval_ds.repeat()
+  eval_ds = eval_ds.shuffle(100)
+  eval_ds = eval_ds.batch(64)
+
+  return train_ds, eval_ds, input_shape
+
+
+def get_model(input_shape):
+  """Builds a Sequential CNN model to recognize MNIST digits.
+
+  Args:
+    input_shape: Shape of the input depending on the `image_data_format`.
+
+  Returns:
+    a Keras model
+
+  """
+  # Define a CNN model to recognize MNIST digits.
+  model = tf.keras.models.Sequential()
+  model.add(tf.keras.layers.Conv2D(32, kernel_size=(3, 3),
+                                   activation='relu',
+                                   input_shape=input_shape))
+  model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
+  model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
+  model.add(tf.keras.layers.Dropout(0.25))
+  model.add(tf.keras.layers.Flatten())
+  model.add(tf.keras.layers.Dense(128, activation='relu'))
+  model.add(tf.keras.layers.Dropout(0.5))
+  model.add(tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+  return model
+
+
+def main(_):
+  # Build the train and eval datasets from the MNIST data. Also return the
+  # input shape which is constructed based on the `image_data_format`
+  # i.e channels_first or channels_last.
+  train_ds, eval_ds, input_shape = get_input_datasets()
+  model = get_model(input_shape)
+
+  # Instantiate the MirroredStrategy object. If we don't specify `num_gpus` or
+  # the `devices` argument then all the GPUs available on the machine are used.
+  strategy = tf.contrib.distribute.MirroredStrategy()
+
+  # Compile the model by passing the distribution strategy object to the
+  # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed
+  # based on the strategy instantiated.
+  model.compile(loss=tf.keras.losses.categorical_crossentropy,
+                optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
+                metrics=['accuracy'],
+                distribute=strategy)
+
+  # Train the model with the train dataset.
+  model.fit(x=train_ds, epochs=20, steps_per_epoch=310)
+
+  # Evaluate the model with the eval dataset.
+  score = model.evaluate(eval_ds, steps=10, verbose=0)
+  print('Test loss:', score[0])
+  print('Test accuracy:', score[1])
+
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py b/tensorflow/contrib/distribute/python/examples/keras_model_with_estimator.py
index 518ec9c423..8d117eb7e8 100644
--- a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_model_with_estimator.py
@@ -42,19 +42,19 @@ def main(args):
   model_dir = args[1]
   print('Using %s to store checkpoints.' % model_dir)
 
-  # Define tf.keras Model.
+  # Define a Keras Model.
   model = tf.keras.Sequential()
   model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)))
   model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
 
-  # Compile tf.keras Model.
+  # Compile the model.
   optimizer = tf.train.GradientDescentOptimizer(0.2)
   model.compile(loss='binary_crossentropy', optimizer=optimizer)
   model.summary()
   tf.keras.backend.set_learning_phase(True)
 
-  # Define a DistributionStrategy and convert the tf.keras Model to a
-  # tf.Estimator that utilizes the DistributionStrategy.
+  # Define a DistributionStrategy and convert the Keras Model to an
+  # Estimator that utilizes the DistributionStrategy.
   strategy = tf.contrib.distribute.MirroredStrategy(
       ['/device:GPU:0', '/device:GPU:1'])
   config = tf.estimator.RunConfig(
@@ -62,7 +62,7 @@ def main(args):
   keras_estimator = tf.keras.estimator.model_to_estimator(
       keras_model=model, config=config, model_dir=model_dir)
 
-  # Train and evaluate the tf.Estimator.
+  # Train and evaluate the model.
   keras_estimator.train(input_fn=input_fn, steps=10)
   eval_result = keras_estimator.evaluate(input_fn=input_fn)
   print('Eval result: {}'.format(eval_result))
diff --git a/tensorflow/contrib/distribute/python/input_ops_test.py b/tensorflow/contrib/distribute/python/input_ops_test.py
index 16179c3a49..c5acb7ced4 100644
--- a/tensorflow/contrib/distribute/python/input_ops_test.py
+++ b/tensorflow/contrib/distribute/python/input_ops_test.py
@@ -91,7 +91,7 @@ class AutoShardDatasetTest(test.TestCase):
   def _verifySimpleShardingOutput(self, dataset, record_fn):
     iterator = dataset.make_one_shot_iterator()
     next_element = iterator.get_next()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
           self.assertAllEqual(record_fn(r, f), sess.run(next_element))
@@ -150,7 +150,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     iterator = dataset.make_one_shot_iterator()
     next_element = iterator.get_next()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual, expected = [], []
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
@@ -182,7 +182,7 @@ class AutoShardDatasetTest(test.TestCase):
     # Verify output.
     iterator = dataset.make_one_shot_iterator()
     next_element = iterator.get_next()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual = []
       num_iterations = (self._num_files * self._num_records * num_epochs) // (
           self._num_shards * batch_size)
@@ -218,7 +218,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     iterator = dataset.make_one_shot_iterator()
     next_element = iterator.get_next()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
           self.assertAllEqual(self._record(r, f), sess.run(next_element))
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 4facd72d12..d39fd57294 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -116,7 +116,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
                                       model_dir=self._base_dir,
                                       train_distribute=dist,
                                       eval_distribute=dist)
-    with self.test_session():
+    with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
       before_eval_results = est_keras.evaluate(
@@ -139,7 +139,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
                                       train_distribute=dist)
-    with self.test_session():
+    with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
       before_eval_results = est_keras.evaluate(
@@ -163,7 +163,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
                                       train_distribute=dist)
-    with self.test_session():
+    with self.cached_session():
       est_keras = keras_lib.model_to_estimator(keras_model=keras_model,
                                                config=config)
       with self.assertRaisesRegexp(ValueError,
@@ -178,7 +178,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
 class TestWithDistributionStrategy(test.TestCase):
 
   def test_validating_dataset_input_tensors_with_shape_mismatch(self):
-    with self.test_session():
+    with self.cached_session():
       strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
                                                      '/device:CPU:0'])
       a = constant_op.constant([1, 2], shape=(1, 2))
@@ -197,7 +197,7 @@ class TestWithDistributionStrategy(test.TestCase):
               strategy, x, y)
 
   def test_validating_dataset_input_tensors_with_dtype_mismatch(self):
-    with self.test_session():
+    with self.cached_session():
       strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
                                                      '/device:CPU:0'])
       a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
@@ -216,7 +216,7 @@ class TestWithDistributionStrategy(test.TestCase):
               strategy, x, y)
 
   def test_calling_model_on_same_dataset(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3,), name='input')
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
@@ -242,7 +242,7 @@ class TestWithDistributionStrategy(test.TestCase):
       model.predict(dataset, steps=2)
 
   def test_fit_with_tuple_and_dict_dataset_inputs(self):
-    with self.test_session():
+    with self.cached_session():
       a = keras.layers.Input(shape=(3,), name='input_a')
       b = keras.layers.Input(shape=(3,), name='input_b')
 
@@ -283,7 +283,7 @@ class TestWithDistributionStrategy(test.TestCase):
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
   def test_fit_eval_and_predict_methods_on_dataset(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3,), name='input')
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
@@ -320,7 +320,7 @@ class TestWithDistributionStrategy(test.TestCase):
       def __call__(self, y_true, y_pred):
         return y_pred - y_true
 
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3,), name='input')
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
@@ -336,7 +336,7 @@ class TestWithDistributionStrategy(test.TestCase):
         model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
 
   def test_unsupported_features(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3,), name='input')
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
@@ -367,8 +367,8 @@ class TestWithDistributionStrategy(test.TestCase):
       # Test with sample weight.
       sample_weight = np.random.random((10,))
       with self.assertRaisesRegexp(
-          NotImplementedError, 'sample_weight is currently not supported when '
-                               'using DistributionStrategy.'):
+          NotImplementedError, '`sample_weight` is currently not supported '
+                               'when using DistributionStrategy.'):
         model.fit(
             dataset,
             epochs=1,
@@ -389,7 +389,7 @@ class TestWithDistributionStrategy(test.TestCase):
         model.predict(dataset, verbose=0)
 
   def test_calling_with_unsupported_predefined_callbacks(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3,), name='input')
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
@@ -428,7 +428,7 @@ class TestWithDistributionStrategy(test.TestCase):
                   callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)])
 
   def test_dataset_input_shape_validation(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3,), name='input')
       y = keras.layers.Dense(4, name='dense')(x)
       model = keras.Model(x, y)
@@ -465,7 +465,7 @@ class TestWithDistributionStrategy(test.TestCase):
     # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare
     # meaningful values. Currently we don't pass the learning phase if the
     # Lambda layer uses the learning phase.
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(16,), name='input')
       y = keras.layers.Dense(16)(x)
       z = keras.layers.Dropout(0.9999)(y)
@@ -498,7 +498,7 @@ class TestWithDistributionStrategy(test.TestCase):
 class LossMaskingWithDistributionStrategyTest(test.TestCase):
 
   def test_masking(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       x = np.array([[[1], [1]], [[0], [0]]])
       model = keras.models.Sequential()
@@ -523,7 +523,7 @@ class LossMaskingWithDistributionStrategyTest(test.TestCase):
 class NormalizationLayerWithDistributionStrategyTest(test.TestCase):
 
   def test_batchnorm_correctness(self):
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
       model.add(norm)
@@ -550,7 +550,7 @@ class NormalizationLayerWithDistributionStrategyTest(test.TestCase):
 class CorrectnessWithDistributionStrategyTest(test.TestCase):
 
   def test_correctness(self):
-    with self.test_session():
+    with self.cached_session():
       keras.backend.set_image_data_format('channels_last')
       num_samples = 10000
       x_train = np.random.rand(num_samples, 1)
@@ -565,8 +565,7 @@ class CorrectnessWithDistributionStrategyTest(test.TestCase):
       dataset_with = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
       dataset_with = dataset_with.batch(32)
       strategy = mirrored_strategy.MirroredStrategy(devices=['/device:CPU:0',
-                                                             '/device:GPU:0'],
-                                                    prefetch_on_device=False)
+                                                             '/device:GPU:0'])
 
       model.compile(loss=keras.losses.mean_squared_error,
                     optimizer=gradient_descent.GradientDescentOptimizer(0.5),
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index 2f3d6bdd3f..8163494c8e 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -68,6 +68,8 @@ def _regression_dataset_fn():
       "predictions": [1., .75, .25, 0.]}).repeat()
 
 
+# TODO(priyag): Add TPU Strategy to this once metrics aggregate correctly using
+# TowerLocalVariables on TPUs. Submit http://cl/208914352.
 def all_combinations():
   return combinations.combine(
       distribution=[combinations.default_strategy,
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index aa7a61bb3b..bdac4fb58c 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -56,11 +56,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      def step_fn(ctx, inputs):
+      def step_fn(ctx, *inputs):
         del ctx  # Unused
         return distribution.group(
             distribution.call_for_each_tower(
-                model_fn, inputs, run_concurrently=layer.built))
+                model_fn, *inputs, run_concurrently=layer.built))
 
       iterator = distribution.distribute_dataset(
           dataset_fn).make_one_shot_iterator()
@@ -71,7 +71,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
       self.evaluate(variables_lib.global_variables_initializer())
 
@@ -108,7 +108,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
                 model_fn, iterator.get_next(), run_concurrently=layer.built))
 
       if not context.executing_eagerly():
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
         self.evaluate(variables_lib.global_variables_initializer())
 
@@ -153,11 +153,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
-      def step_fn(ctx, inputs):
+      def step_fn(ctx, *inputs):
         del ctx  # Unused
         return distribution.group(
             distribution.call_for_each_tower(
-                model_fn, inputs, run_concurrently=layer.built))
+                model_fn, *inputs, run_concurrently=layer.built))
 
       iterator = distribution.distribute_dataset(
           dataset_fn).make_one_shot_iterator()
@@ -168,7 +168,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
       self.evaluate(variables_lib.global_variables_initializer())
 
@@ -231,11 +231,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       if isinstance(distribution, mirrored_strategy.MirroredStrategy):
         self.assertFalse(distribution._prefetch_on_device)
 
-      def step_fn(ctx, inputs):
+      def step_fn(ctx, *inputs):
         del ctx  # Unused
         fetches = distribution.unwrap(
             distribution.call_for_each_tower(
-                model_fn, inputs, run_concurrently=batchnorm.built))
+                model_fn, *inputs, run_concurrently=batchnorm.built))
         if update_ops_in_cross_tower_mode:
           fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
         return control_flow_ops.group(fetches)
@@ -249,7 +249,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
       self.evaluate(variables_lib.global_variables_initializer())
 
@@ -328,9 +328,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
         return dataset_ops.Dataset.zip((features, labels)).repeat()
 
-      def step_fn(ctx, inputs):
+      def step_fn(ctx, x, y):
         del ctx  # Unused
-        x, y = inputs
         return distribution.group(
             distribution.call_for_each_tower(
                 model_fn, x, y, run_concurrently=False))
@@ -344,7 +343,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
       self.evaluate(variables_lib.global_variables_initializer())
 
@@ -417,9 +416,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         output_context.set_non_tensor_output(key1, value1)
         return (train_op, loss)
 
-      def step_fn(output_context, inputs):
+      def step_fn(output_context, *inputs):
         (train_op, loss) = distribution.call_for_each_tower(
-            model_fn, output_context, inputs, run_concurrently=False)
+            model_fn, output_context, *inputs, run_concurrently=False)
         output_context.set_last_step_output(
             name="cross_tower_loss_agg",
             output=loss,
@@ -467,7 +466,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
       self.evaluate(variables_lib.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index e3376a0636..e87b48ba41 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -19,12 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+from functools import partial
 import threading
 
 from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
 from tensorflow.contrib.distribute.python import shared_variable_creator
 from tensorflow.contrib.distribute.python import values
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
@@ -274,6 +276,9 @@ def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):
     else:
       result = values.MirroredVariable(index, index[devices[0]], aggregation)
 
+  # Add the wrapped variable to the requested collections.
+  # The handling of eager mode and the global step matches
+  # ResourceVariable._init_from_args().
   if not context.executing_eagerly():
     g = ops.get_default_graph()
     # If "trainable" is True, next_creator() will add the member variables
@@ -287,13 +292,55 @@ def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):
       for v in index.values():
         l.remove(v)
     g.add_to_collections(collections, result)
+  elif ops.GraphKeys.GLOBAL_STEP in collections:
+    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result)
+
   return result
 
 
 class MirroredStrategy(distribute_lib.DistributionStrategy):
-  """Mirrors vars to distribute across multiple devices on a single machine.
+  """Mirrors vars to distribute across multiple devices and machines.
+
+  This strategy uses one tower per device and sync replication for its multi-GPU
+  version.
+
+  When `cluster_spec` is given by the `configure` method., it turns into the
+  mulit-worker version that works on multiple workers with in-graph replication.
+  Note: `configure` will be called by higher-level APIs if running in
+  distributed environment.
+
+  There are several important concepts for distributed TensorFlow, e.g.
+  `client`, `job`, 'task', `cluster`, `in-graph replication` and
+  'synchronous training' and they have already been defined in the
+  [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
+  The distribution strategy inherits these concepts as well and in addition to
+  that we also clarify several more concepts:
+    * **In-graph replication**: the `client` creates a single `tf.Graph` that
+    specifies tasks for devices on all workers. The `client` then creates a
+    client session which will talk to the `master` service of a `worker`. Then
+    the `master` will partition the graph and distribute the work to all
+    participating workers.
+    * **Worker**: A `worker` is a TensorFlow `task` that usually maps to one
+    physical machine. We will have multiple `worker`s with different `task`
+    index. They all do similar things except for one worker checkpointing model
+    variables, writing summaries, etc. in addition to its ordinary work.
+
+  The multi-worker version of this class maps one tower to one device on a
+  worker. It mirrors all model variables on all towers. For example, if you have
+  two `worker`s and each `worker` has 4 GPUs, it will create 8 copies of the
+  model variables on these 8 GPUs. Then like in MirroredStrategy, each tower
+  performs their computation with their own copy of variables unless in
+  cross-tower model where variable or tensor reduction happens.
 
-  This strategy uses one tower per device and sync replication.
+  Args:
+    devices: a list of device strings.
+    num_gpus: number of GPUs. For local training, either specify `devices` or
+      `num_gpus`. In distributed training, this must be specified as number of
+      GPUs on each worker.
+    cross_tower_ops: optional, a descedant of `CrossTowerOps`. If this is not
+      set, the `configure` method will try to find the best one.
+    prefetch_on_device: optional boolean to specify whether to prefetch input
+      data to devices.
   """
 
   def __init__(self,
@@ -302,13 +349,73 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                cross_tower_ops=None,
                prefetch_on_device=None):
     super(MirroredStrategy, self).__init__()
+
+    self._cross_tower_ops = cross_tower_ops
+    self._prefetch_on_device = prefetch_on_device
+    # Rememeber num GPUs which might be needed by `configure` method.
+    self._num_gpus = num_gpus
+
+    self._initialize_local(num_gpus, devices)
+
+  def _initialize_local(self, num_gpus, devices):
+    """Initializes the object for local training."""
+    self._cluster_spec = None
     # Convert `num_gpus` into `devices`, shouldn't specify both.
     if devices is None:
       if num_gpus is None:
         num_gpus = context.num_gpus()
-      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
+      if num_gpus == 0:
+        devices = ["/device:CPU:0"]
+      else:
+        devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
     elif num_gpus is not None:
       raise ValueError("Must only specify one of `devices` and `num_gpus`.")
+    self._num_gpus = num_gpus
+    # TODO(yuefengz): consider setting the default device.
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = [device_util.resolve(d) for d in devices]
+    self._canonical_device_set = set(self._devices)
+    self._device_index = values.PerDevice({d: i for i, d in enumerate(devices)})
+
+  def _initialize_multi_worker(self, num_gpus, cluster_spec):
+    """Initializes the object for multi-worker training."""
+    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+    self._cluster_spec = cluster_spec
+
+    self._workers = []
+    for job in ["chief", "worker"]:
+      for task in range(len(cluster_spec.as_dict().get(job, []))):
+        self._workers.append("/job:%s/task:%d" % (job, task))
+
+    if num_gpus is None:
+      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
+    if num_gpus > 0:
+      self._worker_device_map = {
+          worker: [
+              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
+              for gpu in range(num_gpus)
+          ] for worker in self._workers
+      }
+    else:
+      self._worker_device_map = {
+          worker: [device_util.canonicalize(worker, "/device:CPU:0")]
+          for worker in self._workers
+      }
+
+    devices = nest.flatten(self._worker_device_map)
+
+    # Setting `_default_device` will add a device scope in the
+    # distribution.scope. We set the default device to the first worker. When
+    # users specify device under distribution.scope by
+    #   with tf.device("/cpu:0"):
+    #     ...
+    # their ops will end up on the cpu device of its first worker, e.g.
+    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
+    self._default_device = self._workers[0]
 
     assert devices, "Must specify at least one device."
     assert len(set(devices)) == len(devices), (
@@ -318,9 +425,6 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     self._canonical_device_set = set(self._devices)
     self._device_index = values.PerDevice(
         {d: i for i, d in enumerate(devices)})
-    self._cross_tower_ops = cross_tower_ops
-    self._prefetch_on_device = prefetch_on_device
-    # TODO(yuefengz): consider setting the default device.
 
   def _create_variable(self, next_creator, *args, **kwargs):
     """Create a mirrored variable. See `DistributionStrategy.scope`."""
@@ -357,9 +461,14 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                                      **kwargs)
 
   def distribute_dataset(self, dataset_fn):
-    return values.PerDeviceDataset(
-        self._call_dataset_fn(dataset_fn), self._devices,
-        self._prefetch_on_device)
+    if self._cluster_spec:
+      return values.MultiWorkerDataset(
+          partial(self._call_dataset_fn, dataset_fn), self._worker_device_map,
+          self._prefetch_on_device)
+    else:
+      return values.PerDeviceDataset(
+          self._call_dataset_fn(dataset_fn), self._devices,
+          self._prefetch_on_device)
 
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   def _run_steps_on_dataset(self, fn, iterator, iterations,
@@ -372,7 +481,10 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     def body(i, *args):
       """A wrapper around `fn` to create the while loop body."""
       del args
-      fn_result = fn(ctx, iterator.get_next())
+      fn_inputs = iterator.get_next()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, *fn_inputs)
       for (name, output) in ctx.last_step_outputs.items():
         # Convert all outputs to tensors, potentially from `DistributedValues`.
         ctx.last_step_outputs[name] = self.unwrap(output)
@@ -380,12 +492,21 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       with ops.control_dependencies([fn_result]):
         return [i + 1] + flat_last_step_outputs
 
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
     cond = lambda i, *args: i < iterations
     i = constant_op.constant(0)
     loop_result = control_flow_ops.while_loop(
         cond, body, [i] + initial_loop_values, name="",
         parallel_iterations=1, back_prop=False, swap_memory=False,
         return_same_structure=True)
+    del self._outer_control_flow_context
 
     ctx.run_op = control_flow_ops.group(loop_result)
 
@@ -432,10 +553,22 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     # in addition to PerDevice data.
     return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()})
 
-  def configure(self, session_config=None):
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    del task_type, task_id
+    if cluster_spec:
+      self._initialize_multi_worker(self._num_gpus, cluster_spec)
+
     if self._cross_tower_ops is None:
-      self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
-          self._devices, session_config=session_config)
+      if self._cluster_spec:
+        self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
+            self._workers, self._num_gpus)
+      else:
+        self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
+            self._devices, session_config=session_config)
 
   def _get_cross_tower_ops(self):
     if self._cross_tower_ops is None:
@@ -520,6 +653,22 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   def parameter_devices(self):
     return list(self._devices)
 
+  @property
+  def between_graph(self):
+    return False
+
+  @property
+  def should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
   def non_slot_devices(self, var_list):
     del var_list
     return list(self._devices)
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index e064cfe37d..a12ff662db 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import sys
 
 from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import config_pb2
@@ -40,7 +41,8 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.training import server_lib
 
 
 GPU_TEST = "test_gpu" in sys.argv[0]
@@ -164,7 +166,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       # This variable should be created only once across the threads because of
       # special variable_creator functions used by `dist.call_for_each_tower`.
       v = variable_scope.variable(1.0, name="foo")
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -181,7 +183,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     def model_fn():
       v = variable_scope.variable(1.0)
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -201,7 +203,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       vs = []
       for i in range(5):
         vs.append(variable_scope.variable(1.0, name="foo" + str(i)))
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return vs
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -223,7 +225,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       vs.append(variable_scope.variable(1.0, name="foo_1/bar"))
       vs.append(variable_scope.variable(1.0, name="foo_1/bar_1"))
       vs.append(variable_scope.variable(1.0, name="foo/bar_1"))
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return vs
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -245,7 +247,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     def model_fn(device_id):
       v = variable_scope.variable(1.0, name="foo_" + str(device_id))
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -268,7 +270,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         layer2 = core.Dense(1)
         layer2(features)
         # This will pause the current thread, and execute the other thread.
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         layer3 = core.Dense(1)
         layer3(features)
         return [(layer1.kernel, layer1.bias),
@@ -300,7 +303,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.variable(1.0, name="var1")
         # This will pause the current thread, and execute the other thread.
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         v2 = variable_scope.variable(
             1.0,
             name="var2",
@@ -343,7 +347,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.get_variable("var1", [1])
         # This will pause the current thread, and execute the other thread.
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         v2 = variable_scope.get_variable(
             "var2", [1],
             synchronization=variable_scope.VariableSynchronization.ON_READ,
@@ -453,7 +458,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     def model_fn():
       v = variable_scope.variable(1.0, name="foo")
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -470,7 +475,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     def model_fn(name):
       v = variable_scope.variable(1.0, name=name)
-      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
       return v
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -570,7 +575,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     def model_fn():
       with ops.name_scope("foo"):
         a = constant_op.constant(1.0, name="a")
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         b = constant_op.constant(1.0, name="b")
       return a, b
 
@@ -591,7 +597,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     def model_fn():
       with ops.name_scope(None, "foo"):
         a = constant_op.constant(1.0, name="a")
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
         b = constant_op.constant(2.0, name="b")
       return a, b
 
@@ -619,7 +626,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     def model_fn():
       b = variable_scope.variable(1.0, name="b")
       with ops.name_scope("foo"):
-        c = distribute_lib.get_tower_context().merge_call(in_cross_tower)
+        c = distribution_strategy_context.get_tower_context().merge_call(
+            in_cross_tower)
       return b, c
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -651,7 +659,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     def model_fn():
       b = variable_scope.get_variable("b", [1])
       with ops.name_scope("foo"):
-        c = distribute_lib.get_tower_context().merge_call(in_cross_tower)
+        c = distribution_strategy_context.get_tower_context().merge_call(
+            in_cross_tower)
       return b, c
 
     dist = mirrored_strategy.MirroredStrategy(
@@ -833,8 +842,9 @@ class MirroredVariableUpdateTest(test.TestCase):
       self.assertEquals(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
-        value = math_ops.cast(distribute_lib.get_tower_context().tower_id,
-                              mirrored_var.dtype)
+        value = math_ops.cast(
+            distribution_strategy_context.get_tower_context().tower_id,
+            mirrored_var.dtype)
         return mirrored_var.assign(value)
 
       self.evaluate(dist.unwrap(dist.call_for_each_tower(
@@ -878,8 +888,18 @@ class MirroredVariableUpdateTest(test.TestCase):
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
       self.assertEquals(1.0, self.evaluate(mirrored_var))
-      mirrored_var_result = self.evaluate(mirrored_var.assign_add(6.0))
+
+      # read_value == True
+      mirrored_var_result = self.evaluate(
+          mirrored_var.assign_add(6.0, read_value=True))
       self.assertEquals(7.0, mirrored_var_result)
+      self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+
+      # read_value == False
+      self.evaluate(mirrored_var.assign_add(2.0, read_value=False))
+      self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
 
   @test_util.run_in_graph_and_eager_modes(config=config)
   def testAssignAddMirroredVarTowerContext(self):
@@ -898,8 +918,9 @@ class MirroredVariableUpdateTest(test.TestCase):
       self.assertEquals(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
-        value = math_ops.cast(distribute_lib.get_tower_context().tower_id,
-                              mirrored_var.dtype)
+        value = math_ops.cast(
+            distribution_strategy_context.get_tower_context().tower_id,
+            mirrored_var.dtype)
         return mirrored_var.assign_add(value)
 
       self.evaluate(dist.unwrap(dist.call_for_each_tower(
@@ -945,6 +966,8 @@ class MirroredVariableUpdateTest(test.TestCase):
       self.assertEquals(5.0, self.evaluate(mirrored_var))
       mirrored_var_result = self.evaluate(mirrored_var.assign_sub(2.0))
       self.assertEquals(3.0, mirrored_var_result)
+      self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
 
   @test_util.run_in_graph_and_eager_modes(config=config)
   def testAssignSubMirroredVarTowerContext(self):
@@ -963,8 +986,9 @@ class MirroredVariableUpdateTest(test.TestCase):
       self.assertEquals(5.0, self.evaluate(mirrored_var))
 
       def model_fn():
-        value = math_ops.cast(distribute_lib.get_tower_context().tower_id,
-                              mirrored_var.dtype)
+        value = math_ops.cast(
+            distribution_strategy_context.get_tower_context().tower_id,
+            mirrored_var.dtype)
         return mirrored_var.assign_sub(value)
 
       self.evaluate(dist.unwrap(dist.call_for_each_tower(
@@ -1234,5 +1258,39 @@ class MirroredStrategyDefunTest(test.TestCase):
     self._call_and_check(fn1, [factors], expected_result, [fn1])
 
 
+class MultiWorkerMirroredStrategyTest(
+    multi_worker_test_base.MultiWorkerTestBase,
+    strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "worker": ["/job:worker/task:0", "/job:worker/task:1"]
+    })
+    strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
+    strategy.configure(cluster_spec=cluster_spec)
+    return strategy
+
+  def testMinimizeLossGraph(self):
+    self._test_minimize_loss_graph(self._get_distribution_strategy(),
+                                   learning_rate=0.05)
+
+
+class MultiWorkerMirroredStrategyTestWithChief(
+    multi_worker_test_base.MultiWorkerTestBase,
+    strategy_test_lib.DistributionTestBase):
+
+  @classmethod
+  def setUpClass(cls):
+    """Create a local cluster with 2 workers and 1 chief."""
+    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
+        num_workers=2, num_ps=0, has_chief=True)
+    cls._default_target = "grpc://" + cls._cluster_spec["chief"][0]
+
+  def testMinimizeLossGraph(self):
+    strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
+    strategy.configure(cluster_spec=self._cluster_spec)
+    self._test_minimize_loss_graph(strategy, learning_rate=0.05)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
index a066adf124..969e126956 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
@@ -22,9 +22,11 @@ from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 
 
 class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
@@ -60,6 +62,7 @@ class VariableCreatorStackTest(test.TestCase):
 
     def model_fn(device_id):
       assert isinstance(device_id, int)
+
       def thread_creator_fn(next_creator, *args, **kwargs):
         return next_creator(*args, **kwargs) + ":thread_" + str(device_id)
 
@@ -68,7 +71,8 @@ class VariableCreatorStackTest(test.TestCase):
         v = variable_scope.variable(1.0)
 
         # This will pause the current thread, and execute the other thread.
-        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        distribution_strategy_context.get_tower_context().merge_call(
+            lambda _: _)
       return v
 
     def main_thread_creator(next_creator, *args, **kwargs):
@@ -85,5 +89,21 @@ class VariableCreatorStackTest(test.TestCase):
       self.assertEquals(expected, result)
 
 
+class MultiWorkerMirroredStrategyTest(test.TestCase):
+
+  def testDeviceScope(self):
+    """Test the device scope of multi-worker MirroredStrategy."""
+    with context.graph_mode():
+      strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
+      strategy.configure(
+          cluster_spec={"worker": ["/job:worker/task:0", "/job:worker/task:1"]})
+      with strategy.scope():
+        a = constant_op.constant(1.)
+        with ops.device("/cpu:0"):
+          b = constant_op.constant(1.)
+        self.assertEqual(a.device, "/job:worker/task:0")
+        self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/monitor_test.py b/tensorflow/contrib/distribute/python/monitor_test.py
index 2892ce4394..16be839e1d 100644
--- a/tensorflow/contrib/distribute/python/monitor_test.py
+++ b/tensorflow/contrib/distribute/python/monitor_test.py
@@ -45,7 +45,7 @@ class MonitorTest(test.TestCase, parameterized.TestCase):
       if context.executing_eagerly():
         monitor = monitor_lib.Monitor(single_loss_step, None)
       else:
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           monitor = monitor_lib.Monitor(single_loss_step, sess)
 
       monitor.run_steps(1)
diff --git a/tensorflow/contrib/distribute/python/multi_worker_strategy.py b/tensorflow/contrib/distribute/python/multi_worker_strategy.py
deleted file mode 100644
index cbfe5df61d..0000000000
--- a/tensorflow/contrib/distribute/python/multi_worker_strategy.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Classes implementing a mirrored DistributionStrategy for multiple workers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import partial
-
-from tensorflow.contrib.distribute.python import values
-from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
-from tensorflow.core.protobuf import cluster_pb2
-from tensorflow.python.training import device_util
-from tensorflow.python.training import server_lib
-from tensorflow.python.util import nest
-
-
-# TODO(yuefengz): support between-graph replication.
-# TODO(yuefengz): merge this class into its base class.
-# TODO(yuefengz): in some cases, we probably want to use configure method to
-# configure this class.
-# TODO(yuefengz): MirroredStrategy.worker_devices may be confusing after the
-# class is introduced.
-class MultiWorkerMirroredStrategy(MirroredStrategy):
-  """Mirrored strategy that works on multiple workers with in-graph replication.
-
-  There are several important concepts for distributed TensorFlow, e.g.
-  `client`, `job`, 'task', `cluster`, `in-graph replication` and
-  'synchronous training' and they have already been defined in the
-  [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
-  The distribution strategy inherits these concepts as well and in addition to
-  that we also clarify several more concepts:
-    * **In-graph replication**: the `client` creates a single `tf.Graph` that
-    specifies tasks for devices on all workers. The `client` then creates a
-    client session which will talk to the `master` service of a `worker`. Then
-    the `master` will partition the graph and distribute the work to all
-    participating workers.
-    * **Worker**: A `worker` is a TensorFlow `task` that usually maps to one
-    physical machine. We will have multiple `worker`s with different `task`
-    index. They all do similar things except for one worker checkpointing model
-    variables, writing summaries, etc. in addition to its ordinary work.
-
-  This class maps one tower to one device on a worker. It mirrors all model
-  variables on all towers. For example, if you have two `worker`s and each
-  `worker` has 4 GPUs, it will create 8 copies of the model variables on these 8
-  GPUs. Then like in MirroredStrategy, each tower performs their computation
-  with their own copy of variables unless in cross-tower model where variable or
-  tensor reduction happens.
-  """
-
-  def __init__(self,
-               num_gpus_per_worker=1,
-               worker_job_name=None,
-               num_workers=None,
-               cluster=None,
-               cross_tower_ops=None,
-               prefetch_on_device=None):
-    """Initialize the strategy object.
-
-    Args:
-      num_gpus_per_worker: number of GPUs per work. If it is zero, the local
-        CPU will be used.
-      worker_job_name: the job name for `worker`, typically just 'worker'.
-      num_workers: the number of workers. If it is 0, it regenerates to
-        single-worker MirroredStrategy.
-      cluster: a `tf.train.ClusterSpec` object or a dict that can be used to
-        construct a `tf.train.ClusterSpec` object or a `tf.train.ClusterDef`
-        proto buffer. It is an alternative way to initialize this object.
-      cross_tower_ops: the cross tower ops to use. If None, a default one will
-        be used. If configure method is called, a best one for the configuration
-        will be chosen.
-      prefetch_on_device: a boolean to specify whether to prefetech input to
-        each worker's devices.
-
-    Raises:
-      ValueError: if got an unexpected `cluster`.
-    """
-    if cluster is None:
-      self._workers = [
-          '/job:%s/task:%d' % (worker_job_name, task_index)
-          for task_index in range(num_workers)
-      ]
-    else:
-      if isinstance(cluster, (dict, cluster_pb2.ClusterDef)):
-        cluster_spec = server_lib.ClusterSpec(cluster)
-      elif isinstance(cluster, server_lib.ClusterSpec):
-        cluster_spec = cluster
-      else:
-        raise ValueError(
-            "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
-            '`tf.train.ClusterDef` object')
-
-      self._workers = []
-      for job in sorted(cluster_spec.jobs):
-        for task in range(cluster_spec.num_tasks(job)):
-          self._workers.append('/job:%s/task:%d' % (job, task))
-
-    self._num_gpus_per_worker = num_gpus_per_worker
-    if num_gpus_per_worker > 0:
-      self._worker_device_map = {
-          worker: [
-              device_util.canonicalize(worker + '/device:GPU:%d' % gpu)
-              for gpu in range(num_gpus_per_worker)
-          ] for worker in self._workers
-      }
-    else:
-      self._worker_device_map = {
-          worker: [device_util.canonicalize(worker, '/device:CPU:0')]
-          for worker in self._workers
-      }
-    self._devices = nest.flatten(self._worker_device_map)
-
-    super(MultiWorkerMirroredStrategy, self).__init__(
-        devices=self._devices, prefetch_on_device=prefetch_on_device)
-
-    # Setting `_default_device` will add a device scope in the
-    # distribution.scope. We set the default device to the first worker. When
-    # users specify device under distribution.scope by
-    #   with tf.device("/cpu:0"):
-    #     ...
-    # their ops will end up on the cpu device of its first worker, e.g.
-    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
-    self._default_device = self._workers[0]
-
-  def distribute_dataset(self, dataset_fn):
-    return values.MultiWorkerDataset(
-        partial(self._call_dataset_fn, dataset_fn), self._worker_device_map,
-        self._prefetch_on_device)
diff --git a/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py b/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
deleted file mode 100644
index 09c859b32a..0000000000
--- a/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for MultiWorkerMirroredStrategy."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distribute.python import multi_worker_strategy
-from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import strategy_test_lib
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.training import server_lib
-
-
-class MultiWorkerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
-                              strategy_test_lib.DistributionTestBase):
-
-  def _get_distribution_strategy(self):
-    return multi_worker_strategy.MultiWorkerMirroredStrategy(
-        cluster=server_lib.ClusterSpec({
-            'worker': ['/job:worker/task:0', '/job:worker/task:1']
-        }),
-        num_gpus_per_worker=context.num_gpus())
-
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy())
-
-
-class DeviceScopeTest(test.TestCase):
-  """Test the device scope of MultiWorkerMirroredStrategy."""
-
-  def testDeviceScope(self):
-    with context.graph_mode():
-      strategy = multi_worker_strategy.MultiWorkerMirroredStrategy(
-          cluster={'worker': ['/job:worker/task:0', '/job:worker/task:1']},
-          num_gpus_per_worker=context.num_gpus())
-      with strategy.scope():
-        a = constant_op.constant(1.)
-        with ops.device('/cpu:0'):
-          b = constant_op.constant(1.)
-        self.assertEqual(a.device, '/job:worker/task:0')
-        self.assertEqual(b.device, '/job:worker/task:0/device:CPU:0')
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index 249de01f08..18b4503eff 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -23,26 +23,105 @@ import copy
 import threading
 import numpy as np
 
+_portpicker_import_error = None
+try:
+  import portpicker  # pylint: disable=g-import-not-at-top
+except ImportError as _error:  # pylint: disable=invalid-name
+  _portpicker_import_error = _error
+  portpicker = None
+
+# pylint: disable=g-import-not-at-top
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.estimator import run_config
 from tensorflow.python.platform import test
-from tensorflow.python.framework import test_util
-
-
-def create_in_process_cluster(num_workers, num_ps):
+from tensorflow.python.training import server_lib
+
+
+def _create_cluster(num_workers,
+                    num_ps,
+                    has_chief=False,
+                    has_eval=False,
+                    protocol='grpc',
+                    worker_config=None,
+                    ps_config=None):
+  """Creates and starts local servers and returns the cluster_spec dict."""
+  if _portpicker_import_error:
+    raise _portpicker_import_error  # pylint: disable=raising-bad-type
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+
+  cluster_dict = {}
+  if num_workers > 0:
+    cluster_dict['worker'] = ['localhost:%s' % port for port in worker_ports]
+  if num_ps > 0:
+    cluster_dict['ps'] = ['localhost:%s' % port for port in ps_ports]
+  if has_eval:
+    cluster_dict['evaluator'] = ['localhost:%s' % portpicker.pick_unused_port()]
+  if has_chief:
+    cluster_dict['chief'] = ['localhost:%s' % portpicker.pick_unused_port()]
+
+  cs = server_lib.ClusterSpec(cluster_dict)
+
+  for i in range(num_workers):
+    server_lib.Server(
+        cs,
+        job_name='worker',
+        protocol=protocol,
+        task_index=i,
+        config=worker_config,
+        start=True)
+
+  for i in range(num_ps):
+    server_lib.Server(
+        cs,
+        job_name='ps',
+        protocol=protocol,
+        task_index=i,
+        config=ps_config,
+        start=True)
+
+  if has_chief:
+    server_lib.Server(
+        cs,
+        job_name='chief',
+        protocol=protocol,
+        task_index=0,
+        config=worker_config,
+        start=True)
+
+  if has_eval:
+    server_lib.Server(
+        cs,
+        job_name='evaluator',
+        protocol=protocol,
+        task_index=0,
+        config=worker_config,
+        start=True)
+
+  return cluster_dict
+
+
+def create_in_process_cluster(num_workers,
+                              num_ps,
+                              has_chief=False,
+                              has_eval=False):
   """Create an in-process cluster that consists of only standard server."""
   # Leave some memory for cuda runtime.
-  gpu_mem_frac = 0.7 / num_workers
+  gpu_mem_frac = 0.7 / (num_workers + int(has_chief) + int(has_eval))
   worker_config = config_pb2.ConfigProto()
   worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
 
   # Enable collective ops which has no impact on non-collective ops.
   # TODO(yuefengz, tucker): removing this after we move the initialization of
   # collective mgr to the session level.
-  worker_config.experimental.collective_group_leader = (
-      '/job:worker/replica:0/task:0')
+  if has_chief:
+    worker_config.experimental.collective_group_leader = (
+        '/job:chief/replica:0/task:0')
+  else:
+    worker_config.experimental.collective_group_leader = (
+        '/job:worker/replica:0/task:0')
 
   ps_config = config_pb2.ConfigProto()
   ps_config.device_count['GPU'] = 0
@@ -56,9 +135,10 @@ def create_in_process_cluster(num_workers, num_ps):
   # 2) there is something global in CUDA such that if we initialize CUDA in the
   # parent process, the child process cannot initialize it again and thus cannot
   # use GPUs (https://stackoverflow.com/questions/22950047).
-  return test_util.create_local_cluster(
+  return _create_cluster(
       num_workers,
       num_ps=num_ps,
+      has_chief=has_chief,
       worker_config=worker_config,
       ps_config=ps_config,
       protocol='grpc')
@@ -70,7 +150,8 @@ class MultiWorkerTestBase(test.TestCase):
   @classmethod
   def setUpClass(cls):
     """Create a local cluster with 2 workers."""
-    cls._workers, cls._ps = create_in_process_cluster(num_workers=2, num_ps=0)
+    cls._cluster_spec = create_in_process_cluster(num_workers=2, num_ps=0)
+    cls._default_target = 'grpc://' + cls._cluster_spec['worker'][0]
 
   def setUp(self):
     # We only cache the session in one test because another test may have a
@@ -111,17 +192,17 @@ class MultiWorkerTestBase(test.TestCase):
     config.graph_options.rewrite_options.constant_folding = (
         rewriter_config_pb2.RewriterConfig.OFF)
 
+    if target is None:
+      target = self._default_target
     if graph is None:
       if getattr(self._thread_local, 'cached_session', None) is None:
         self._thread_local.cached_session = session.Session(
-            graph=None, config=config, target=target or self._workers[0].target)
+            graph=None, config=config, target=target)
       sess = self._thread_local.cached_session
       with sess.graph.as_default(), sess.as_default():
         yield sess
     else:
-      with session.Session(
-          graph=graph, config=config, target=target or
-          self._workers[0].target) as sess:
+      with session.Session(graph=graph, config=config, target=target) as sess:
         yield sess
 
   def _run_client(self, client_fn, task_type, task_id, num_gpus, *args,
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 016978cdb3..68561b5bbf 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -80,18 +80,30 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     def body(i, *args):
       """A wrapper around `fn` to create the while loop body."""
       del args
-      fn_result = fn(ctx, iterator.get_next())
+      fn_inputs = iterator.get_next()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, *fn_inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       with ops.control_dependencies([fn_result]):
         return [i + 1] + flat_last_step_outputs
 
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    # TODO(priyag): Use max_iterations instead of an explicit counter.
     cond = lambda i, *args: i < iterations
     i = constant_op.constant(0)
-    # TODO(priyag): Use max_iterations instead of an explicit counter.
     loop_result = control_flow_ops.while_loop(
         cond, body, [i] + initial_loop_values, name="",
         parallel_iterations=1, back_prop=False, swap_memory=False,
         return_same_structure=True)
+    del self._outer_control_flow_context
 
     ctx.run_op = control_flow_ops.group(loop_result)
 
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index a2d736e422..6e9ba37a19 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -51,7 +51,7 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
                 model_fn, iterator.get_next(), run_concurrently=layer.built)))
 
       if not context.executing_eagerly():
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
         self.evaluate(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 407c78df95..361c8be590 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -18,38 +18,25 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import json
-import os
-
 from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import values
-from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.eager import context
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import device_setter
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import server_lib
 from tensorflow.python.util import nest
 
 _LOCAL_CPU = "/device:CPU:0"
 _LOCAL_GPU_0 = "/device:GPU:0"
 
 
-def _normalize_cluster_spec(cluster_spec):
-  """Makes `cluster_spec` into a `ClusterSpec` object."""
-  if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
-    return server_lib.ClusterSpec(cluster_spec)
-  elif not isinstance(cluster_spec, server_lib.ClusterSpec):
-    raise ValueError(
-        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
-        "`tf.train.ClusterDef` object")
-  return cluster_spec
-
-
 # TODO(yuefengz): maybe cache variables on local CPU.
 # TODO(yuefengz): we may want to set session options to disallow communication
 # between workers.
@@ -70,7 +57,11 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   assigned to.
 
   This class assumes between-graph replication will be used and works on a graph
-  for a particular worker.
+  for a particular worker. Note that each graph and worker is independent.
+  This means that while each worker will synchronously compute a single gradient
+  update across all GPUs, updates between workers proceed asynchronously.
+  Operations that occur only on the first tower (such as incrementing the global
+  step), will occur on the first tower *of every worker*.
 
   It is expected to call `call_for_each_tower(fn, *args, **kwargs)` for any
   operations which potentially can be replicated across towers (i.e. multiple
@@ -88,7 +79,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   3) It is also not recommended to open a colocation scope (i.e. calling
   `tf.colocate_with`) under the strategy's scope. For colocating variables,
   use `distribution.colocate_vars_with` instead. Colocation of ops will possibly
-  create conflicts of device assignement.
+  create conflicts of device assignment.
   """
 
   def __init__(self,
@@ -96,7 +87,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
                cluster_spec=None,
                task_type=None,
                task_id=None):
-    """Initiailizes this strategy.
+    """Initializes this strategy.
 
     Args:
       num_gpus_per_worker: number of local GPUs or GPUs per worker.
@@ -104,11 +95,18 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
         cluster configurations.
       task_type: the current task type.
       task_id: the current task id.
+
+    Raises:
+      ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
+        not.
     """
     super(ParameterServerStrategy, self).__init__()
     self._num_gpus_per_worker = num_gpus_per_worker
     if cluster_spec:
-      cluster_spec = _normalize_cluster_spec(cluster_spec)
+      cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+      if task_type is None or task_id is None:
+        raise ValueError("When `cluster_spec` is given, must also specify "
+                         "`task_type` and `task_id`.")
     self._cluster_spec = cluster_spec
 
     # We typically don't need to do all-reduce in this strategy.
@@ -216,6 +214,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     else:
       self._default_device = self._worker_device
 
+    self._is_chief = cluster_spec is None or multi_worker_util.is_chief(
+        cluster_spec, task_type, task_id)
+
   def distribute_dataset(self, dataset_fn):
     """Distributes the dataset to each local GPU."""
     return values.PerDeviceDataset(
@@ -229,14 +230,57 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
   # this creator, such as "MutableHashTable".
   def _create_variable(self, next_creator, *args, **kwargs):
+    if self.num_towers > 1:
+      aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
+      if aggregation not in (
+          vs.VariableAggregation.NONE,
+          vs.VariableAggregation.SUM,
+          vs.VariableAggregation.MEAN
+      ):
+        raise ValueError("Invalid variable aggregation mode: " + aggregation +
+                         " for variable: " + kwargs["name"])
+
+      def var_creator(*args, **kwargs):
+        # Record what collections this variable should be added to.
+        collections = kwargs.pop("collections", None)
+        if collections is None:
+          collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+        kwargs["collections"] = []
+
+        # Create and wrap the variable.
+        v = next_creator(*args, **kwargs)
+        wrapped = values.AggregatingVariable(v, aggregation)
+
+        # Add the wrapped variable to the requested collections.
+        # The handling of eager mode and the global step matches
+        # ResourceVariable._init_from_args().
+        if not context.executing_eagerly():
+          g = ops.get_default_graph()
+          # If "trainable" is True, next_creator() will add the contained
+          # variable to the TRAINABLE_VARIABLES collection, so we manually
+          # remove it and replace with the wrapper. We can't set "trainable"
+          # to False for next_creator() since that causes functions like
+          # implicit_gradients to skip those variables.
+          if kwargs.get("trainable", True):
+            collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+            l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+            l.remove(v)
+          g.add_to_collections(collections, wrapped)
+        elif ops.GraphKeys.GLOBAL_STEP in collections:
+          ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
+
+        return wrapped
+    else:
+      var_creator = next_creator
+
     if "colocate_with" in kwargs:
       with ops.device(None):
         with ops.colocate_with(kwargs["colocate_with"]):
-          return next_creator(*args, **kwargs)
+          return var_creator(*args, **kwargs)
 
     with ops.colocate_with(None, ignore_existing=True):
       with ops.device(self._variable_device):
-        return next_creator(*args, **kwargs)
+        return var_creator(*args, **kwargs)
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
     # pylint: disable=protected-access
@@ -258,7 +302,6 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
       # pylint: disable=protected-access
       return mirrored_strategy._reduce_non_distributed_value(
           self, aggregation, value, destinations)
-
     return self._cross_tower_ops.reduce(
         aggregation, value, destinations=destinations)
 
@@ -291,6 +334,8 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     return nest.map_structure(_select_fn, structured)
 
   def _update(self, var, fn, *args, **kwargs):
+    if isinstance(var, values.AggregatingVariable):
+      var = var.get()
     if not isinstance(var, resource_variable_ops.ResourceVariable):
       raise ValueError(
           "You can not update `var` %r. It must be a Variable." % var)
@@ -319,26 +364,38 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     # No need to distinguish between normal variables and tower-local variables.
     return array_ops.identity(var)
 
-  def configure(self, session_config=None):
-    del session_config
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    """Configures the strategy class.
 
-    # Use TF_CONFIG to get the cluster spec and the current job.
-    tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
-    cluster_spec = _normalize_cluster_spec(tf_config.get("cluster", {}))
+    The strategy object will be re-initialized if `cluster_spec` is given but
+    was not passed in the constructor.
 
-    task_env = tf_config.get("task", {})
-    if task_env:
-      task_type = task_env.get("type", "worker")
-      task_id = int(task_env.get("index", "0"))
-    else:
-      task_type = "worker"
-      task_id = None
+    Args:
+      session_config: not used currently.
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type.
+      task_id: the current task id.
+
+    Raises:
+      ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
+        not.
+    """
+    del session_config
 
     # Set the devices if cluster_spec is defined in TF_CONFIG but not passed in
     # the constructor.
     if not self._cluster_spec and cluster_spec:
-      self._cluster_spec = cluster_spec
-      self._initialize_devices(self._num_gpus_per_worker, cluster_spec,
+      self._cluster_spec = multi_worker_util.normalize_cluster_spec(
+          cluster_spec)
+      if task_type is None or task_id is None:
+        raise ValueError("When `cluster_spec` is given, must also specify "
+                         "`task_type` and `task_id`.")
+      self._initialize_devices(self._num_gpus_per_worker, self._cluster_spec,
                                task_type, task_id)
 
   @property
@@ -356,3 +413,19 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
   def non_slot_devices(self, var_list):
     return min(var_list, key=lambda x: x.name)
+
+  @property
+  def between_graph(self):
+    return True
+
+  @property
+  def should_init(self):
+    return self._is_chief
+
+  @property
+  def should_checkpoint(self):
+    return self._is_chief
+
+  @property
+  def should_save_summary(self):
+    return self._is_chief
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index cf29c0ed91..0e2bfcec5f 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -18,13 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import json
 import threading
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import parameter_server_strategy
+from tensorflow.contrib.distribute.python import values
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
 from tensorflow.python.framework import constant_op
@@ -37,22 +38,16 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.training import training_util
 
+CHIEF = run_config.TaskType.CHIEF
+WORKER = run_config.TaskType.WORKER
+PS = run_config.TaskType.PS
 
-class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
-                                  parameterized.TestCase):
 
-  @classmethod
-  def setUpClass(cls):
-    cls._workers, cls._ps = multi_worker_test_base.create_in_process_cluster(
-        num_workers=3, num_ps=2)
-    cls._cluster_spec = {
-        run_config.TaskType.WORKER: [
-            'fake_worker_0', 'fake_worker_1', 'fake_worker_2'
-        ],
-        run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
-    }
+class ParameterServerStrategyTestBase(
+    multi_worker_test_base.MultiWorkerTestBase):
 
   def setUp(self):
     self._result = 0
@@ -61,7 +56,7 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
     self._init_reached = 0
     self._finish_condition = threading.Condition()
     self._finish_reached = 0
-    super(ParameterServerStrategyTest, self).setUp()
+    super(ParameterServerStrategyTestBase, self).setUp()
 
   def _get_test_objects(self, task_type, task_id, num_gpus):
     distribution = parameter_server_strategy.ParameterServerStrategy(
@@ -69,26 +64,15 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
     if not task_type:
       return distribution, ''
 
-    tf_config = {
-        'cluster': self._cluster_spec,
-        'task': {
-            'type': task_type,
-            'index': task_id
-        }
-    }
-    with self._lock:
-      # Accessing environment variables should be protected by locks because
-      # environment variables are shared by all threads.
-      with test.mock.patch.dict('os.environ',
-                                {'TF_CONFIG': json.dumps(tf_config)}):
-        distribution.configure()
-    return distribution, self._workers[task_id].target
+    distribution.configure(
+        cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id)
+    return distribution, 'grpc://' + self._cluster_spec[WORKER][task_id]
 
   def _test_device_assignment_distributed(self, task_type, task_id, num_gpus):
     worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id)
     d, _ = self._get_test_objects(task_type, task_id, num_gpus)
     with ops.Graph().as_default(), \
-         self.test_session(target=self._workers[0].target) as sess, \
+         self.test_session(target=self._default_target) as sess, \
          d.scope():
 
       # Define a variable outside the call_for_each_tower scope. This is not
@@ -101,7 +85,8 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
           last_part_device = 'device:CPU:0'
         else:
           last_part_device = (
-              'device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+              'device:GPU:%d' %
+              distribution_strategy_context.get_tower_context().tower_id)
 
         a = constant_op.constant(1.0)
         b = constant_op.constant(2.0)
@@ -112,7 +97,9 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
 
         # The device scope is ignored for variables but not for normal ops.
         with ops.device('/job:worker/task:0'):
-          x = variable_scope.get_variable('x', initializer=10.0)
+          x = variable_scope.get_variable(
+              'x', initializer=10.0,
+              aggregation=variable_scope.VariableAggregation.SUM)
           x_add = x.assign_add(c)
           e = a + c
         # The variable x is on the task 1 since the device_function has been
@@ -124,18 +111,26 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
 
         # The colocate_vars_with can override the distribution's device.
         with d.colocate_vars_with(x):
-          y = variable_scope.get_variable('y', initializer=20.0)
-        y_add = y.assign_add(x_add)
+          y = variable_scope.get_variable(
+              'y', initializer=20.0,
+              aggregation=variable_scope.VariableAggregation.SUM)
+        # We add an identity here to avoid complaints about summing
+        # non-distributed values.
+        y_add = y.assign_add(array_ops.identity(x_add))
         self.assertEqual(y.device, '/job:ps/task:1')
         self.assertEqual(y_add.device, y.device)
         self.assertEqual(y.device, x.device)
 
-        z = variable_scope.get_variable('z', initializer=10.0)
+        z = variable_scope.get_variable(
+            'z', initializer=10.0,
+            aggregation=variable_scope.VariableAggregation.SUM)
         self.assertEqual(z.device, '/job:ps/task:0')
         self.assertNotEqual(z.device, x.device)
 
         with ops.control_dependencies([y_add]):
-          z_add = z.assign_add(y)
+          # We add an identity here to avoid complaints about summing
+          # non-distributed values.
+          z_add = z.assign_add(array_ops.identity(y))
         with ops.control_dependencies([z_add]):
           f = z + c
         self.assertEqual(f.device, worker_device + '/' + last_part_device)
@@ -173,18 +168,13 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
         self.assertEqual(z_val, 43.0)
         self.assertEqual(f_val, 46.0)
 
-  @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testDeviceAssignmentDistributed(self, num_gpus):
-    self._test_device_assignment_distributed('worker', 1, num_gpus)
-
   def _test_device_assignment_local(self,
                                     d,
                                     compute_device='CPU',
                                     variable_device='CPU',
                                     num_gpus=0):
     with ops.Graph().as_default(), \
-         self.test_session(target=self._workers[0].target) as sess, \
+         self.test_session(target=self._default_target) as sess, \
          d.scope():
 
       def model_fn():
@@ -192,14 +182,16 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
           tower_compute_device = '/device:CPU:0'
         else:
           tower_compute_device = (
-              '/device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+              '/device:GPU:%d' %
+              distribution_strategy_context.get_tower_context().tower_id)
         tower_compute_device = device_util.canonicalize(tower_compute_device)
 
         if 'CPU' in variable_device:
           tower_variable_device = '/device:CPU:0'
         else:
           tower_variable_device = (
-              '/device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+              '/device:GPU:%d' %
+              distribution_strategy_context.get_tower_context().tower_id)
         tower_variable_device = device_util.canonicalize(tower_variable_device)
 
         a = constant_op.constant(1.0)
@@ -211,7 +203,9 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
 
         # The device scope is ignored for variables but not for normal ops.
         with ops.device('/device:GPU:2'):
-          x = variable_scope.get_variable('x', initializer=10.0)
+          x = variable_scope.get_variable(
+              'x', initializer=10.0,
+              aggregation=variable_scope.VariableAggregation.SUM)
           x_add = x.assign_add(c)
           e = a + c
         self.assertEqual(
@@ -221,19 +215,27 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
 
         # The colocate_vars_with can override the distribution's device.
         with d.colocate_vars_with(x):
-          y = variable_scope.get_variable('y', initializer=20.0)
-        y_add = y.assign_add(x_add)
+          y = variable_scope.get_variable(
+              'y', initializer=20.0,
+              aggregation=variable_scope.VariableAggregation.SUM)
+        # We add an identity here to avoid complaints about summing
+        # non-distributed values.
+        y_add = y.assign_add(array_ops.identity(x_add))
         self.assertEqual(
             device_util.canonicalize(y.device), tower_variable_device)
         self.assertEqual(y_add.device, y.device)
         self.assertEqual(y.device, x.device)
 
-        z = variable_scope.get_variable('z', initializer=10.0)
+        z = variable_scope.get_variable(
+            'z', initializer=10.0,
+            aggregation=variable_scope.VariableAggregation.SUM)
         self.assertEqual(
             device_util.canonicalize(z.device), tower_variable_device)
 
         with ops.control_dependencies([y_add]):
-          z_add = z.assign_add(y)
+          # We add an identity here to avoid complaints about summing
+          # non-distributed values.
+          z_add = z.assign_add(array_ops.identity(y))
         with ops.control_dependencies([z_add]):
           f = z + c
         self.assertEqual(f.device, tower_compute_device)
@@ -265,29 +267,12 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
         self.assertEqual(z_val, 43.0)
         self.assertEqual(f_val, 46.0)
 
-  def testDeviceAssignmentLocalCPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=0)
-    self._test_device_assignment_local(
-        distribution, compute_device='CPU', variable_device='CPU', num_gpus=0)
-
-  def testDeviceAssignmentLocalOneGPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=1)
-    self._test_device_assignment_local(
-        distribution, compute_device='GPU', variable_device='GPU', num_gpus=1)
-
-  def testDeviceAssignmentLocalTwoGPUs(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    self._test_device_assignment_local(
-        distribution, compute_device='GPU', variable_device='CPU', num_gpus=2)
-
   def _test_simple_increment(self, task_type, task_id, num_gpus):
     d, master_target = self._get_test_objects(task_type, task_id, num_gpus)
     if hasattr(d, '_cluster_spec') and d._cluster_spec:
-      num_workers = len(d._cluster_spec.as_dict().get('worker',
-                                                      ['dummy_worker']))
+      num_workers = len(d._cluster_spec.as_dict().get(WORKER))
+      if 'chief' in d._cluster_spec.as_dict():
+        num_workers += 1
     else:
       num_workers = 1
     with ops.Graph().as_default(), \
@@ -295,11 +280,18 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
          d.scope():
 
       def model_fn():
-        x = variable_scope.get_variable('x', initializer=10.0)
-        y = variable_scope.get_variable('y', initializer=20.0)
-
-        x_add = x.assign_add(1.0, use_locking=True)
-        y_add = y.assign_add(1.0, use_locking=True)
+        x = variable_scope.get_variable(
+            'x', initializer=10.0,
+            aggregation=variable_scope.VariableAggregation.SUM)
+        y = variable_scope.get_variable(
+            'y', initializer=20.0,
+            aggregation=variable_scope.VariableAggregation.SUM)
+
+        # We explicitly make a constant tensor here to avoid complaints about
+        # summing non-distributed values.
+        one = constant_op.constant(1.0)
+        x_add = x.assign_add(one, use_locking=True)
+        y_add = y.assign_add(one, use_locking=True)
 
         train_op = control_flow_ops.group([x_add, y_add])
         return x, y, train_op
@@ -339,6 +331,11 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
 
   def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
     d, master_target = self._get_test_objects(task_type, task_id, num_gpus)
+    assert hasattr(d, '_cluster_spec') and d._cluster_spec
+    num_workers = len(d._cluster_spec.as_dict().get(WORKER))
+    if CHIEF in d._cluster_spec.as_dict():
+      num_workers += 1
+
     with ops.Graph().as_default(), \
          self.test_session(target=master_target) as sess, \
          d.scope():
@@ -387,13 +384,13 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
       if context.num_gpus() < d._num_gpus_per_worker:
         return True
 
-      if task_id == 0:
+      if multi_worker_util.is_chief(d._cluster_spec, task_type, task_id):
         variables.global_variables_initializer().run()
 
       # Workers waiting for chief worker's initializing variables.
       self._init_condition.acquire()
       self._init_reached += 1
-      while self._init_reached != 3:
+      while self._init_reached != num_workers:
         self._init_condition.wait()
       self._init_condition.notify_all()
       self._init_condition.release()
@@ -410,9 +407,42 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
+
+class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
+                                  parameterized.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
+        num_workers=3, num_ps=2)
+    cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]
+
+  def testDeviceAssignmentLocalCPU(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=0)
+    self._test_device_assignment_local(
+        distribution, compute_device='CPU', variable_device='CPU', num_gpus=0)
+
+  def testDeviceAssignmentLocalOneGPU(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=1)
+    self._test_device_assignment_local(
+        distribution, compute_device='GPU', variable_device='GPU', num_gpus=1)
+
+  def testDeviceAssignmentLocalTwoGPUs(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_device_assignment_local(
+        distribution, compute_device='GPU', variable_device='CPU', num_gpus=2)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testDeviceAssignmentDistributed(self, num_gpus):
+    self._test_device_assignment_distributed('worker', 1, num_gpus)
+
   def testSimpleBetweenGraph(self):
     self._run_between_graph_clients(self._test_simple_increment,
-                                    self._cluster_spec, 0)
+                                    self._cluster_spec, context.num_gpus())
 
   @combinations.generate(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
@@ -426,5 +456,38 @@ class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase,
                                     self._cluster_spec, num_gpus)
 
 
+class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
+                                           parameterized.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
+        num_workers=3, num_ps=2, has_chief=True)
+    cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0]
+
+  def testSimpleBetweenGraph(self):
+    self._run_between_graph_clients(self._test_simple_increment,
+                                    self._cluster_spec, context.num_gpus())
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testMinimizeLossGraph(self, num_gpus):
+    self._run_between_graph_clients(self._test_minimize_loss_graph,
+                                    self._cluster_spec, num_gpus)
+
+  def testGlobalStepIsWrapped(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    with ops.Graph().as_default(), distribution.scope():
+      created_step = training_util.create_global_step()
+      get_step = training_util.get_global_step()
+      self.assertEqual(created_step, get_step,
+                       msg=('created_step %s type %s vs. get_step %s type %s' %
+                            (id(created_step), created_step.__class__.__name__,
+                             id(get_step), get_step.__class__.__name__)))
+      self.assertIs(values.AggregatingVariable, type(created_step))
+      self.assertIs(values.AggregatingVariable, type(get_step))
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
index a68dbce6c7..bb10b546a1 100644
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
@@ -37,7 +37,7 @@ class PrefetchingOpsV2Test(test.TestCase):
     iterator = device_dataset.make_one_shot_iterator()
     next_element = iterator.get_next()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for i in range(10):
         self.assertEqual(i, sess.run(next_element))
       with self.assertRaises(errors.OutOfRangeError):
@@ -55,7 +55,7 @@ class PrefetchingOpsV2Test(test.TestCase):
     next_element = iterator.get_next()
 
     output = []
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for _ in range(5):
         result = sess.run(next_element)
         self.assertEqual(2, len(result))
@@ -75,7 +75,7 @@ class PrefetchingOpsV2Test(test.TestCase):
     iterator = device_dataset.make_initializable_iterator()
     next_element = iterator.get_next()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(iterator.initializer)
       for _ in range(5):
         sess.run(next_element)
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index d3611570b4..1b5a4f64e5 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -90,14 +90,14 @@ class StandardSingleLossStep(StandardInputStep):
 
   def __call__(self):
     with self._distribution.scope():
-      def step_fn(ctx, inputs):
+      def step_fn(ctx, *inputs):
         """Function to run one iteration with one input."""
         gradients_fn = backprop.implicit_grad(self._loss_fn)
         gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
 
         grads_and_vars = self.distribution.call_for_each_tower(
             gradients_fn,
-            ctx, inputs,
+            ctx, *inputs,
             run_concurrently=self._is_run_concurrently)
         # If threads use layers, then we need to run the first step
         # sequentially, so that layers.build() is not executed in parallel.
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
index 8605ab1f7d..f1ada49fa3 100644
--- a/tensorflow/contrib/distribute/python/step_fn_test.py
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -49,7 +49,7 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase):
       if context.executing_eagerly():
         run_step = single_loss_step
       else:
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           run_step = sess.make_callable(single_loss_step())
       self.evaluate(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index baed0ebaae..6ee26e19ac 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -28,7 +28,7 @@ from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer
 
 
@@ -45,7 +45,8 @@ def _raise_exception_fn(_=None):
 # Must be the argument to a distribution.call_for_each_tower() call, calls a
 # get_tower_context().merge_call() that raises an exception.
 def _merge_raises_fn():
-  distribute_lib.get_tower_context().merge_call(_raise_exception_fn)
+  distribution_strategy_context.get_tower_context().merge_call(
+      _raise_exception_fn)
 
 
 # Must be the argument to a get_tower_context().merge_call() call, calls
@@ -58,7 +59,7 @@ def _call_raises_fn(dist):
 # calls a get_tower_context().merge_call() that calls a
 # call_for_each_tower() that raises an exception.
 def _merge_call_raises_fn():
-  distribute_lib.get_tower_context().merge_call(_call_raises_fn)
+  distribution_strategy_context.get_tower_context().merge_call(_call_raises_fn)
 
 
 # Must be the argument to a get_tower_context().merge_call() call, calls
@@ -72,7 +73,8 @@ def _call_merge_raises_fn(dist):
 # get_tower_context().merge_call() that calls a call_for_each_tower() that
 # calls a get_tower_context().merge_call() that raises an exception.
 def _merge_call_merge_raises_fn():
-  distribute_lib.get_tower_context().merge_call(_call_merge_raises_fn)
+  distribution_strategy_context.get_tower_context().merge_call(
+      _call_merge_raises_fn)
 
 
 class DistributionTestBase(test.TestCase):
@@ -128,7 +130,8 @@ class DistributionTestBase(test.TestCase):
       # Error should go down
       self.assertLess(error_after, error_before)
 
-  def _test_minimize_loss_graph(self, d, soft_placement=False):
+  def _test_minimize_loss_graph(self, d, soft_placement=False,
+                                learning_rate=0.2):
     config = config_pb2.ConfigProto()
     config.allow_soft_placement = soft_placement
     config.gpu_options.per_process_gpu_memory_fraction = 0.3
@@ -148,7 +151,7 @@ class DistributionTestBase(test.TestCase):
       grad_fn = backprop.implicit_grad(loss)
 
       def update(v, g):
-        return v.assign_sub(0.2 * g)
+        return v.assign_sub(learning_rate * g)
 
       one = d.broadcast(constant_op.constant([[1.]]))
 
@@ -208,7 +211,7 @@ class DistributionTestBase(test.TestCase):
       expected_devices = [False] * len(d.worker_devices)
 
       def mark_devices_fn():
-        tower_id = distribute_lib.get_tower_context().tower_id
+        tower_id = distribution_strategy_context.get_tower_context().tower_id
         self.assertLess(tower_id, len(d.worker_devices))
         self.assertFalse(expected_devices[tower_id])
         expected_devices[tower_id] = True
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index b510fdb888..6202a0750a 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -37,7 +37,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import device_util
-from tensorflow.python.training import server_lib
 from tensorflow.python.util import nest
 
 
@@ -46,13 +45,13 @@ def get_tpu_system_metadata(tpu_cluster_resolver):
   master = tpu_cluster_resolver.master()
 
   # pylint: disable=protected-access
-  cluster_def = (tpu_cluster_resolver.cluster_spec()
-                 or server_lib.ClusterSpec({})).as_cluster_def()
+  cluster_spec = tpu_cluster_resolver.cluster_spec()
+  cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
   tpu_system_metadata = (
       tpu_system_metadata_lib._query_tpu_system_metadata(
           master,
           cluster_def=cluster_def,
-          query_topology=True))
+          query_topology=False))
 
   return tpu_system_metadata
 
@@ -60,7 +59,7 @@ def get_tpu_system_metadata(tpu_cluster_resolver):
 class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   """Experimental TPU distribution strategy implementation."""
 
-  def __init__(self, tpu_cluster_resolver, steps_per_run):
+  def __init__(self, tpu_cluster_resolver, steps_per_run, num_cores=None):
     """Initializes the TPUStrategy object.
 
     Args:
@@ -71,6 +70,8 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
           metrics, summaries etc.
           This parameter is only used when Distribution Strategy is used with
           estimator or keras.
+      num_cores: Number of cores to use on the TPU. If None specified, then
+          auto-detect the cores and topology of the TPU system.
     """
     # TODO(isaprykin): Generalize the defaults.  They are currently tailored for
     # the unit test.
@@ -78,13 +79,15 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
 
     self._tpu_cluster_resolver = tpu_cluster_resolver
     self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
+    self._num_cores_override = num_cores
 
-    # TODO(priyag): This should not be hardcoded here.
-    self._host = '/device:CPU:0'
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
 
+    # TODO(frankchn): This should not be hardcoded here for pod purposes.
+    self._host = self.tpu_host_cpu_device(0)
+
   def distribute_dataset(self, dataset_fn):
     # TODO(priyag): Perhaps distribute across cores here.
     return self._call_dataset_fn(dataset_fn)
@@ -107,6 +110,7 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       """Enqueue ops for one iteration."""
       control_deps = []
       sharded_inputs = []
+      # TODO(sourabhbajaj): Add support for TPU pods
       with ops.device(self._host):
         for _ in range(self.num_towers):
           # Use control dependencies to ensure a deterministic ordering.
@@ -144,7 +148,10 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     ctx = values.MultiStepContext()
     def run_fn(*args, **kwargs):
       del args, kwargs
-      fn_result = fn(ctx, dequeue_fn())
+      fn_inputs = dequeue_fn()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, *fn_inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       if flat_last_step_outputs:
         with ops.control_dependencies([fn_result]):
@@ -157,8 +164,18 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     def iterate_on_tpu():
       return training_loop.repeat(iterations, run_fn, initial_loop_values)
 
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop and TPU replicate context. This is useful in cases
+    # where we might need to exit these contexts and get back to the outer
+    # context to do some things, for e.g. create an op which should be
+    # evaluated only once at the end of the loop on the host. One such usage
+    # is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
     replicate_inputs = [[]] * self.num_towers
     replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
+    del self._outer_control_flow_context
     ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
 
     # Filter out any ops from the outputs, typically this would be the case
@@ -246,4 +263,10 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
 
   @property
   def num_towers(self):
-    return self._tpu_metadata.num_of_cores_per_host
+    return self._num_cores_override or self._tpu_metadata.num_cores
+
+  def tpu_host_cpu_device(self, host_id):
+    if self._tpu_cluster_resolver.get_master() in ('', 'local'):
+      return '/replica:0/task:0/device:CPU:0'
+    return '/job:%s/task:%d/device:CPU:0' % ('tpu_worker', host_id)
+
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 5fd4c9de69..3ccaa2690e 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import device_util
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import saver
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
@@ -56,7 +57,7 @@ class DistributedValues(object):
   def get(self, device=None):
     """Returns the value for the current device or raises a ValueError."""
     if device is None:
-      tower_context = distribute_lib.get_tower_context()
+      tower_context = distribution_strategy_context.get_tower_context()
       if tower_context:
         device = tower_context.device
       else:
@@ -182,6 +183,14 @@ class Mirrored(DistributedDelegate):
       return self._index[device]
     return list(self._index.values())[0]
 
+  def _as_graph_element(self):
+    obj = self.get()
+    # pylint: disable=protected-access
+    conv_fn = getattr(obj, "_as_graph_element", None)
+    if conv_fn and callable(conv_fn):
+      return conv_fn()
+    return obj
+
 
 def _assign_on_device(device, variable, tensor):
   with ops.device(device):
@@ -289,14 +298,19 @@ class DistributedVariable(DistributedDelegate):
     # We want cross-tower code that does some var.op.X calls
     # to work (even if the current device isn't in self.devices), but
     # other uses of var.op in a cross-tower context to fail.
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       return DistributedVarOp(self._primary_var.op.name,
                               self._primary_var.op.graph,
                               self._primary_var.op.type)
     return self.get().op
 
+  @property
+  def _in_graph_mode(self):
+    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
+
   def read_value(self):
-    return distribute_lib.get_distribution_strategy().read_var(self)
+    return distribution_strategy_context.get_distribution_strategy().read_var(
+        self)
 
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
@@ -306,26 +320,6 @@ class DistributedVariable(DistributedDelegate):
 ops.register_dense_tensor_like_type(DistributedVariable)
 
 
-def _get_update_device():
-  """Validate we are in update/update_non_slot() and return current device.
-
-  This is used in MirroredVariable.assign* members, to make sure they
-  are only called via an update method, to make sure all components of the
-  variable are being updated in a consistent way.
-
-  Returns:
-    A string device.
-
-  Raises:
-    RuntimeError: If not in distribution.update()/.update_non_slot().
-  """
-  device = distribute_lib.get_update_device()
-  if device is None:
-    raise RuntimeError(
-        "Use DistributionStrategy.update() to modify a MirroredVariable.")
-  return device
-
-
 class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
   """Class for defining how to restore a MirroredVariable."""
 
@@ -362,17 +356,29 @@ class MirroredVariable(DistributedVariable, Mirrored,
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
     f = kwargs.pop("f")
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       update_device = distribute_lib.get_update_device()
-      # We are calling update on the mirrored variable in cross tower context.
       if update_device is not None:
-        # We are calling an assign function on the mirrored variable in cross
-        # tower context.
+        # We are calling an assign function on the mirrored variable in an
+        # update context.
         v = self.get(device=update_device)
         return f(v, *args, **kwargs)
 
-      return distribute_lib.get_distribution_strategy().update(
-          self, f, *args, **kwargs)
+      # We are calling assign on the mirrored variable in cross tower context,
+      # use update to update the variable.
+      strategy = distribution_strategy_context.get_distribution_strategy()
+      updates = strategy.update(self, f, *args, **kwargs)
+      grouped = strategy.group(updates)
+      if isinstance(updates, DistributedValues) and updates.is_tensor_like:
+        # Make sure we run all updates. Without this, something like
+        # session.run(mirrored_var.assign*(...)) may only update one tower.
+        index = {}
+        for d in updates.devices:
+          with ops.device(d), ops.control_dependencies([grouped]):
+            index[d] = array_ops.identity(updates.get(d))
+        return Mirrored(index)
+      else:
+        return grouped
     else:
       _assert_tower_context()
       # We are calling an assign function on the mirrored variable in tower
@@ -392,8 +398,8 @@ class MirroredVariable(DistributedVariable, Mirrored,
                 aggregation=self._aggregation, value=value, destinations=self),
             *other_args, **other_kwargs)
 
-      return distribute_lib.get_tower_context().merge_call(merge_fn, *args,
-                                                           **kwargs)
+      return distribution_strategy_context.get_tower_context().merge_call(
+          merge_fn, *args, **kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
@@ -419,7 +425,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       return self._primary_var._as_graph_element()
     return self.get()._as_graph_element()
 
@@ -459,7 +465,7 @@ class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
     # We use a callable so that we don't have to evaluate this expression
     # in the case where we are trying to restore instead of save.
     def tensor():
-      return distribute_lib.get_distribution_strategy().read_var(
+      return distribution_strategy_context.get_distribution_strategy().read_var(
           tower_local_variable)
     spec = saver.BaseSaverBuilder.SaveSpec(
         tensor=tensor,
@@ -475,7 +481,7 @@ class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
 
 
 def _assert_tower_context():
-  if not distribute_lib.get_tower_context():
+  if not distribution_strategy_context.get_tower_context():
     raise RuntimeError(
         "Tower-local variables may only be assigned in a tower context.")
 
@@ -498,7 +504,7 @@ class TowerLocalVariable(DistributedVariable, PerDevice,
     return self.get().assign_add(*args, **kwargs)
 
   def assign(self, *args, **kwargs):
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       # To preserve the sum across save and restore, we have to divide the
       # total across all devices when restoring a variable that was summed
       # when saving.
@@ -526,7 +532,7 @@ class TowerLocalVariable(DistributedVariable, PerDevice,
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       return self._get_cross_tower()
     return self.get()._as_graph_element()
 
@@ -994,12 +1000,12 @@ class MultiStepContext(object):
         outputs as already reduced or not.
 
     """
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       self._last_step_outputs_aggregations[name] = aggregation
       if aggregation is variables_lib.VariableAggregation.NONE:
         self._last_step_outputs[name] = output
       else:
-        distribution = distribute_lib.get_distribution_strategy()
+        distribution = distribution_strategy_context.get_distribution_strategy()
         self._last_step_outputs[name] = distribution.reduce(
             aggregation, output, destinations="/device:CPU:0")
     else:
@@ -1011,7 +1017,9 @@ class MultiStepContext(object):
         # context object, so it's more robust to set it only once (even if all
         # the towers are trying to set the same value).
         self._last_step_outputs_aggregations[name] = aggregation
-      distribute_lib.get_tower_context().merge_call(merge_fn, output)
+
+      distribution_strategy_context.get_tower_context().merge_call(
+          merge_fn, output)
 
   @property
   def non_tensor_outputs(self):
@@ -1020,14 +1028,15 @@ class MultiStepContext(object):
 
   def set_non_tensor_output(self, name, output):
     """Set `output` with `name` to be captured as a non tensor output."""
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       self._non_tensor_outputs[name] = output
     else:
       def merge_fn(distribution, value):
         # NOTE(priyag): For non tensor outputs, we simply return all the values
         # in a list as aggregation doesn't make sense on non tensors.
         self._non_tensor_outputs[name] = distribution.unwrap(value)
-      distribute_lib.get_tower_context().merge_call(merge_fn, output)
+      distribution_strategy_context.get_tower_context().merge_call(
+          merge_fn, output)
 
 
 def value_container(val):
@@ -1052,3 +1061,160 @@ def value_container(val):
     if container is not None:
       return container
   return val
+
+
+# TODO(josh11b): Descend from Variable.
+class AggregatingVariable(checkpointable.CheckpointableBase):
+  """A wrapper around a variable that aggregates updates across towers."""
+
+  def __init__(self, v, aggregation):
+    self._v = v
+    # TODO(josh11b): Set v._distributed_container?
+    # v._distributed_container = weakref.ref(self)  # pylint: disable=protected-access
+    self._aggregation = aggregation
+
+  def get(self):
+    return self._v
+
+  def __getattr__(self, name):
+    return getattr(self._v, name)
+
+  def _assign_func(self, *args, **kwargs):
+    f = kwargs.pop("f")
+    if distribution_strategy_context.get_cross_tower_context():
+      update_device = distribute_lib.get_update_device()
+      if update_device is not None:
+        # We are calling an assign function in an update context.
+        return f(self._v, *args, **kwargs)
+
+      # We are calling an assign function in cross tower context, wrap it in an
+      # update call.
+      return distribution_strategy_context.get_distribution_strategy().update(
+          self, f, *args, **kwargs)
+    else:
+      assert distribution_strategy_context.get_tower_context()
+      # We are calling an assign function in tower context.
+      # We reduce the value we want to assign/add/sub. More details about how we
+      # handle the different use cases can be found in the _reduce method.
+      # We call the function with the reduced value.
+      if self._aggregation == vs.VariableAggregation.NONE:
+        raise ValueError("You must specify an aggregation method to update a "
+                         "a variable in Tower Context.")
+
+      def merge_fn(strategy, value, *other_args, **other_kwargs):
+        return strategy.update(
+            self, f,
+            strategy.reduce(
+                aggregation=self._aggregation, value=value, destinations=self),
+            *other_args, **other_kwargs)
+
+      return distribution_strategy_context.get_tower_context().merge_call(
+          merge_fn, *args, **kwargs)
+
+  def assign_sub(self, *args, **kwargs):
+    assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
+    return self._assign_func(f=assign_sub_fn, *args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw)
+    return self._assign_func(f=assign_add_fn, *args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    assign_fn = lambda var, *a, **kw: var.assign(*a, **kw)
+    return self._assign_func(f=assign_fn, *args, **kwargs)
+
+  @property
+  def aggregation(self):
+    return self._aggregation
+
+  @property
+  def name(self):
+    return self._v.name
+
+  @property
+  def dtype(self):
+    return self._v.dtype
+
+  # TODO(josh11b): Test saving & restoring.
+  def _gather_saveables_for_checkpoint(self):
+    return {checkpointable.VARIABLE_VALUE_KEY: self._v}
+
+  # pylint: disable=multiple-statements
+  def __add__(self, o): return self._v + o
+  def __radd__(self, o): return o + self._v
+  def __sub__(self, o): return self._v - o
+  def __rsub__(self, o): return o - self._v
+  def __mul__(self, o): return self._v * o
+  def __rmul__(self, o): return o * self._v
+  def __truediv__(self, o): return self._v / o
+  def __rtruediv__(self, o): return o / self._v
+  def __floordiv__(self, o): return self._v // o
+  def __rfloordiv__(self, o): return o // self._v
+  def __mod__(self, o): return self._v % o
+  def __rmod__(self, o): return o % self._v
+  def __lt__(self, o): return self._v < o
+  def __le__(self, o): return self._v <= o
+  def __gt__(self, o): return self._v > o
+  def __ge__(self, o): return self._v >= o
+  def __and__(self, o): return self._v & o
+  def __rand__(self, o): return o & self._v
+  def __or__(self, o): return self._v | o
+  def __ror__(self, o): return o | self._v
+  def __xor__(self, o): return self._v ^ o
+  def __rxor__(self, o): return o ^ self._v
+  def __getitem__(self, o): return self._v[o]
+  def __pow__(self, o, modulo=None): return pow(self._v, o, modulo)
+  def __rpow__(self, o): return pow(o, self._v)
+  def __invert__(self): return ~self._v
+  def __neg__(self): return -self._v
+  def __abs__(self): return abs(self._v)
+
+  def __div__(self, o):
+    try:
+      return self._v.__div__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rdiv__(self, o):
+    try:
+      return self._v.__rdiv__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __matmul__(self, o):
+    try:
+      return self._v.__matmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rmatmul__(self, o):
+    try:
+      return self._v.__rmatmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __str__(self):
+    return str(self._v)
+
+  def __repr__(self):
+    return repr(self._v)
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+def _tensor_conversion_aggregate(var, dtype=None, name=None, as_ref=False):
+  return ops.internal_convert_to_tensor(
+      var.get(), dtype=dtype, name=name, as_ref=as_ref)
+
+
+ops.register_tensor_conversion_function(
+    AggregatingVariable, _tensor_conversion_aggregate)
+ops.register_dense_tensor_like_type(AggregatingVariable)
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 91a43d4999..3602f4d128 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -653,7 +653,7 @@ class MirroredVariableTest(test.TestCase):
 
   def _save_mirrored(self):
     """Save variables with mirroring, returns save_path."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v, devices, mirrored = _make_mirrored()
 
       # Overwrite the initial values.
@@ -668,7 +668,7 @@ class MirroredVariableTest(test.TestCase):
 
   def _save_normal(self):
     """Save variables without mirroring, returns save_path."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       var = variable_scope.get_variable(
           name="v", initializer=1., use_resource=True)
 
@@ -684,7 +684,7 @@ class MirroredVariableTest(test.TestCase):
 
   def _restore_normal(self, save_path):
     """Restore to variables without mirroring in a fresh graph."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       var = variable_scope.get_variable(
           name="v", initializer=7., use_resource=True)
 
@@ -698,7 +698,7 @@ class MirroredVariableTest(test.TestCase):
 
   def _restore_mirrored(self, save_path):
     """Restore to variables with mirroring in a fresh graph."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v, devices, mirrored = _make_mirrored()
 
       # Overwrite the initial values.
@@ -864,7 +864,7 @@ class TowerLocalVariableTest(test.TestCase):
 
   def _save_tower_local_mean(self):
     """Save variables with mirroring, returns save_path."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v, tower_local = _make_tower_local(
           variable_scope.VariableAggregation.MEAN)
 
@@ -881,7 +881,7 @@ class TowerLocalVariableTest(test.TestCase):
 
   def _save_tower_local_sum(self):
     """Save variables with mirroring, returns save_path."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v, tower_local = _make_tower_local("sum")
 
       # Overwrite the initial values.
@@ -897,7 +897,7 @@ class TowerLocalVariableTest(test.TestCase):
 
   def _save_normal(self):
     """Save variables without mirroring, returns save_path."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       var = variable_scope.get_variable(
           name="v", initializer=1., use_resource=True)
 
@@ -913,7 +913,7 @@ class TowerLocalVariableTest(test.TestCase):
 
   def _restore_normal(self, save_path):
     """Restore to variables without mirroring in a fresh graph."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       var = variable_scope.get_variable(
           name="v", initializer=7., use_resource=True)
 
@@ -927,7 +927,7 @@ class TowerLocalVariableTest(test.TestCase):
 
   def _restore_tower_local_mean(self, save_path):
     """Restore to variables with mirroring in a fresh graph."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v, tower_local = _make_tower_local(
           variable_scope.VariableAggregation.MEAN)
 
@@ -942,7 +942,7 @@ class TowerLocalVariableTest(test.TestCase):
 
   def _restore_tower_local_sum(self, save_path):
     """Restore to variables with mirroring in a fresh graph."""
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v, tower_local = _make_tower_local(variable_scope.VariableAggregation.SUM)
 
       # Overwrite the initial values.
diff --git a/tensorflow/contrib/distribute/python/warm_starting_util_test.py b/tensorflow/contrib/distribute/python/warm_starting_util_test.py
index d8bacdb338..5d57d144c1 100644
--- a/tensorflow/contrib/distribute/python/warm_starting_util_test.py
+++ b/tensorflow/contrib/distribute/python/warm_starting_util_test.py
@@ -56,7 +56,7 @@ class WarmStartingUtilWithDistributionStrategyTest(
 
     # Create variable and save checkpoint from which to warm-start.
     def create_var(g):
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         var = variable_scope.get_variable(var_name, initializer=original_value)
         sess.run(variables.global_variables_initializer())
         saver = saver_lib.Saver()
@@ -75,7 +75,7 @@ class WarmStartingUtilWithDistributionStrategyTest(
     self.assertAllEqual(original_value, prev_init_val)
 
     def warm_start(g):
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         # Initialize with zeros.
         var = variable_scope.get_variable(
             var_name, initializer=[[0., 0.], [0., 0.]])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py
index 0928dc3f35..a22d4d825b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py
@@ -53,7 +53,7 @@ class AutogressiveTest(test_util.VectorDistributionTestHelpers, test.TestCase):
   def testSampleAndLogProbConsistency(self):
     batch_shape = []
     event_size = 2
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_event_shape = np.concatenate([batch_shape, [event_size]], axis=0)
       sample0 = array_ops.zeros(batch_event_shape)
       affine = Affine(scale_tril=self._random_scale_tril(event_size))
@@ -67,7 +67,7 @@ class AutogressiveTest(test_util.VectorDistributionTestHelpers, test.TestCase):
     sample_shape = np.int32([4, 5])
     batch_shape = np.int32([])
     event_size = np.int32(2)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_event_shape = np.concatenate([batch_shape, [event_size]], axis=0)
       sample0 = array_ops.zeros(batch_event_shape)
       affine = Affine(scale_tril=self._random_scale_tril(event_size))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
index f2bb2d3325..62623deccd 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
@@ -76,7 +76,7 @@ class _BatchReshapeTest(object):
         wishart.log_prob(x), expected_log_prob_shape)
     actual_log_prob = reshape_wishart.log_prob(expected_sample)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       [
           batch_shape_,
           event_shape_,
@@ -132,7 +132,7 @@ class _BatchReshapeTest(object):
         wishart.variance(), expected_matrix_stat_shape)
     actual_variance = reshape_wishart.variance()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       [
           expected_entropy_, actual_entropy_,
           expected_mean_, actual_mean_,
@@ -202,7 +202,7 @@ class _BatchReshapeTest(object):
         normal.log_prob(x), expected_log_prob_shape)
     actual_log_prob = reshape_normal.log_prob(expected_sample)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       [
           batch_shape_,
           event_shape_,
@@ -255,7 +255,7 @@ class _BatchReshapeTest(object):
         normal.variance(), expected_scalar_stat_shape)
     actual_variance = reshape_normal.variance()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       [
           expected_entropy_, actual_entropy_,
           expected_mean_, actual_mean_,
@@ -323,7 +323,7 @@ class _BatchReshapeTest(object):
         mvn.log_prob(x), expected_log_prob_shape)
     actual_log_prob = reshape_mvn.log_prob(expected_sample)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       [
           batch_shape_,
           event_shape_,
@@ -385,7 +385,7 @@ class _BatchReshapeTest(object):
         mvn.covariance(), expected_matrix_stat_shape)
     actual_covariance = reshape_mvn.covariance()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       [
           expected_entropy_, actual_entropy_,
           expected_mean_, actual_mean_,
@@ -447,7 +447,7 @@ class _BatchReshapeTest(object):
             validate_args=True)
 
     else:
-      with self.test_session():
+      with self.cached_session():
         with self.assertRaisesOpError(r"Shape sizes do not match."):
           batch_reshape_lib.BatchReshape(
               distribution=mvn,
@@ -482,7 +482,7 @@ class _BatchReshapeTest(object):
             validate_args=True)
 
     else:
-      with self.test_session():
+      with self.cached_session():
         with self.assertRaisesOpError(r".*must be >=-1.*"):
           batch_reshape_lib.BatchReshape(
               distribution=mvn,
@@ -512,7 +512,7 @@ class _BatchReshapeTest(object):
             validate_args=True)
 
     else:
-      with self.test_session():
+      with self.cached_session():
         with self.assertRaisesOpError(r".*must be a vector.*"):
           batch_reshape_lib.BatchReshape(
               distribution=mvn,
@@ -548,11 +548,11 @@ class _BatchReshapeTest(object):
       return
 
     with self.assertRaisesOpError("too few batch and event dims"):
-      with self.test_session():
+      with self.cached_session():
         poisson_141_reshaped.log_prob(x_4).eval()
 
     with self.assertRaisesOpError("unexpected batch and event shape"):
-      with self.test_session():
+      with self.cached_session():
         poisson_141_reshaped.log_prob(x_114).eval()
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
index 042c8ebd51..372b7e37b7 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/absolute_value_test.py
@@ -31,7 +31,7 @@ class AbsoluteValueTest(test.TestCase):
   """Tests correctness of the absolute value bijector."""
 
   def testBijectorVersusNumpyRewriteOfBasicFunctionsEventNdims0(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       bijector = AbsoluteValue(validate_args=True)
       self.assertEqual("absolute_value", bijector.name)
       x = array_ops.constant([[0., 1., -1], [0., -5., 3.]])  # Shape [2, 3]
@@ -54,13 +54,13 @@ class AbsoluteValueTest(test.TestCase):
                               y, event_ndims=0)))
 
   def testNegativeYRaisesForInverseIfValidateArgs(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       bijector = AbsoluteValue(validate_args=True)
       with self.assertRaisesOpError("y was negative"):
         sess.run(bijector.inverse(-1.))
 
   def testNegativeYRaisesForILDJIfValidateArgs(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       bijector = AbsoluteValue(validate_args=True)
       with self.assertRaisesOpError("y was negative"):
         sess.run(bijector.inverse_log_det_jacobian(-1., event_ndims=0))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
index 1e4ad724d0..a7bd51430e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 class AffineLinearOperatorTest(test.TestCase):
 
   def testIdentity(self):
-    with self.test_session():
+    with self.cached_session():
       affine = AffineLinearOperator(
           validate_args=True)
       x = np.array([[1, 0, -1], [2, 3, 4]], dtype=np.float32)
@@ -45,7 +45,7 @@ class AffineLinearOperatorTest(test.TestCase):
           affine.forward_log_det_jacobian(x, event_ndims=2).eval())
 
   def testDiag(self):
-    with self.test_session():
+    with self.cached_session():
       shift = np.array([-1, 0, 1], dtype=np.float32)
       diag = np.array([[1, 2, 3],
                        [2, 5, 6]], dtype=np.float32)
@@ -67,7 +67,7 @@ class AffineLinearOperatorTest(test.TestCase):
           affine.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testTriL(self):
-    with self.test_session():
+    with self.cached_session():
       shift = np.array([-1, 0, 1], dtype=np.float32)
       tril = np.array([[[3, 0, 0],
                         [2, -1, 0],
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
index d2533620be..bc6752a69d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_scalar_test.py
@@ -31,14 +31,14 @@ class AffineScalarBijectorTest(test.TestCase):
   """Tests correctness of the Y = scale @ x + shift transformation."""
 
   def testProperties(self):
-    with self.test_session():
+    with self.cached_session():
       mu = -1.
       # scale corresponds to 1.
       bijector = AffineScalar(shift=mu)
       self.assertEqual("affine_scalar", bijector.name)
 
   def testNoBatchScalar(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       def static_run(fun, x, **kwargs):
         return fun(x, **kwargs).eval()
@@ -60,7 +60,7 @@ class AffineScalarBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testOneBatchScalarViaIdentityIn64BitUserProvidesShiftOnly(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       def static_run(fun, x, **kwargs):
         return fun(x, **kwargs).eval()
@@ -83,7 +83,7 @@ class AffineScalarBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testOneBatchScalarViaIdentityIn64BitUserProvidesScaleOnly(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       def static_run(fun, x, **kwargs):
         return fun(x, **kwargs).eval()
@@ -106,7 +106,7 @@ class AffineScalarBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testTwoBatchScalarIdentityViaIdentity(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       def static_run(fun, x, **kwargs):
         return fun(x, **kwargs).eval()
@@ -129,7 +129,7 @@ class AffineScalarBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testTwoBatchScalarIdentityViaScale(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       def static_run(fun, x, **kwargs):
         return fun(x, **kwargs).eval()
@@ -152,7 +152,7 @@ class AffineScalarBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=0))
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = AffineScalar(shift=3.6, scale=0.42)
       assert_scalar_congruency(bijector, lower_x=-2., upper_x=2.)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
index 9e14b9a53e..dc18eb3df6 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
@@ -32,14 +32,14 @@ class AffineBijectorTest(test.TestCase):
   """Tests correctness of the Y = scale @ x + shift transformation."""
 
   def testProperties(self):
-    with self.test_session():
+    with self.cached_session():
       mu = -1.
       # scale corresponds to 1.
       bijector = Affine(shift=mu)
       self.assertEqual("affine", bijector.name)
 
   def testNoBatchMultivariateIdentity(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -71,7 +71,7 @@ class AffineBijectorTest(test.TestCase):
             0., run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateDiag(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -114,7 +114,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateFullDynamic(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32, name="x")
       mu = array_ops.placeholder(dtypes.float32, name="mu")
       scale_diag = array_ops.placeholder(dtypes.float32, name="scale_diag")
@@ -137,7 +137,7 @@ class AffineBijectorTest(test.TestCase):
                    feed_dict))
 
   def testBatchMultivariateIdentity(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -161,7 +161,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testBatchMultivariateDiag(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -185,7 +185,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testBatchMultivariateFullDynamic(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32, name="x")
       mu = array_ops.placeholder(dtypes.float32, name="mu")
       scale_diag = array_ops.placeholder(dtypes.float32, name="scale_diag")
@@ -209,7 +209,7 @@ class AffineBijectorTest(test.TestCase):
               x, event_ndims=1), feed_dict))
 
   def testIdentityWithDiagUpdate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -235,7 +235,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityWithTriL(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -261,7 +261,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testDiagWithTriL(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -285,7 +285,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityAndDiagWithTriL(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -312,7 +312,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testIdentityWithVDVTUpdate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -349,7 +349,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testDiagWithVDVTUpdate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -385,7 +385,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testTriLWithVDVTUpdate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -422,7 +422,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testTriLWithVDVTUpdateNoDiagonal(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       placeholder = array_ops.placeholder(dtypes.float32, name="x")
 
       def static_run(fun, x, **kwargs):
@@ -459,7 +459,7 @@ class AffineBijectorTest(test.TestCase):
             run(bijector_ref.inverse_log_det_jacobian, x, event_ndims=1))
 
   def testNoBatchMultivariateRaisesWhenSingular(self):
-    with self.test_session():
+    with self.cached_session():
       mu = [1., -1]
       bijector = Affine(
           shift=mu,
@@ -531,7 +531,7 @@ class AffineBijectorTest(test.TestCase):
           itertools.combinations(s, r) for r in range(len(s) + 1))
 
     for args in _powerset(scale_params.items()):
-      with self.test_session():
+      with self.cached_session():
         args = dict(args)
 
         scale_args = dict({"x": x}, **args)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
index c832fcaa68..bf61e9f2fe 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/batch_normalization_test.py
@@ -69,7 +69,7 @@ class BatchNormTest(test_util.VectorDistributionTestHelpers,
     ]
     for input_shape, event_dims, training in params:
       x_ = np.arange(5 * 4 * 2).astype(np.float32).reshape(input_shape)
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         x = constant_op.constant(x_)
         # When training, memorize the exact mean of the last
         # minibatch that it normalized (instead of moving average assignment).
@@ -145,7 +145,7 @@ class BatchNormTest(test_util.VectorDistributionTestHelpers,
 
   def testMaximumLikelihoodTraining(self):
     # Test Maximum Likelihood training with default bijector.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       base_dist = distributions.MultivariateNormalDiag(loc=[0., 0.])
       batch_norm = BatchNormalization(training=True)
       dist = transformed_distribution_lib.TransformedDistribution(
@@ -176,7 +176,7 @@ class BatchNormTest(test_util.VectorDistributionTestHelpers,
       self.assertAllClose([1., 1.], moving_var_, atol=5e-2)
 
   def testLogProb(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       layer = normalization.BatchNormalization(epsilon=0.)
       batch_norm = BatchNormalization(batchnorm_layer=layer, training=False)
       base_dist = distributions.MultivariateNormalDiag(loc=[0., 0.])
@@ -196,7 +196,7 @@ class BatchNormTest(test_util.VectorDistributionTestHelpers,
   def testMutuallyConsistent(self):
     # BatchNorm bijector is only mutually consistent when training=False.
     dims = 4
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       layer = normalization.BatchNormalization(epsilon=0.)
       batch_norm = BatchNormalization(batchnorm_layer=layer, training=False)
       dist = transformed_distribution_lib.TransformedDistribution(
@@ -215,7 +215,7 @@ class BatchNormTest(test_util.VectorDistributionTestHelpers,
   def testInvertMutuallyConsistent(self):
     # BatchNorm bijector is only mutually consistent when training=False.
     dims = 4
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       layer = normalization.BatchNormalization(epsilon=0.)
       batch_norm = Invert(
           BatchNormalization(batchnorm_layer=layer, training=False))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
index dc45114b1c..ada99ec9c6 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
@@ -46,7 +46,7 @@ class ChainBijectorTest(test.TestCase):
   """Tests the correctness of the Y = Chain(bij1, bij2, bij3) transformation."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       chain = Chain((Exp(), Softplus()))
       self.assertEqual("chain_of_exp_of_softplus", chain.name)
       x = np.asarray([[[1., 2.],
@@ -61,7 +61,7 @@ class ChainBijectorTest(test.TestCase):
           chain.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testBijectorIdentity(self):
-    with self.test_session():
+    with self.cached_session():
       chain = Chain()
       self.assertEqual("identity", chain.name)
       x = np.asarray([[[1., 2.],
@@ -74,13 +74,13 @@ class ChainBijectorTest(test.TestCase):
           0., chain.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       chain = Chain((Exp(), Softplus()))
       assert_scalar_congruency(
           chain, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
   def testShapeGetters(self):
-    with self.test_session():
+    with self.cached_session():
       chain = Chain([
           SoftmaxCentered(validate_args=True),
           SoftmaxCentered(validate_args=True),
@@ -195,7 +195,7 @@ class ChainBijectorTest(test.TestCase):
         dtype=np.float32, shape=[None, 10], name="samples")
     ildj = chain.inverse_log_det_jacobian(samples, event_ndims=0)
     self.assertTrue(ildj is not None)
-    with self.test_session():
+    with self.cached_session():
       ildj.eval({samples: np.zeros([2, 10], np.float32)})
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
index d1ce273499..9681b64ced 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
@@ -30,7 +30,7 @@ class CholeskyOuterProductBijectorTest(test.TestCase):
   """Tests the correctness of the Y = X @ X.T transformation."""
 
   def testBijectorMatrix(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = bijectors.CholeskyOuterProduct(validate_args=True)
       self.assertEqual("cholesky_outer_product", bijector.name)
       x = [[[1., 0], [2, 1]], [[np.sqrt(2.), 0], [np.sqrt(8.), 1]]]
@@ -75,7 +75,7 @@ class CholeskyOuterProductBijectorTest(test.TestCase):
     bijector = bijectors.CholeskyOuterProduct()
     x_pl = array_ops.placeholder(dtypes.float32)
 
-    with self.test_session():
+    with self.cached_session():
       log_det_jacobian = bijector.forward_log_det_jacobian(x_pl, event_ndims=2)
 
       # The Jacobian matrix is 2 * tf.eye(2), which has jacobian determinant 4.
@@ -86,7 +86,7 @@ class CholeskyOuterProductBijectorTest(test.TestCase):
   def testNoBatchStatic(self):
     x = np.array([[1., 0], [2, 1]])  # np.linalg.cholesky(y)
     y = np.array([[1., 2], [2, 5]])  # np.matmul(x, x.T)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       y_actual = bijectors.CholeskyOuterProduct().forward(x=x)
       x_actual = bijectors.CholeskyOuterProduct().inverse(y=y)
     [y_actual_, x_actual_] = sess.run([y_actual, x_actual])
@@ -98,7 +98,7 @@ class CholeskyOuterProductBijectorTest(test.TestCase):
   def testNoBatchDeferred(self):
     x = np.array([[1., 0], [2, 1]])  # np.linalg.cholesky(y)
     y = np.array([[1., 2], [2, 5]])  # np.matmul(x, x.T)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x_pl = array_ops.placeholder(dtypes.float32)
       y_pl = array_ops.placeholder(dtypes.float32)
       y_actual = bijectors.CholeskyOuterProduct().forward(x=x_pl)
@@ -119,7 +119,7 @@ class CholeskyOuterProductBijectorTest(test.TestCase):
                    [2, 5]],
                   [[9., 3],
                    [3, 5]]])  # np.matmul(x, x.T)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       y_actual = bijectors.CholeskyOuterProduct().forward(x=x)
       x_actual = bijectors.CholeskyOuterProduct().inverse(y=y)
     [y_actual_, x_actual_] = sess.run([y_actual, x_actual])
@@ -137,7 +137,7 @@ class CholeskyOuterProductBijectorTest(test.TestCase):
                    [2, 5]],
                   [[9., 3],
                    [3, 5]]])  # np.matmul(x, x.T)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x_pl = array_ops.placeholder(dtypes.float32)
       y_pl = array_ops.placeholder(dtypes.float32)
       y_actual = bijectors.CholeskyOuterProduct().forward(x=x_pl)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
index 7be939cd27..d2c00865e7 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
@@ -30,7 +30,7 @@ class ExpBijectorTest(test.TestCase):
   """Tests correctness of the Y = g(X) = exp(X) transformation."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Exp()
       self.assertEqual("exp", bijector.name)
       x = [[[1.], [2.]]]
@@ -48,13 +48,13 @@ class ExpBijectorTest(test.TestCase):
               x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Exp()
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=1.5, rtol=0.05)
 
   def testBijectiveAndFinite(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Exp()
       x = np.linspace(-10, 10, num=10).astype(np.float32)
       y = np.logspace(-10, 10, num=10).astype(np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
index 54e54c3296..b9cdbfb823 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/gumbel_test.py
@@ -31,7 +31,7 @@ class GumbelBijectorTest(test.TestCase):
   """Tests correctness of the Gumbel bijector."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       loc = 0.3
       scale = 5.
       bijector = Gumbel(loc=loc, scale=scale, validate_args=True)
@@ -52,12 +52,12 @@ class GumbelBijectorTest(test.TestCase):
           atol=0.)
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       assert_scalar_congruency(
           Gumbel(loc=0.3, scale=20.), lower_x=1., upper_x=100., rtol=0.02)
 
   def testBijectiveAndFinite(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Gumbel(loc=0., scale=3.0, validate_args=True)
       x = np.linspace(-10., 10., num=10).astype(np.float32)
       y = np.linspace(0.01, 0.99, num=10).astype(np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
index 7d3bd758cd..c9bccb36fc 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
@@ -32,7 +32,7 @@ class InlineBijectorTest(test.TestCase):
   """Tests correctness of the inline constructed bijector."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       exp = Exp()
       inline = Inline(
           forward_fn=math_ops.exp,
@@ -55,7 +55,7 @@ class InlineBijectorTest(test.TestCase):
           inline.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testShapeGetters(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Inline(
           forward_event_shape_tensor_fn=lambda x: array_ops.concat((x, [1]), 0),
           forward_event_shape_fn=lambda x: x.as_list() + [1],
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
index 8b14c8327f..7e3340aeb0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
@@ -31,7 +31,7 @@ class InvertBijectorTest(test.TestCase):
   """Tests the correctness of the Y = Invert(bij) transformation."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       for fwd in [
           bijectors.Identity(),
           bijectors.Exp(),
@@ -53,13 +53,13 @@ class InvertBijectorTest(test.TestCase):
             rev.forward_log_det_jacobian(x, event_ndims=1).eval())
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = bijectors.Invert(bijectors.Exp())
       assert_scalar_congruency(
           bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
   def testShapeGetters(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = bijectors.Invert(bijectors.SoftmaxCentered(validate_args=True))
       x = tensor_shape.TensorShape([2])
       y = tensor_shape.TensorShape([1])
@@ -73,7 +73,7 @@ class InvertBijectorTest(test.TestCase):
           bijector.inverse_event_shape_tensor(y.as_list()).eval())
 
   def testDocstringExample(self):
-    with self.test_session():
+    with self.cached_session():
       exp_gamma_distribution = (
           transformed_distribution_lib.TransformedDistribution(
               distribution=gamma_lib.Gamma(concentration=1., rate=2.),
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
index a8089881f6..b3fb50005e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
@@ -30,7 +30,7 @@ class KumaraswamyBijectorTest(test.TestCase):
   """Tests correctness of the Kumaraswamy bijector."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       a = 2.
       b = 0.3
       bijector = Kumaraswamy(
@@ -54,13 +54,13 @@ class KumaraswamyBijectorTest(test.TestCase):
           atol=0.)
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       assert_scalar_congruency(
           Kumaraswamy(concentration1=0.5, concentration0=1.1),
           lower_x=0., upper_x=1., n=int(10e3), rtol=0.02)
 
   def testBijectiveAndFinite(self):
-    with self.test_session():
+    with self.cached_session():
       concentration1 = 1.2
       concentration0 = 2.
       bijector = Kumaraswamy(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
index 5ba5a2083b..ad4329d425 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
@@ -71,7 +71,7 @@ class MaskedAutoregressiveFlowTest(test_util.VectorDistributionTestHelpers,
 
   def testBijector(self):
     x_ = np.arange(3 * 4 * 2).astype(np.float32).reshape(3, 4, 2)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       ma = MaskedAutoregressiveFlow(
           validate_args=True,
           **self._autoregressive_flow_kwargs)
@@ -102,7 +102,7 @@ class MaskedAutoregressiveFlowTest(test_util.VectorDistributionTestHelpers,
 
   def testMutuallyConsistent(self):
     dims = 4
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       ma = MaskedAutoregressiveFlow(
           validate_args=True,
           **self._autoregressive_flow_kwargs)
@@ -121,7 +121,7 @@ class MaskedAutoregressiveFlowTest(test_util.VectorDistributionTestHelpers,
 
   def testInvertMutuallyConsistent(self):
     dims = 4
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       ma = Invert(MaskedAutoregressiveFlow(
           validate_args=True,
           **self._autoregressive_flow_kwargs))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
index 49a9afe3f6..31ee36f024 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MatrixInverseTriLBijectorTest(test.TestCase):
   """Tests the correctness of the Y = inv(tril) transformation."""
 
@@ -40,7 +41,6 @@ class MatrixInverseTriLBijectorTest(test.TestCase):
       y[idx][np.triu_indices(y[idx].shape[-1], 1)] = 0
     return y
 
-  @test_util.run_in_graph_and_eager_modes
   def testComputesCorrectValues(self):
     inv = bijectors.MatrixInverseTriL(validate_args=True)
     self.assertEqual("matrix_inverse_tril", inv.name)
@@ -62,7 +62,6 @@ class MatrixInverseTriLBijectorTest(test.TestCase):
     self.assertNear(expected_fldj_, fldj_, err=1e-3)
     self.assertNear(-expected_fldj_, ildj_, err=1e-3)
 
-  @test_util.run_in_graph_and_eager_modes
   def testOneByOneMatrix(self):
     inv = bijectors.MatrixInverseTriL(validate_args=True)
     x_ = np.array([[5.]], dtype=np.float32)
@@ -81,7 +80,6 @@ class MatrixInverseTriLBijectorTest(test.TestCase):
     self.assertNear(expected_fldj_, fldj_, err=1e-3)
     self.assertNear(-expected_fldj_, ildj_, err=1e-3)
 
-  @test_util.run_in_graph_and_eager_modes
   def testZeroByZeroMatrix(self):
     inv = bijectors.MatrixInverseTriL(validate_args=True)
     x_ = np.eye(0, dtype=np.float32)
@@ -100,7 +98,6 @@ class MatrixInverseTriLBijectorTest(test.TestCase):
     self.assertNear(expected_fldj_, fldj_, err=1e-3)
     self.assertNear(-expected_fldj_, ildj_, err=1e-3)
 
-  @test_util.run_in_graph_and_eager_modes
   def testBatch(self):
     # Test batch computation with input shape (2, 1, 2, 2), i.e. batch shape
     # (2, 1).
@@ -125,20 +122,18 @@ class MatrixInverseTriLBijectorTest(test.TestCase):
     self.assertAllClose(expected_fldj_, fldj_, atol=0., rtol=1e-3)
     self.assertAllClose(-expected_fldj_, ildj_, atol=0., rtol=1e-3)
 
-  @test_util.run_in_graph_and_eager_modes
   def testErrorOnInputRankTooLow(self):
     inv = bijectors.MatrixInverseTriL(validate_args=True)
     x_ = np.array([0.1], dtype=np.float32)
     rank_error_msg = "must have rank at least 2"
-    with self.test_session():
-      with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
-        inv.forward(x_).eval()
-      with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
-        inv.inverse(x_).eval()
-      with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
-        inv.forward_log_det_jacobian(x_, event_ndims=2).eval()
-      with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
-        inv.inverse_log_det_jacobian(x_, event_ndims=2).eval()
+    with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
+      self.evaluate(inv.forward(x_))
+    with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
+      self.evaluate(inv.inverse(x_))
+    with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
+      self.evaluate(inv.forward_log_det_jacobian(x_, event_ndims=2))
+    with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
+      self.evaluate(inv.inverse_log_det_jacobian(x_, event_ndims=2))
 
   # TODO(b/80481923): Figure out why these assertions fail, and fix them.
   ## def testErrorOnInputNonSquare(self):
@@ -146,55 +141,50 @@ class MatrixInverseTriLBijectorTest(test.TestCase):
   ##   x_ = np.array([[1., 2., 3.],
   ##                  [4., 5., 6.]], dtype=np.float32)
   ##   square_error_msg = "must be a square matrix"
-  ##   with self.test_session():
-  ##     with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
-  ##                                              square_error_msg):
-  ##       inv.forward(x_).eval()
-  ##     with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
-  ##                                              square_error_msg):
-  ##       inv.inverse(x_).eval()
-  ##     with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
-  ##                                              square_error_msg):
-  ##       inv.forward_log_det_jacobian(x_, event_ndims=2).eval()
-  ##     with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
-  ##                                              square_error_msg):
-  ##       inv.inverse_log_det_jacobian(x_, event_ndims=2).eval()
-
-  @test_util.run_in_graph_and_eager_modes
+  ##   with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+  ##                                            square_error_msg):
+  ##     self.evaluate(inv.forward(x_))
+  ##   with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+  ##                                            square_error_msg):
+  ##     self.evaluate(inv.inverse(x_))
+  ##   with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+  ##                                            square_error_msg):
+  ##     self.evaluate(inv.forward_log_det_jacobian(x_, event_ndims=2))
+  ##   with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+  ##                                            square_error_msg):
+  ##     self.evaluate(inv.inverse_log_det_jacobian(x_, event_ndims=2))
+
   def testErrorOnInputNotLowerTriangular(self):
     inv = bijectors.MatrixInverseTriL(validate_args=True)
     x_ = np.array([[1., 2.],
                    [3., 4.]], dtype=np.float32)
     triangular_error_msg = "must be lower triangular"
-    with self.test_session():
-      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
-                                               triangular_error_msg):
-        inv.forward(x_).eval()
-      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
-                                               triangular_error_msg):
-        inv.inverse(x_).eval()
-      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
-                                               triangular_error_msg):
-        inv.forward_log_det_jacobian(x_, event_ndims=2).eval()
-      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
-                                               triangular_error_msg):
-        inv.inverse_log_det_jacobian(x_, event_ndims=2).eval()
-
-  @test_util.run_in_graph_and_eager_modes
+    with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                             triangular_error_msg):
+      self.evaluate(inv.forward(x_))
+    with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                             triangular_error_msg):
+      self.evaluate(inv.inverse(x_))
+    with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                             triangular_error_msg):
+      self.evaluate(inv.forward_log_det_jacobian(x_, event_ndims=2))
+    with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                             triangular_error_msg):
+      self.evaluate(inv.inverse_log_det_jacobian(x_, event_ndims=2))
+
   def testErrorOnInputSingular(self):
     inv = bijectors.MatrixInverseTriL(validate_args=True)
     x_ = np.array([[1., 0.],
                    [0., 0.]], dtype=np.float32)
     nonsingular_error_msg = "must have all diagonal entries nonzero"
-    with self.test_session():
-      with self.assertRaisesOpError(nonsingular_error_msg):
-        inv.forward(x_).eval()
-      with self.assertRaisesOpError(nonsingular_error_msg):
-        inv.inverse(x_).eval()
-      with self.assertRaisesOpError(nonsingular_error_msg):
-        inv.forward_log_det_jacobian(x_, event_ndims=2).eval()
-      with self.assertRaisesOpError(nonsingular_error_msg):
-        inv.inverse_log_det_jacobian(x_, event_ndims=2).eval()
+    with self.assertRaisesOpError(nonsingular_error_msg):
+      self.evaluate(inv.forward(x_))
+    with self.assertRaisesOpError(nonsingular_error_msg):
+      self.evaluate(inv.inverse(x_))
+    with self.assertRaisesOpError(nonsingular_error_msg):
+      self.evaluate(inv.forward_log_det_jacobian(x_, event_ndims=2))
+    with self.assertRaisesOpError(nonsingular_error_msg):
+      self.evaluate(inv.inverse_log_det_jacobian(x_, event_ndims=2))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
index cb42331a21..9a88f8f1bc 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py
@@ -38,26 +38,25 @@ class OrderedBijectorTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testBijectorVector(self):
-    with self.test_session():
-      ordered = Ordered()
-      self.assertEqual("ordered", ordered.name)
-      x = np.asarray([[2., 3, 4], [4., 8, 13]])
-      y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
-      self.assertAllClose(y, self.evaluate(ordered.forward(x)))
-      self.assertAllClose(x, self.evaluate(ordered.inverse(y)))
-      self.assertAllClose(
-          np.sum(np.asarray(y)[..., 1:], axis=-1),
-          self.evaluate(ordered.inverse_log_det_jacobian(y, event_ndims=1)),
-          atol=0.,
-          rtol=1e-7)
-      self.assertAllClose(
-          self.evaluate(-ordered.inverse_log_det_jacobian(y, event_ndims=1)),
-          self.evaluate(ordered.forward_log_det_jacobian(x, event_ndims=1)),
-          atol=0.,
-          rtol=1e-7)
+    ordered = Ordered()
+    self.assertEqual("ordered", ordered.name)
+    x = np.asarray([[2., 3, 4], [4., 8, 13]])
+    y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]]
+    self.assertAllClose(y, self.evaluate(ordered.forward(x)))
+    self.assertAllClose(x, self.evaluate(ordered.inverse(y)))
+    self.assertAllClose(
+        np.sum(np.asarray(y)[..., 1:], axis=-1),
+        self.evaluate(ordered.inverse_log_det_jacobian(y, event_ndims=1)),
+        atol=0.,
+        rtol=1e-7)
+    self.assertAllClose(
+        self.evaluate(-ordered.inverse_log_det_jacobian(y, event_ndims=1)),
+        self.evaluate(ordered.forward_log_det_jacobian(x, event_ndims=1)),
+        atol=0.,
+        rtol=1e-7)
 
   def testBijectorUnknownShape(self):
-    with self.test_session():
+    with self.cached_session():
       ordered = Ordered()
       self.assertEqual("ordered", ordered.name)
       x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
@@ -84,21 +83,20 @@ class OrderedBijectorTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testShapeGetters(self):
-    with self.test_session():
-      x = tensor_shape.TensorShape([4])
-      y = tensor_shape.TensorShape([4])
-      bijector = Ordered(validate_args=True)
-      self.assertAllEqual(y, bijector.forward_event_shape(x))
-      self.assertAllEqual(y.as_list(),
-                          self.evaluate(bijector.forward_event_shape_tensor(
-                              x.as_list())))
-      self.assertAllEqual(x, bijector.inverse_event_shape(y))
-      self.assertAllEqual(x.as_list(),
-                          self.evaluate(bijector.inverse_event_shape_tensor(
-                              y.as_list())))
+    x = tensor_shape.TensorShape([4])
+    y = tensor_shape.TensorShape([4])
+    bijector = Ordered(validate_args=True)
+    self.assertAllEqual(y, bijector.forward_event_shape(x))
+    self.assertAllEqual(y.as_list(),
+                        self.evaluate(bijector.forward_event_shape_tensor(
+                            x.as_list())))
+    self.assertAllEqual(x, bijector.inverse_event_shape(y))
+    self.assertAllEqual(x.as_list(),
+                        self.evaluate(bijector.inverse_event_shape_tensor(
+                            y.as_list())))
 
   def testBijectiveAndFinite(self):
-    with self.test_session():
+    with self.cached_session():
       ordered = Ordered()
       x = np.sort(self._rng.randn(3, 10), axis=-1).astype(np.float32)
       y = (self._rng.randn(3, 10)).astype(np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
index 7eef4ab599..e2062ed55d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/permute_test.py
@@ -38,7 +38,7 @@ class PermuteBijectorTest(test.TestCase):
     expected_x = np.random.randn(4, 2, 3)
     expected_y = expected_x[..., expected_permutation]
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       permutation_ph = array_ops.placeholder(dtype=dtypes.int32)
       bijector = Permute(
           permutation=permutation_ph,
@@ -64,7 +64,7 @@ class PermuteBijectorTest(test.TestCase):
       self.assertAllClose(0., ildj, rtol=1e-6, atol=0)
 
   def testRaisesOpError(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.assertRaisesOpError("Permutation over `d` must contain"):
         permutation_ph = array_ops.placeholder(dtype=dtypes.int32)
         bijector = Permute(
@@ -77,7 +77,7 @@ class PermuteBijectorTest(test.TestCase):
     permutation = np.int32([2, 0, 1])
     x = np.random.randn(4, 2, 3)
     y = x[..., permutation]
-    with self.test_session():
+    with self.cached_session():
       bijector = Permute(permutation=permutation, validate_args=True)
       assert_bijective_and_finite(
           bijector, x, y, event_ndims=1, rtol=1e-6, atol=0)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
index 85d2283013..ef303ab664 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
@@ -30,7 +30,7 @@ class PowerTransformBijectorTest(test.TestCase):
   """Tests correctness of the power transformation."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       c = 0.2
       bijector = PowerTransform(power=c, validate_args=True)
       self.assertEqual("power_transform", bijector.name)
@@ -48,13 +48,13 @@ class PowerTransformBijectorTest(test.TestCase):
           atol=0.)
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = PowerTransform(power=0.2, validate_args=True)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=1.5, rtol=0.05)
 
   def testBijectiveAndFinite(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = PowerTransform(power=0.2, validate_args=True)
       x = np.linspace(-4.999, 10, num=10).astype(np.float32)
       y = np.logspace(0.001, 10, num=10).astype(np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
index 2d52895fbe..b3b7b8535e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
@@ -43,7 +43,7 @@ class RealNVPTest(test_util.VectorDistributionTestHelpers, test.TestCase):
 
   def testBijector(self):
     x_ = np.arange(3 * 4 * 2).astype(np.float32).reshape(3, 4 * 2)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       nvp = RealNVP(
           num_masked=4,
           validate_args=True,
@@ -78,7 +78,7 @@ class RealNVPTest(test_util.VectorDistributionTestHelpers, test.TestCase):
 
   def testMutuallyConsistent(self):
     dims = 4
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       nvp = RealNVP(
           num_masked=3,
           validate_args=True,
@@ -98,7 +98,7 @@ class RealNVPTest(test_util.VectorDistributionTestHelpers, test.TestCase):
 
   def testInvertMutuallyConsistent(self):
     dims = 4
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       nvp = Invert(RealNVP(
           num_masked=3,
           validate_args=True,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index d44e49b487..79eadf524b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -50,7 +50,7 @@ class _ReshapeBijectorTest(object):
     expected_x = np.random.randn(4, 3, 2)
     expected_y = np.reshape(expected_x, [4, 6])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       shape_in, shape_out, feed_dict = self.build_shapes([3, 2], [6,])
       bijector = Reshape(
           event_shape_out=shape_out,
@@ -84,7 +84,7 @@ class _ReshapeBijectorTest(object):
 
     # using the _tensor methods, we should always get a fully-specified
     # result since these are evaluated at graph runtime.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       (shape_out_,
        shape_in_) = sess.run((
            bijector.forward_event_shape_tensor(shape_in),
@@ -103,7 +103,7 @@ class _ReshapeBijectorTest(object):
     expected_y_scalar = expected_x_scalar[0]
 
     shape_in, shape_out, feed_dict = self.build_shapes([], [1,])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       bijector = Reshape(
           event_shape_out=shape_in,
           event_shape_in=shape_out, validate_args=True)
@@ -124,7 +124,7 @@ class _ReshapeBijectorTest(object):
 
   def testMultipleUnspecifiedDimensionsOpError(self):
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [4, -1, -1,])
       bijector = Reshape(
           event_shape_out=shape_out,
@@ -139,7 +139,7 @@ class _ReshapeBijectorTest(object):
   # pylint: disable=invalid-name
   def _testInvalidDimensionsOpError(self, expected_error_message):
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
 
       shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 2, -2,])
       bijector = Reshape(
@@ -155,7 +155,7 @@ class _ReshapeBijectorTest(object):
   def testValidButNonMatchingInputOpError(self):
     x = np.random.randn(4, 3, 2)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 6, 1,])
       bijector = Reshape(
           event_shape_out=shape_out,
@@ -173,7 +173,7 @@ class _ReshapeBijectorTest(object):
   def testValidButNonMatchingInputPartiallySpecifiedOpError(self):
     x = np.random.randn(4, 3, 2)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       shape_in, shape_out, feed_dict = self.build_shapes([2, -1], [1, 6, 1,])
       bijector = Reshape(
           event_shape_out=shape_out,
@@ -190,7 +190,7 @@ class _ReshapeBijectorTest(object):
     x1 = np.random.randn(4, 2, 3)
     x2 = np.random.randn(4, 1, 1, 5)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       shape_in, shape_out, fd_mismatched = self.build_shapes([2, 3],
                                                              [1, 1, 5])
       bijector = Reshape(
@@ -208,7 +208,7 @@ class _ReshapeBijectorTest(object):
     expected_x = np.random.randn(4, 6)
     expected_y = np.reshape(expected_x, [4, 2, 3])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # one of input/output shapes is partially specified
       shape_in, shape_out, feed_dict = self.build_shapes([-1,], [2, 3])
       bijector = Reshape(
@@ -227,7 +227,7 @@ class _ReshapeBijectorTest(object):
   def testBothShapesPartiallySpecified(self):
     expected_x = np.random.randn(4, 2, 3)
     expected_y = np.reshape(expected_x, [4, 3, 2])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       shape_in, shape_out, feed_dict = self.build_shapes([-1, 3], [-1, 2])
       bijector = Reshape(
           event_shape_out=shape_out,
@@ -245,7 +245,7 @@ class _ReshapeBijectorTest(object):
   def testDefaultVectorShape(self):
     expected_x = np.random.randn(4, 4)
     expected_y = np.reshape(expected_x, [4, 2, 2])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _, shape_out, feed_dict = self.build_shapes([-1,], [-1, 2])
       bijector = Reshape(shape_out,
                          validate_args=True)
@@ -292,7 +292,7 @@ class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
   def testBijectiveAndFinite(self):
     x = np.random.randn(4, 2, 3)
     y = np.reshape(x, [4, 1, 2, 3])
-    with self.test_session():
+    with self.cached_session():
       bijector = Reshape(
           event_shape_in=[2, 3],
           event_shape_out=[1, 2, 3],
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
index cea4a62c22..a6d432753d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
@@ -31,7 +31,7 @@ class SigmoidBijectorTest(test.TestCase):
   """Tests correctness of the Y = g(X) = (1 + exp(-X))^-1 transformation."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       self.assertEqual("sigmoid", Sigmoid().name)
       x = np.linspace(-10., 10., 100).reshape([2, 5, 10]).astype(np.float32)
       y = special.expit(x)
@@ -45,11 +45,11 @@ class SigmoidBijectorTest(test.TestCase):
           x, event_ndims=0).eval(), atol=0., rtol=1e-4)
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       assert_scalar_congruency(Sigmoid(), lower_x=-7., upper_x=7.)
 
   def testBijectiveAndFinite(self):
-    with self.test_session():
+    with self.cached_session():
       x = np.linspace(-7., 7., 100).astype(np.float32)
       eps = 1e-3
       y = np.linspace(eps, 1. - eps, 100).astype(np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
index 795f1993ba..282619a73b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
@@ -33,7 +33,7 @@ class SinhArcsinhBijectorTest(test.TestCase):
   """Tests correctness of the power transformation."""
 
   def testBijectorVersusNumpyRewriteOfBasicFunctions(self):
-    with self.test_session():
+    with self.cached_session():
       skewness = 0.2
       tailweight = 2.0
       bijector = SinhArcsinh(
@@ -58,7 +58,7 @@ class SinhArcsinhBijectorTest(test.TestCase):
           atol=0.)
 
   def testLargerTailWeightPutsMoreWeightInTails(self):
-    with self.test_session():
+    with self.cached_session():
       # Will broadcast together to shape [3, 2].
       x = [-1., 1.]
       tailweight = [[0.5], [1.0], [2.0]]
@@ -75,7 +75,7 @@ class SinhArcsinhBijectorTest(test.TestCase):
       self.assertLess(forward_1[1], forward_1[2])
 
   def testSkew(self):
-    with self.test_session():
+    with self.cached_session():
       # Will broadcast together to shape [3, 2].
       x = [-1., 1.]
       skewness = [[-1.], [0.], [1.]]
@@ -92,24 +92,24 @@ class SinhArcsinhBijectorTest(test.TestCase):
       self.assertLess(np.abs(y[2, 0]), np.abs(y[2, 1]))
 
   def testScalarCongruencySkewness1Tailweight0p5(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = SinhArcsinh(skewness=1.0, tailweight=0.5, validate_args=True)
       assert_scalar_congruency(bijector, lower_x=-2., upper_x=2.0, rtol=0.05)
 
   def testScalarCongruencySkewnessNeg1Tailweight1p5(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = SinhArcsinh(skewness=-1.0, tailweight=1.5, validate_args=True)
       assert_scalar_congruency(bijector, lower_x=-2., upper_x=2.0, rtol=0.05)
 
   def testBijectiveAndFiniteSkewnessNeg1Tailweight0p5(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = SinhArcsinh(skewness=-1., tailweight=0.5, validate_args=True)
       x = np.concatenate((-np.logspace(-2, 10, 1000), [0], np.logspace(
           -2, 10, 1000))).astype(np.float32)
       assert_bijective_and_finite(bijector, x, x, event_ndims=0, rtol=1e-3)
 
   def testBijectiveAndFiniteSkewness1Tailweight3(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = SinhArcsinh(skewness=1., tailweight=3., validate_args=True)
       x = np.concatenate((-np.logspace(-2, 5, 1000), [0], np.logspace(
           -2, 5, 1000))).astype(np.float32)
@@ -117,7 +117,7 @@ class SinhArcsinhBijectorTest(test.TestCase):
           bijector, x, x, event_ndims=0, rtol=1e-3)
 
   def testBijectorEndpoints(self):
-    with self.test_session():
+    with self.cached_session():
       for dtype in (np.float32, np.float64):
         bijector = SinhArcsinh(
             skewness=dtype(0.), tailweight=dtype(1.), validate_args=True)
@@ -129,7 +129,7 @@ class SinhArcsinhBijectorTest(test.TestCase):
             bijector, bounds, bounds, event_ndims=0, atol=2e-6)
 
   def testBijectorOverRange(self):
-    with self.test_session():
+    with self.cached_session():
       for dtype in (np.float32, np.float64):
         skewness = np.array([1.2, 5.], dtype=dtype)
         tailweight = np.array([2., 10.], dtype=dtype)
@@ -176,12 +176,12 @@ class SinhArcsinhBijectorTest(test.TestCase):
             atol=0.)
 
   def testZeroTailweightRaises(self):
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("not positive"):
         SinhArcsinh(tailweight=0., validate_args=True).forward(1.0).eval()
 
   def testDefaultDtypeIsFloat32(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = SinhArcsinh()
       self.assertEqual(bijector.tailweight.dtype, np.float32)
       self.assertEqual(bijector.skewness.dtype, np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
index 0f0a2fa531..8d18400487 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
@@ -35,7 +35,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
   """Tests correctness of the Y = g(X) = exp(X) / sum(exp(X)) transformation."""
 
   def testBijectorVector(self):
-    with self.test_session():
+    with self.cached_session():
       softmax = SoftmaxCentered()
       self.assertEqual("softmax_centered", softmax.name)
       x = np.log([[2., 3, 4], [4., 8, 12]])
@@ -54,7 +54,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
           rtol=1e-7)
 
   def testBijectorUnknownShape(self):
-    with self.test_session():
+    with self.cached_session():
       softmax = SoftmaxCentered()
       self.assertEqual("softmax_centered", softmax.name)
       x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32)
@@ -80,7 +80,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
           rtol=1e-7)
 
   def testShapeGetters(self):
-    with self.test_session():
+    with self.cached_session():
       x = tensor_shape.TensorShape([4])
       y = tensor_shape.TensorShape([5])
       bijector = SoftmaxCentered(validate_args=True)
@@ -94,7 +94,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
                               y.as_list()).eval())
 
   def testBijectiveAndFinite(self):
-    with self.test_session():
+    with self.cached_session():
       softmax = SoftmaxCentered()
       x = np.linspace(-50, 50, num=10).reshape(5, 2).astype(np.float32)
       # Make y values on the simplex with a wide range.
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
index 3d8a0a32bb..e805619041 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
@@ -42,13 +42,13 @@ class SoftplusBijectorTest(test.TestCase):
     return -np.log(1 - np.exp(-y))
 
   def testHingeSoftnessZeroRaises(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus(hinge_softness=0., validate_args=True)
       with self.assertRaisesOpError("must be non-zero"):
         bijector.forward([1., 1.]).eval()
 
   def testBijectorForwardInverseEventDimsZero(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus()
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
@@ -58,7 +58,7 @@ class SoftplusBijectorTest(test.TestCase):
       self.assertAllClose(x, bijector.inverse(y).eval())
 
   def testBijectorForwardInverseWithHingeSoftnessEventDimsZero(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus(hinge_softness=1.5)
       x = 2 * rng.randn(2, 10)
       y = 1.5 * self._softplus(x / 1.5)
@@ -67,7 +67,7 @@ class SoftplusBijectorTest(test.TestCase):
       self.assertAllClose(x, bijector.inverse(y).eval())
 
   def testBijectorLogDetJacobianEventDimsZero(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus()
       y = 2 * rng.rand(2, 10)
       # No reduction needed if event_dims = 0.
@@ -77,7 +77,7 @@ class SoftplusBijectorTest(test.TestCase):
           y, event_ndims=0).eval())
 
   def testBijectorForwardInverseEventDimsOne(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus()
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
@@ -87,7 +87,7 @@ class SoftplusBijectorTest(test.TestCase):
       self.assertAllClose(x, bijector.inverse(y).eval())
 
   def testBijectorLogDetJacobianEventDimsOne(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus()
       y = 2 * rng.rand(2, 10)
       ildj_before = self._softplus_ildj_before_reduction(y)
@@ -97,25 +97,25 @@ class SoftplusBijectorTest(test.TestCase):
           y, event_ndims=1).eval())
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus()
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testScalarCongruencyWithPositiveHingeSoftness(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus(hinge_softness=1.3)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testScalarCongruencyWithNegativeHingeSoftness(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus(hinge_softness=-1.3)
       assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testBijectiveAndFinite32bit(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus()
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
@@ -123,7 +123,7 @@ class SoftplusBijectorTest(test.TestCase):
           bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFiniteWithPositiveHingeSoftness32Bit(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus(hinge_softness=1.23)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
@@ -131,7 +131,7 @@ class SoftplusBijectorTest(test.TestCase):
           bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFiniteWithNegativeHingeSoftness32Bit(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus(hinge_softness=-0.7)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = -np.logspace(-10, 10, 100).astype(np.float32)
@@ -139,7 +139,7 @@ class SoftplusBijectorTest(test.TestCase):
           bijector, x, y, event_ndims=0, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFinite16bit(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Softplus()
       # softplus(-20) is zero, so we can't use such a large range as in 32bit.
       x = np.linspace(-10., 20., 100).astype(np.float16)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
index d0098c3c10..8dad80aa64 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softsign_test.py
@@ -43,16 +43,15 @@ class SoftsignBijectorTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testBijectorBounds(self):
     bijector = Softsign(validate_args=True)
-    with self.test_session():
-      with self.assertRaisesOpError("greater than -1"):
-        bijector.inverse(-3.).eval()
-      with self.assertRaisesOpError("greater than -1"):
-        bijector.inverse_log_det_jacobian(-3., event_ndims=0).eval()
-
-      with self.assertRaisesOpError("less than 1"):
-        bijector.inverse(3.).eval()
-      with self.assertRaisesOpError("less than 1"):
-        bijector.inverse_log_det_jacobian(3., event_ndims=0).eval()
+    with self.assertRaisesOpError("greater than -1"):
+      self.evaluate(bijector.inverse(-3.))
+    with self.assertRaisesOpError("greater than -1"):
+      self.evaluate(bijector.inverse_log_det_jacobian(-3., event_ndims=0))
+
+    with self.assertRaisesOpError("less than 1"):
+      self.evaluate(bijector.inverse(3.))
+    with self.assertRaisesOpError("less than 1"):
+      self.evaluate(bijector.inverse_log_det_jacobian(3., event_ndims=0))
 
   @test_util.run_in_graph_and_eager_modes
   def testBijectorForwardInverse(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
index 30c7a738c3..e5550cc830 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/square_test.py
@@ -29,7 +29,7 @@ class SquareBijectorTest(test.TestCase):
   """Tests the correctness of the Y = X ** 2 transformation."""
 
   def testBijectorScalar(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = bijectors.Square(validate_args=True)
       self.assertEqual("square", bijector.name)
       x = [[[1., 5],
@@ -50,7 +50,7 @@ class SquareBijectorTest(test.TestCase):
           rtol=1e-7)
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = bijectors.Square(validate_args=True)
       assert_scalar_congruency(bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
index f57adcda89..424eb58fa0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/weibull_test.py
@@ -31,7 +31,7 @@ class WeibullBijectorTest(test.TestCase):
   """Tests correctness of the weibull bijector."""
 
   def testBijector(self):
-    with self.test_session():
+    with self.cached_session():
       scale = 5.
       concentration = 0.3
       bijector = Weibull(
@@ -54,13 +54,13 @@ class WeibullBijectorTest(test.TestCase):
           atol=0.)
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       assert_scalar_congruency(
           Weibull(scale=20., concentration=0.3),
           lower_x=1., upper_x=100., rtol=0.02)
 
   def testBijectiveAndFinite(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = Weibull(
           scale=20., concentration=2., validate_args=True)
       x = np.linspace(1., 8., num=10).astype(np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py
index d30f6e418d..c317393fbc 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 class BinomialTest(test.TestCase):
 
   def testSimpleShapes(self):
-    with self.test_session():
+    with self.cached_session():
       p = np.float32(np.random.beta(1, 1))
       binom = binomial.Binomial(total_count=1., probs=p)
       self.assertAllEqual([], binom.event_shape_tensor().eval())
@@ -37,7 +37,7 @@ class BinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([]), binom.batch_shape)
 
   def testComplexShapes(self):
-    with self.test_session():
+    with self.cached_session():
       p = np.random.beta(1, 1, size=(3, 2)).astype(np.float32)
       n = [[3., 2], [4, 5], [6, 7]]
       binom = binomial.Binomial(total_count=n, probs=p)
@@ -50,14 +50,14 @@ class BinomialTest(test.TestCase):
   def testNProperty(self):
     p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
     n = [[3.], [4]]
-    with self.test_session():
+    with self.cached_session():
       binom = binomial.Binomial(total_count=n, probs=p)
       self.assertEqual((2, 1), binom.total_count.get_shape())
       self.assertAllClose(n, binom.total_count.eval())
 
   def testPProperty(self):
     p = [[0.1, 0.2, 0.7]]
-    with self.test_session():
+    with self.cached_session():
       binom = binomial.Binomial(total_count=3., probs=p)
       self.assertEqual((1, 3), binom.probs.get_shape())
       self.assertEqual((1, 3), binom.logits.get_shape())
@@ -65,7 +65,7 @@ class BinomialTest(test.TestCase):
 
   def testLogitsProperty(self):
     logits = [[0., 9., -0.5]]
-    with self.test_session():
+    with self.cached_session():
       binom = binomial.Binomial(total_count=3., logits=logits)
       self.assertEqual((1, 3), binom.probs.get_shape())
       self.assertEqual((1, 3), binom.logits.get_shape())
@@ -74,7 +74,7 @@ class BinomialTest(test.TestCase):
   def testPmfAndCdfNandCountsAgree(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
-    with self.test_session():
+    with self.cached_session():
       binom = binomial.Binomial(total_count=n, probs=p, validate_args=True)
       binom.prob([2., 3, 2]).eval()
       binom.prob([3., 1, 2]).eval()
@@ -92,7 +92,7 @@ class BinomialTest(test.TestCase):
   def testPmfAndCdfNonIntegerCounts(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
-    with self.test_session():
+    with self.cached_session():
       # No errors with integer n.
       binom = binomial.Binomial(total_count=n, probs=p, validate_args=True)
       binom.prob([2., 3, 2]).eval()
@@ -116,7 +116,7 @@ class BinomialTest(test.TestCase):
       binom.cdf([1.0, 2.5, 1.5]).eval()
 
   def testPmfAndCdfBothZeroBatches(self):
-    with self.test_session():
+    with self.cached_session():
       # Both zero-batches.  No broadcast
       p = 0.5
       counts = 1.
@@ -129,7 +129,7 @@ class BinomialTest(test.TestCase):
       self.assertEqual((), cdf.get_shape())
 
   def testPmfAndCdfBothZeroBatchesNontrivialN(self):
-    with self.test_session():
+    with self.cached_session():
       # Both zero-batches.  No broadcast
       p = 0.1
       counts = 3.
@@ -142,7 +142,7 @@ class BinomialTest(test.TestCase):
       self.assertEqual((), cdf.get_shape())
 
   def testPmfAndCdfPStretchedInBroadcastWhenSameRank(self):
-    with self.test_session():
+    with self.cached_session():
       p = [[0.1, 0.9]]
       counts = [[1., 2.]]
       binom = binomial.Binomial(total_count=3., probs=p)
@@ -154,7 +154,7 @@ class BinomialTest(test.TestCase):
       self.assertEqual((1, 2), cdf.get_shape())
 
   def testPmfAndCdfPStretchedInBroadcastWhenLowerRank(self):
-    with self.test_session():
+    with self.cached_session():
       p = [0.1, 0.4]
       counts = [[1.], [0.]]
       binom = binomial.Binomial(total_count=1., probs=p)
@@ -166,7 +166,7 @@ class BinomialTest(test.TestCase):
       self.assertEqual((2, 2), cdf.get_shape())
 
   def testBinomialMean(self):
-    with self.test_session():
+    with self.cached_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
       binom = binomial.Binomial(total_count=n, probs=p)
@@ -175,7 +175,7 @@ class BinomialTest(test.TestCase):
       self.assertAllClose(expected_means, binom.mean().eval())
 
   def testBinomialVariance(self):
-    with self.test_session():
+    with self.cached_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
       binom = binomial.Binomial(total_count=n, probs=p)
@@ -184,7 +184,7 @@ class BinomialTest(test.TestCase):
       self.assertAllClose(expected_variances, binom.variance().eval())
 
   def testBinomialMode(self):
-    with self.test_session():
+    with self.cached_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
       binom = binomial.Binomial(total_count=n, probs=p)
@@ -193,7 +193,7 @@ class BinomialTest(test.TestCase):
       self.assertAllClose(expected_modes, binom.mode().eval())
 
   def testBinomialMultipleMode(self):
-    with self.test_session():
+    with self.cached_session():
       n = 9.
       p = [0.1, 0.2, 0.7]
       binom = binomial.Binomial(total_count=n, probs=p)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
index 73747db31c..4411d6f461 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
@@ -56,7 +56,7 @@ class CauchyTest(test.TestCase):
     self.assertAllEqual(all_true, is_finite)
 
   def _testParamShapes(self, sample_shape, expected):
-    with self.test_session():
+    with self.cached_session():
       param_shapes = cauchy_lib.Cauchy.param_shapes(sample_shape)
       loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
       self.assertAllEqual(expected, loc_shape.eval())
@@ -85,7 +85,7 @@ class CauchyTest(test.TestCase):
         tensor_shape.TensorShape(sample_shape), sample_shape)
 
   def testCauchyLogPDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       loc = constant_op.constant([3.0] * batch_size)
       scale = constant_op.constant([np.sqrt(10.0)] * batch_size)
@@ -112,7 +112,7 @@ class CauchyTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
 
   def testCauchyLogPDFMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       loc = constant_op.constant([[3.0, -3.0]] * batch_size)
       scale = constant_op.constant(
@@ -144,7 +144,7 @@ class CauchyTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
 
   def testCauchyCDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 50
       loc = self._rng.randn(batch_size)
       scale = self._rng.rand(batch_size) + 1.0
@@ -162,7 +162,7 @@ class CauchyTest(test.TestCase):
       self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
 
   def testCauchySurvivalFunction(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 50
       loc = self._rng.randn(batch_size)
       scale = self._rng.rand(batch_size) + 1.0
@@ -181,7 +181,7 @@ class CauchyTest(test.TestCase):
       self.assertAllClose(expected_sf, sf.eval(), atol=0)
 
   def testCauchyLogCDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 50
       loc = self._rng.randn(batch_size)
       scale = self._rng.rand(batch_size) + 1.0
@@ -214,14 +214,14 @@ class CauchyTest(test.TestCase):
         ]:
           value = func(x)
           grads = gradients_impl.gradients(value, [loc, scale])
-          with self.test_session(graph=g):
+          with self.session(graph=g):
             variables.global_variables_initializer().run()
             self.assertAllFinite(value)
             self.assertAllFinite(grads[0])
             self.assertAllFinite(grads[1])
 
   def testCauchyLogSurvivalFunction(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 50
       loc = self._rng.randn(batch_size)
       scale = self._rng.rand(batch_size) + 1.0
@@ -241,7 +241,7 @@ class CauchyTest(test.TestCase):
       self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
 
   def testCauchyEntropy(self):
-    with self.test_session():
+    with self.cached_session():
       loc = np.array([1.0, 1.0, 1.0])
       scale = np.array([[1.0, 2.0, 3.0]])
       cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
@@ -259,7 +259,7 @@ class CauchyTest(test.TestCase):
       self.assertAllClose(expected_entropy, entropy.eval())
 
   def testCauchyMode(self):
-    with self.test_session():
+    with self.cached_session():
       # Mu will be broadcast to [7, 7, 7].
       loc = [7.]
       scale = [11., 12., 13.]
@@ -270,7 +270,7 @@ class CauchyTest(test.TestCase):
       self.assertAllEqual([7., 7, 7], cauchy.mode().eval())
 
   def testCauchyMean(self):
-    with self.test_session():
+    with self.cached_session():
       loc = [1., 2., 3.]
       scale = [7.]
       cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
@@ -279,7 +279,7 @@ class CauchyTest(test.TestCase):
       self.assertAllEqual([np.nan] * 3, cauchy.mean().eval())
 
   def testCauchyNanMean(self):
-    with self.test_session():
+    with self.cached_session():
       loc = [1., 2., 3.]
       scale = [7.]
       cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
@@ -288,7 +288,7 @@ class CauchyTest(test.TestCase):
         cauchy.mean().eval()
 
   def testCauchyQuantile(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 50
       loc = self._rng.randn(batch_size)
       scale = self._rng.rand(batch_size) + 1.0
@@ -308,7 +308,7 @@ class CauchyTest(test.TestCase):
       self.assertAllClose(expected_x, x.eval(), atol=0.)
 
   def testCauchyVariance(self):
-    with self.test_session():
+    with self.cached_session():
       # scale will be broadcast to [7, 7, 7]
       loc = [1., 2., 3.]
       scale = [7.]
@@ -318,7 +318,7 @@ class CauchyTest(test.TestCase):
       self.assertAllEqual([np.nan] * 3, cauchy.variance().eval())
 
   def testCauchyNanVariance(self):
-    with self.test_session():
+    with self.cached_session():
       # scale will be broadcast to [7, 7, 7]
       loc = [1., 2., 3.]
       scale = [7.]
@@ -328,7 +328,7 @@ class CauchyTest(test.TestCase):
         cauchy.variance().eval()
 
   def testCauchyStandardDeviation(self):
-    with self.test_session():
+    with self.cached_session():
       # scale will be broadcast to [7, 7, 7]
       loc = [1., 2., 3.]
       scale = [7.]
@@ -338,7 +338,7 @@ class CauchyTest(test.TestCase):
       self.assertAllEqual([np.nan] * 3, cauchy.stddev().eval())
 
   def testCauchyNanStandardDeviation(self):
-    with self.test_session():
+    with self.cached_session():
       # scale will be broadcast to [7, 7, 7]
       loc = [1., 2., 3.]
       scale = [7.]
@@ -348,7 +348,7 @@ class CauchyTest(test.TestCase):
         cauchy.stddev().eval()
 
   def testCauchySample(self):
-    with self.test_session():
+    with self.cached_session():
       loc = constant_op.constant(3.0)
       scale = constant_op.constant(1.0)
       loc_v = 3.0
@@ -373,7 +373,7 @@ class CauchyTest(test.TestCase):
       self.assertAllEqual(expected_shape, sample_values.shape)
 
   def testCauchySampleMultiDimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 2
       loc = constant_op.constant([[3.0, -3.0]] * batch_size)
       scale = constant_op.constant([[0.5, 1.0]] * batch_size)
@@ -399,13 +399,13 @@ class CauchyTest(test.TestCase):
       self.assertAllEqual(expected_shape, sample_values.shape)
 
   def testCauchyNegativeLocFails(self):
-    with self.test_session():
+    with self.cached_session():
       cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True)
       with self.assertRaisesOpError("Condition x > 0 did not hold"):
         cauchy.mode().eval()
 
   def testCauchyShape(self):
-    with self.test_session():
+    with self.cached_session():
       loc = constant_op.constant([-3.0] * 5)
       scale = constant_op.constant(11.0)
       cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
@@ -420,7 +420,7 @@ class CauchyTest(test.TestCase):
     scale = array_ops.placeholder(dtype=dtypes.float32)
     cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # get_batch_shape should return an "<unknown>" tensor.
       self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape(None))
       self.assertEqual(cauchy.event_shape, ())
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/chi2_test.py b/tensorflow/contrib/distributions/python/kernel_tests/chi2_test.py
index 75d48791ec..3b5a6aa90c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/chi2_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/chi2_test.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import test
 class Chi2Test(test.TestCase):
 
   def testChi2LogPDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       df = constant_op.constant([2.0] * batch_size, dtype=np.float64)
       df_v = 2.0
@@ -46,7 +46,7 @@ class Chi2Test(test.TestCase):
       self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf))
 
   def testChi2CDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       df = constant_op.constant([2.0] * batch_size, dtype=np.float64)
       df_v = 2.0
@@ -60,7 +60,7 @@ class Chi2Test(test.TestCase):
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testChi2Mean(self):
-    with self.test_session():
+    with self.cached_session():
       df_v = np.array([1., 3, 5], dtype=np.float64)
       expected_mean = stats.chi2.mean(df_v)
       chi2 = chi2_lib.Chi2(df=df_v)
@@ -68,7 +68,7 @@ class Chi2Test(test.TestCase):
       self.assertAllClose(chi2.mean().eval(), expected_mean)
 
   def testChi2Variance(self):
-    with self.test_session():
+    with self.cached_session():
       df_v = np.array([1., 3, 5], np.float64)
       expected_variances = stats.chi2.var(df_v)
       chi2 = chi2_lib.Chi2(df=df_v)
@@ -76,7 +76,7 @@ class Chi2Test(test.TestCase):
       self.assertAllClose(chi2.variance().eval(), expected_variances)
 
   def testChi2Entropy(self):
-    with self.test_session():
+    with self.cached_session():
       df_v = np.array([1., 3, 5], dtype=np.float64)
       expected_entropy = stats.chi2.entropy(df_v)
       chi2 = chi2_lib.Chi2(df=df_v)
@@ -84,7 +84,7 @@ class Chi2Test(test.TestCase):
       self.assertAllClose(chi2.entropy().eval(), expected_entropy)
 
   def testChi2WithAbsDf(self):
-    with self.test_session():
+    with self.cached_session():
       df_v = np.array([-1.3, -3.2, 5], dtype=np.float64)
       chi2 = chi2_lib.Chi2WithAbsDf(df=df_v)
       self.assertAllClose(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
index 4e8989b6c2..7e63b5ca5f 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
@@ -69,7 +69,7 @@ class ConditionalTransformedDistributionTest(
     return ds.ConditionalTransformedDistribution
 
   def testConditioning(self):
-    with self.test_session():
+    with self.cached_session():
       conditional_normal = ds.ConditionalTransformedDistribution(
           distribution=ds.Normal(loc=0., scale=1.),
           bijector=_ChooseLocation(loc=[-100., 100.]))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py
index 90910f3839..36fc7a70c8 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py
@@ -29,7 +29,7 @@ rng = np.random.RandomState(0)
 class DeterministicTest(test.TestCase):
 
   def testShape(self):
-    with self.test_session():
+    with self.cached_session():
       loc = rng.rand(2, 3, 4)
       deterministic = deterministic_lib.Deterministic(loc)
 
@@ -42,20 +42,20 @@ class DeterministicTest(test.TestCase):
     loc = rng.rand(2, 3, 4).astype(np.float32)
     deterministic = deterministic_lib.Deterministic(
         loc, atol=-1, validate_args=True)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("Condition x >= 0"):
         deterministic.prob(0.).eval()
 
   def testProbWithNoBatchDimsIntegerType(self):
     deterministic = deterministic_lib.Deterministic(0)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(1, deterministic.prob(0).eval())
       self.assertAllClose(0, deterministic.prob(2).eval())
       self.assertAllClose([1, 0], deterministic.prob([0, 2]).eval())
 
   def testProbWithNoBatchDims(self):
     deterministic = deterministic_lib.Deterministic(0.)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(1., deterministic.prob(0.).eval())
       self.assertAllClose(0., deterministic.prob(2.).eval())
       self.assertAllClose([1., 0.], deterministic.prob([0., 2.]).eval())
@@ -65,7 +65,7 @@ class DeterministicTest(test.TestCase):
     x = [[0., 1.1], [1.99, 3.]]
     deterministic = deterministic_lib.Deterministic(loc)
     expected_prob = [[1., 0.], [0., 1.]]
-    with self.test_session():
+    with self.cached_session():
       prob = deterministic.prob(x)
       self.assertAllEqual((2, 2), prob.get_shape())
       self.assertAllEqual(expected_prob, prob.eval())
@@ -75,7 +75,7 @@ class DeterministicTest(test.TestCase):
     x = [[0., 1.1], [1.99, 3.]]
     deterministic = deterministic_lib.Deterministic(loc, atol=0.05)
     expected_prob = [[1., 0.], [1., 1.]]
-    with self.test_session():
+    with self.cached_session():
       prob = deterministic.prob(x)
       self.assertAllEqual((2, 2), prob.get_shape())
       self.assertAllEqual(expected_prob, prob.eval())
@@ -85,7 +85,7 @@ class DeterministicTest(test.TestCase):
     x = [[0, 2], [4, 2]]
     deterministic = deterministic_lib.Deterministic(loc, atol=1)
     expected_prob = [[1, 1], [0, 1]]
-    with self.test_session():
+    with self.cached_session():
       prob = deterministic.prob(x)
       self.assertAllEqual((2, 2), prob.get_shape())
       self.assertAllEqual(expected_prob, prob.eval())
@@ -95,7 +95,7 @@ class DeterministicTest(test.TestCase):
     x = [[0., 1.1], [100.1, 103.]]
     deterministic = deterministic_lib.Deterministic(loc, rtol=0.01)
     expected_prob = [[1., 0.], [1., 0.]]
-    with self.test_session():
+    with self.cached_session():
       prob = deterministic.prob(x)
       self.assertAllEqual((2, 2), prob.get_shape())
       self.assertAllEqual(expected_prob, prob.eval())
@@ -107,7 +107,7 @@ class DeterministicTest(test.TestCase):
     # Batch 1 will have rtol = 1 (100% slack allowed)
     deterministic = deterministic_lib.Deterministic(loc, rtol=[[0], [1]])
     expected_prob = [[1, 0, 0], [1, 1, 0]]
-    with self.test_session():
+    with self.cached_session():
       prob = deterministic.prob(x)
       self.assertAllEqual((2, 3), prob.get_shape())
       self.assertAllEqual(expected_prob, prob.eval())
@@ -117,7 +117,7 @@ class DeterministicTest(test.TestCase):
     x = [[-1., -0.1], [-0.01, 1.000001]]
     deterministic = deterministic_lib.Deterministic(loc)
     expected_cdf = [[0., 0.], [0., 1.]]
-    with self.test_session():
+    with self.cached_session():
       cdf = deterministic.cdf(x)
       self.assertAllEqual((2, 2), cdf.get_shape())
       self.assertAllEqual(expected_cdf, cdf.eval())
@@ -127,7 +127,7 @@ class DeterministicTest(test.TestCase):
     x = [[-1., -0.1], [-0.01, 1.000001]]
     deterministic = deterministic_lib.Deterministic(loc, atol=0.05)
     expected_cdf = [[0., 0.], [1., 1.]]
-    with self.test_session():
+    with self.cached_session():
       cdf = deterministic.cdf(x)
       self.assertAllEqual((2, 2), cdf.get_shape())
       self.assertAllEqual(expected_cdf, cdf.eval())
@@ -137,7 +137,7 @@ class DeterministicTest(test.TestCase):
     x = [[0.9, 1.], [99.9, 97]]
     deterministic = deterministic_lib.Deterministic(loc, rtol=0.01)
     expected_cdf = [[0., 1.], [1., 0.]]
-    with self.test_session():
+    with self.cached_session():
       cdf = deterministic.cdf(x)
       self.assertAllEqual((2, 2), cdf.get_shape())
       self.assertAllEqual(expected_cdf, cdf.eval())
@@ -145,7 +145,7 @@ class DeterministicTest(test.TestCase):
   def testSampleNoBatchDims(self):
     deterministic = deterministic_lib.Deterministic(0.)
     for sample_shape in [(), (4,)]:
-      with self.test_session():
+      with self.cached_session():
         sample = deterministic.sample(sample_shape)
         self.assertAllEqual(sample_shape, sample.get_shape())
         self.assertAllClose(
@@ -154,7 +154,7 @@ class DeterministicTest(test.TestCase):
   def testSampleWithBatchDims(self):
     deterministic = deterministic_lib.Deterministic([0., 0.])
     for sample_shape in [(), (4,)]:
-      with self.test_session():
+      with self.cached_session():
         sample = deterministic.sample(sample_shape)
         self.assertAllEqual(sample_shape + (2,), sample.get_shape())
         self.assertAllClose(
@@ -166,18 +166,25 @@ class DeterministicTest(test.TestCase):
 
     deterministic = deterministic_lib.Deterministic(loc)
     for sample_shape_ in [(), (4,)]:
-      with self.test_session():
+      with self.cached_session():
         sample_ = deterministic.sample(sample_shape).eval(
             feed_dict={loc: [0., 0.],
                        sample_shape: sample_shape_})
         self.assertAllClose(
             np.zeros(sample_shape_ + (2,)).astype(np.float32), sample_)
 
+  def testEntropy(self):
+    loc = np.array([-0.1, -3.2, 7.])
+    deterministic = deterministic_lib.Deterministic(loc=loc)
+    with self.cached_session() as sess:
+      entropy_ = sess.run(deterministic.entropy())
+      self.assertAllEqual(np.zeros(3), entropy_)
+
 
 class VectorDeterministicTest(test.TestCase):
 
   def testShape(self):
-    with self.test_session():
+    with self.cached_session():
       loc = rng.rand(2, 3, 4)
       deterministic = deterministic_lib.VectorDeterministic(loc)
 
@@ -190,7 +197,7 @@ class VectorDeterministicTest(test.TestCase):
     loc = rng.rand(2, 3, 4).astype(np.float32)
     deterministic = deterministic_lib.VectorDeterministic(
         loc, atol=-1, validate_args=True)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("Condition x >= 0"):
         deterministic.prob(loc).eval()
 
@@ -198,14 +205,14 @@ class VectorDeterministicTest(test.TestCase):
     loc = rng.rand(2, 3, 4).astype(np.float32)
     deterministic = deterministic_lib.VectorDeterministic(
         loc, atol=-1, validate_args=True)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesRegexp(ValueError, "must have rank at least 1"):
         deterministic.prob(0.).eval()
 
   def testProbVectorDeterministicWithNoBatchDims(self):
     # 0 batch of deterministics on R^1.
     deterministic = deterministic_lib.VectorDeterministic([0.])
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(1., deterministic.prob([0.]).eval())
       self.assertAllClose(0., deterministic.prob([2.]).eval())
       self.assertAllClose([1., 0.], deterministic.prob([[0.], [2.]]).eval())
@@ -216,7 +223,7 @@ class VectorDeterministicTest(test.TestCase):
     x = [[0., 1.], [1.9, 3.], [3.99, 5.]]
     deterministic = deterministic_lib.VectorDeterministic(loc)
     expected_prob = [1., 0., 0.]
-    with self.test_session():
+    with self.cached_session():
       prob = deterministic.prob(x)
       self.assertAllEqual((3,), prob.get_shape())
       self.assertAllEqual(expected_prob, prob.eval())
@@ -227,7 +234,7 @@ class VectorDeterministicTest(test.TestCase):
     x = [[0., 1.], [1.9, 3.], [3.99, 5.]]
     deterministic = deterministic_lib.VectorDeterministic(loc, atol=0.05)
     expected_prob = [1., 0., 1.]
-    with self.test_session():
+    with self.cached_session():
       prob = deterministic.prob(x)
       self.assertAllEqual((3,), prob.get_shape())
       self.assertAllEqual(expected_prob, prob.eval())
@@ -238,7 +245,7 @@ class VectorDeterministicTest(test.TestCase):
     x = [[0., 1.], [0.9, 1.], [99.9, 100.1]]
     deterministic = deterministic_lib.VectorDeterministic(loc, rtol=0.01)
     expected_prob = [1., 0., 1.]
-    with self.test_session():
+    with self.cached_session():
       prob = deterministic.prob(x)
       self.assertAllEqual((3,), prob.get_shape())
       self.assertAllEqual(expected_prob, prob.eval())
@@ -247,7 +254,7 @@ class VectorDeterministicTest(test.TestCase):
     # 0 batch of deterministics on R^0.
     deterministic = deterministic_lib.VectorDeterministic(
         [], validate_args=True)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(1., deterministic.prob([]).eval())
 
   def testProbVectorDeterministicWithNoBatchDimsOnRZeroRaisesIfXNotInSameRk(
@@ -255,14 +262,14 @@ class VectorDeterministicTest(test.TestCase):
     # 0 batch of deterministics on R^0.
     deterministic = deterministic_lib.VectorDeterministic(
         [], validate_args=True)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("not defined in the same space"):
         deterministic.prob([1.]).eval()
 
   def testSampleNoBatchDims(self):
     deterministic = deterministic_lib.VectorDeterministic([0.])
     for sample_shape in [(), (4,)]:
-      with self.test_session():
+      with self.cached_session():
         sample = deterministic.sample(sample_shape)
         self.assertAllEqual(sample_shape + (1,), sample.get_shape())
         self.assertAllClose(
@@ -271,7 +278,7 @@ class VectorDeterministicTest(test.TestCase):
   def testSampleWithBatchDims(self):
     deterministic = deterministic_lib.VectorDeterministic([[0.], [0.]])
     for sample_shape in [(), (4,)]:
-      with self.test_session():
+      with self.cached_session():
         sample = deterministic.sample(sample_shape)
         self.assertAllEqual(sample_shape + (2, 1), sample.get_shape())
         self.assertAllClose(
@@ -283,13 +290,20 @@ class VectorDeterministicTest(test.TestCase):
 
     deterministic = deterministic_lib.VectorDeterministic(loc)
     for sample_shape_ in [(), (4,)]:
-      with self.test_session():
+      with self.cached_session():
         sample_ = deterministic.sample(sample_shape).eval(
             feed_dict={loc: [[0.], [0.]],
                        sample_shape: sample_shape_})
         self.assertAllClose(
             np.zeros(sample_shape_ + (2, 1)).astype(np.float32), sample_)
 
+  def testEntropy(self):
+    loc = np.array([[8.3, 1.2, 3.3], [-0.1, -3.2, 7.]])
+    deterministic = deterministic_lib.VectorDeterministic(loc=loc)
+    with self.cached_session() as sess:
+      entropy_ = sess.run(deterministic.entropy())
+      self.assertAllEqual(np.zeros(2), entropy_)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
index f42feae25d..f073f51a69 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
@@ -47,7 +47,7 @@ class DistributionTest(test.TestCase):
     ]
 
     sample_shapes = [(), (10,), (10, 20, 30)]
-    with self.test_session():
+    with self.cached_session():
       for cls in classes:
         for sample_shape in sample_shapes:
           param_shapes = cls.param_shapes(sample_shape)
@@ -62,7 +62,7 @@ class DistributionTest(test.TestCase):
           self.assertEqual(dist.parameters, dist_copy.parameters)
 
   def testCopyExtraArgs(self):
-    with self.test_session():
+    with self.cached_session():
       # Note: we cannot easily test all distributions since each requires
       # different initialization arguments. We therefore spot test a few.
       normal = tfd.Normal(loc=1., scale=2., validate_args=True)
@@ -72,7 +72,7 @@ class DistributionTest(test.TestCase):
       self.assertEqual(wishart.parameters, wishart.copy().parameters)
 
   def testCopyOverride(self):
-    with self.test_session():
+    with self.cached_session():
       normal = tfd.Normal(loc=1., scale=2., validate_args=True)
       unused_normal_copy = normal.copy(validate_args=False)
       base_params = normal.parameters.copy()
@@ -82,7 +82,7 @@ class DistributionTest(test.TestCase):
       self.assertEqual(base_params, copy_params)
 
   def testIsScalar(self):
-    with self.test_session():
+    with self.cached_session():
       mu = 1.
       sigma = 2.
 
@@ -152,7 +152,7 @@ class DistributionTest(test.TestCase):
   def testSampleShapeHints(self):
     fake_distribution = self._GetFakeDistribution()
 
-    with self.test_session():
+    with self.cached_session():
       # Make a new session since we're playing with static shapes. [And below.]
       x = array_ops.placeholder(dtype=dtypes.float32)
       dist = fake_distribution(batch_shape=[2, 3], event_shape=[5])
@@ -162,28 +162,28 @@ class DistributionTest(test.TestCase):
       # unknown values, ie, Dimension(None).
       self.assertAllEqual([6, 7, 2, 3, 5], y.get_shape().as_list())
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(dtype=dtypes.float32)
       dist = fake_distribution(batch_shape=[None, 3], event_shape=[5])
       sample_shape = ops.convert_to_tensor([6, 7], dtype=dtypes.int32)
       y = dist._set_sample_static_shape(x, sample_shape)
       self.assertAllEqual([6, 7, None, 3, 5], y.get_shape().as_list())
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(dtype=dtypes.float32)
       dist = fake_distribution(batch_shape=[None, 3], event_shape=[None])
       sample_shape = ops.convert_to_tensor([6, 7], dtype=dtypes.int32)
       y = dist._set_sample_static_shape(x, sample_shape)
       self.assertAllEqual([6, 7, None, 3, None], y.get_shape().as_list())
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(dtype=dtypes.float32)
       dist = fake_distribution(batch_shape=None, event_shape=None)
       sample_shape = ops.convert_to_tensor([6, 7], dtype=dtypes.int32)
       y = dist._set_sample_static_shape(x, sample_shape)
       self.assertTrue(y.get_shape().ndims is None)
 
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(dtype=dtypes.float32)
       dist = fake_distribution(batch_shape=[None, 3], event_shape=None)
       sample_shape = ops.convert_to_tensor([6, 7], dtype=dtypes.int32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index 181c46d2e5..05f5d30666 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -100,7 +100,7 @@ class MakeTrilScaleTest(test.TestCase):
   def _testLegalInputs(
       self, loc=None, shape_hint=None, scale_params=None):
     for args in _powerset(scale_params.items()):
-      with self.test_session():
+      with self.cached_session():
         args = dict(args)
 
         scale_args = dict({
@@ -143,19 +143,19 @@ class MakeTrilScaleTest(test.TestCase):
         })
 
   def testZeroTriU(self):
-    with self.test_session():
+    with self.cached_session():
       scale = distribution_util.make_tril_scale(scale_tril=[[1., 1], [1., 1.]])
       self.assertAllClose([[1., 0], [1., 1.]], scale.to_dense().eval())
 
   def testValidateArgs(self):
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("diagonal part must be non-zero"):
         scale = distribution_util.make_tril_scale(
             scale_tril=[[0., 1], [1., 1.]], validate_args=True)
         scale.to_dense().eval()
 
   def testAssertPositive(self):
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("diagonal part must be positive"):
         scale = distribution_util.make_tril_scale(
             scale_tril=[[-1., 1], [1., 1.]],
@@ -169,7 +169,7 @@ class MakeDiagScaleTest(test.TestCase):
   def _testLegalInputs(
       self, loc=None, shape_hint=None, scale_params=None):
     for args in _powerset(scale_params.items()):
-      with self.test_session():
+      with self.cached_session():
         args = dict(args)
 
         scale_args = dict({
@@ -204,14 +204,14 @@ class MakeDiagScaleTest(test.TestCase):
         })
 
   def testValidateArgs(self):
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("diagonal part must be non-zero"):
         scale = distribution_util.make_diag_scale(
             scale_diag=[[0., 1], [1., 1.]], validate_args=True)
         scale.to_dense().eval()
 
   def testAssertPositive(self):
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("diagonal part must be positive"):
         scale = distribution_util.make_diag_scale(
             scale_diag=[[-1., 1], [1., 1.]],
@@ -241,7 +241,7 @@ class ShapesFromLocAndScaleTest(test.TestCase):
     loc = constant_op.constant(np.zeros((2, 3)))
     diag = array_ops.placeholder(dtypes.float64)
     scale = linear_operator_diag.LinearOperatorDiag(diag)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_shape, event_shape = sess.run(
           distribution_util.shapes_from_loc_and_scale(loc, scale),
           feed_dict={diag: np.ones((5, 1, 3))})
@@ -252,7 +252,7 @@ class ShapesFromLocAndScaleTest(test.TestCase):
     loc = array_ops.placeholder(dtypes.float64)
     diag = constant_op.constant(np.ones((5, 2, 3)))
     scale = linear_operator_diag.LinearOperatorDiag(diag)
-    with self.test_session():
+    with self.cached_session():
       batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
           loc, scale)
       # batch_shape depends on both args, and so is dynamic.  Since loc did not
@@ -266,7 +266,7 @@ class ShapesFromLocAndScaleTest(test.TestCase):
     loc = array_ops.placeholder(dtypes.float64)
     diag = array_ops.placeholder(dtypes.float64)
     scale = linear_operator_diag.LinearOperatorDiag(diag)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_shape, event_shape = sess.run(
           distribution_util.shapes_from_loc_and_scale(loc, scale),
           feed_dict={diag: np.ones((5, 2, 3)), loc: np.zeros((2, 3))})
@@ -286,7 +286,7 @@ class ShapesFromLocAndScaleTest(test.TestCase):
     loc = None
     diag = array_ops.placeholder(dtypes.float64)
     scale = linear_operator_diag.LinearOperatorDiag(diag)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_shape, event_shape = sess.run(
           distribution_util.shapes_from_loc_and_scale(loc, scale),
           feed_dict={diag: np.ones((5, 1, 3))})
@@ -307,7 +307,7 @@ class GetBroadcastShapeTest(test.TestCase):
     x = array_ops.ones((2, 1, 3))
     y = array_ops.placeholder(x.dtype)
     z = array_ops.ones(())
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       bcast_shape = sess.run(
           distribution_util.get_broadcast_shape(x, y, z),
           feed_dict={y: np.ones((1, 5, 3)).astype(np.float32)})
@@ -317,7 +317,7 @@ class GetBroadcastShapeTest(test.TestCase):
 class TridiagTest(test.TestCase):
 
   def testWorksCorrectlyNoBatches(self):
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           [[4., 8., 0., 0.],
            [1., 5., 9., 0.],
@@ -329,7 +329,7 @@ class TridiagTest(test.TestCase):
               [8., 9., 10.]).eval())
 
   def testWorksCorrectlyBatches(self):
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(
           [[[4., 8., 0., 0.],
             [1., 5., 9., 0.],
@@ -349,7 +349,7 @@ class TridiagTest(test.TestCase):
           rtol=1e-5, atol=0.)
 
   def testHandlesNone(self):
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(
           [[[4., 0., 0., 0.],
             [0., 5., 0., 0.],
@@ -396,7 +396,7 @@ class MixtureStddevTest(test.TestCase):
                                                means_tf,
                                                sigmas_tf)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_devs = sess.run(mix_dev)
 
     self.assertAllClose(actual_devs, expected_devs)
@@ -405,7 +405,7 @@ class MixtureStddevTest(test.TestCase):
 class PadMixtureDimensionsTest(test.TestCase):
 
   def test_pad_mixture_dimensions_mixture(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       gm = mixture.Mixture(
           cat=categorical.Categorical(probs=[[0.3, 0.7]]),
           components=[
@@ -422,7 +422,7 @@ class PadMixtureDimensionsTest(test.TestCase):
     self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1]))
 
   def test_pad_mixture_dimensions_mixture_same_family(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       gm = mixture_same_family.MixtureSameFamily(
           mixture_distribution=categorical.Categorical(probs=[0.3, 0.7]),
           components_distribution=mvn_diag.MultivariateNormalDiag(
@@ -444,7 +444,7 @@ class _PadTest(object):
                      [4, 5, 6]])
     value_ = np.float32(0.25)
     count_ = np.int32(2)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder_with_default(
           x_, shape=x_.shape if self.is_static_shape else None)
       value = (constant_op.constant(value_) if self.is_static_shape
@@ -491,7 +491,7 @@ class _PadTest(object):
                      [4, 5, 6]])
     value_ = np.float32(0.25)
     count_ = np.int32(2)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder_with_default(
           x_, shape=x_.shape if self.is_static_shape else None)
       value = (constant_op.constant(value_) if self.is_static_shape
@@ -542,9 +542,9 @@ class PadDynamicTest(_PadTest, test.TestCase):
     return False
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class TestMoveDimension(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes
   def test_move_dimension_static_shape(self):
 
     x = random_ops.random_normal(shape=[200, 30, 4, 1, 6])
@@ -561,7 +561,6 @@ class TestMoveDimension(test.TestCase):
     x_perm = distribution_util.move_dimension(x, 4, 2)
     self.assertAllEqual(x_perm.shape.as_list(), [200, 30, 6, 4, 1])
 
-  @test_util.run_in_graph_and_eager_modes
   def test_move_dimension_dynamic_shape(self):
 
     x_ = random_ops.random_normal(shape=[200, 30, 4, 1, 6])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py b/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
index 87cdd0485a..a627d85229 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
@@ -34,7 +34,7 @@ from tensorflow.python.platform import test
 class GeometricTest(test.TestCase):
 
   def testGeometricShape(self):
-    with self.test_session():
+    with self.cached_session():
       probs = constant_op.constant([.1] * 5)
       geom = geometric.Geometric(probs=probs)
 
@@ -45,19 +45,19 @@ class GeometricTest(test.TestCase):
 
   def testInvalidP(self):
     invalid_ps = [-.01, -0.01, -2.]
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("Condition x >= 0"):
         geom = geometric.Geometric(probs=invalid_ps, validate_args=True)
         geom.probs.eval()
 
     invalid_ps = [1.1, 3., 5.]
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("Condition x <= y"):
         geom = geometric.Geometric(probs=invalid_ps, validate_args=True)
         geom.probs.eval()
 
   def testGeomLogPmf(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = constant_op.constant([.2] * batch_size)
       probs_v = .2
@@ -73,7 +73,7 @@ class GeometricTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_prob), pmf.eval())
 
   def testGeometricLogPmf_validate_args(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = constant_op.constant([.9] * batch_size)
       x = array_ops.placeholder(dtypes.float32, shape=[6])
@@ -95,7 +95,7 @@ class GeometricTest(test.TestCase):
       self.assertEqual([6,], pmf.get_shape())
 
   def testGeometricLogPmfMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = constant_op.constant([[.2, .3, .5]] * batch_size)
       probs_v = np.array([.2, .3, .5])
@@ -113,7 +113,7 @@ class GeometricTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_prob), pmf_values)
 
   def testGeometricCDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = constant_op.constant([[.2, .4, .5]] * batch_size)
       probs_v = np.array([.2, .4, .5])
@@ -127,7 +127,7 @@ class GeometricTest(test.TestCase):
       self.assertAllClose(expected_cdf, cdf.eval())
 
   def testGeometricEntropy(self):
-    with self.test_session():
+    with self.cached_session():
       probs_v = np.array([.1, .3, .25], dtype=np.float32)
       geom = geometric.Geometric(probs=probs_v)
       expected_entropy = stats.geom.entropy(probs_v, loc=-1)
@@ -135,7 +135,7 @@ class GeometricTest(test.TestCase):
       self.assertAllClose(expected_entropy, geom.entropy().eval())
 
   def testGeometricMean(self):
-    with self.test_session():
+    with self.cached_session():
       probs_v = np.array([.1, .3, .25])
       geom = geometric.Geometric(probs=probs_v)
       expected_means = stats.geom.mean(probs_v, loc=-1)
@@ -143,7 +143,7 @@ class GeometricTest(test.TestCase):
       self.assertAllClose(expected_means, geom.mean().eval())
 
   def testGeometricVariance(self):
-    with self.test_session():
+    with self.cached_session():
       probs_v = np.array([.1, .3, .25])
       geom = geometric.Geometric(probs=probs_v)
       expected_vars = stats.geom.var(probs_v, loc=-1)
@@ -151,7 +151,7 @@ class GeometricTest(test.TestCase):
       self.assertAllClose(expected_vars, geom.variance().eval())
 
   def testGeometricStddev(self):
-    with self.test_session():
+    with self.cached_session():
       probs_v = np.array([.1, .3, .25])
       geom = geometric.Geometric(probs=probs_v)
       expected_stddevs = stats.geom.std(probs_v, loc=-1)
@@ -159,14 +159,14 @@ class GeometricTest(test.TestCase):
       self.assertAllClose(geom.stddev().eval(), expected_stddevs)
 
   def testGeometricMode(self):
-    with self.test_session():
+    with self.cached_session():
       probs_v = np.array([.1, .3, .25])
       geom = geometric.Geometric(probs=probs_v)
       self.assertEqual([3,], geom.mode().get_shape())
       self.assertAllClose([0.] * 3, geom.mode().eval())
 
   def testGeometricSample(self):
-    with self.test_session():
+    with self.cached_session():
       probs_v = [.3, .9]
       probs = constant_op.constant(probs_v)
       n = constant_op.constant(100000)
@@ -186,7 +186,7 @@ class GeometricTest(test.TestCase):
                             rtol=.02)
 
   def testGeometricSampleMultiDimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 2
       probs_v = [.3, .9]
       probs = constant_op.constant([probs_v] * batch_size)
@@ -215,7 +215,7 @@ class GeometricTest(test.TestCase):
                             rtol=.02)
 
   def testGeometricAtBoundary(self):
-    with self.test_session():
+    with self.cached_session():
       geom = geometric.Geometric(probs=1., validate_args=True)
 
       x = np.array([0., 2., 3., 4., 5., 6., 7.], dtype=np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
index a4e7566008..686de9d246 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
@@ -55,7 +55,7 @@ class HalfNormalTest(test.TestCase):
     self.assertAllEqual(all_true, is_finite)
 
   def _testParamShapes(self, sample_shape, expected):
-    with self.test_session():
+    with self.cached_session():
       param_shapes = hn_lib.HalfNormal.param_shapes(sample_shape)
       scale_shape = param_shapes["scale"]
       self.assertAllEqual(expected, scale_shape.eval())
@@ -87,7 +87,7 @@ class HalfNormalTest(test.TestCase):
         tensor_shape.TensorShape(sample_shape), sample_shape)
 
   def testHalfNormalLogPDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       scale = constant_op.constant([3.0] * batch_size)
       x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
@@ -106,7 +106,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
 
   def testHalfNormalLogPDFMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       scale = constant_op.constant([[3.0, 1.0]] * batch_size)
       x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
@@ -125,7 +125,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
 
   def testHalfNormalCDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 50
       scale = self._rng.rand(batch_size) + 1.0
       x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
@@ -144,7 +144,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllClose(np.exp(expected_logcdf), cdf.eval(), atol=0)
 
   def testHalfNormalSurvivalFunction(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 50
       scale = self._rng.rand(batch_size) + 1.0
       x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
@@ -163,7 +163,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllClose(np.exp(expected_logsf), sf.eval(), atol=0)
 
   def testHalfNormalQuantile(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 50
       scale = self._rng.rand(batch_size) + 1.0
       p = np.linspace(0., 1.0, batch_size).astype(np.float64)
@@ -191,13 +191,13 @@ class HalfNormalTest(test.TestCase):
           print(func.__name__)
           value = func(x)
           grads = gradients_impl.gradients(value, [scale])
-          with self.test_session(graph=g):
+          with self.session(graph=g):
             variables.global_variables_initializer().run()
             self.assertAllFinite(value)
             self.assertAllFinite(grads[0])
 
   def testHalfNormalEntropy(self):
-    with self.test_session():
+    with self.cached_session():
       scale = np.array([[1.0, 2.0, 3.0]])
       halfnorm = hn_lib.HalfNormal(scale=scale)
 
@@ -210,7 +210,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllClose(expected_entropy, entropy.eval())
 
   def testHalfNormalMeanAndMode(self):
-    with self.test_session():
+    with self.cached_session():
       scale = np.array([11., 12., 13.])
 
       halfnorm = hn_lib.HalfNormal(scale=scale)
@@ -223,7 +223,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllEqual([0., 0., 0.], halfnorm.mode().eval())
 
   def testHalfNormalVariance(self):
-    with self.test_session():
+    with self.cached_session():
       scale = np.array([7., 7., 7.])
       halfnorm = hn_lib.HalfNormal(scale=scale)
       expected_variance = scale ** 2.0 * (1.0 - 2.0 / np.pi)
@@ -232,7 +232,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllEqual(expected_variance, halfnorm.variance().eval())
 
   def testHalfNormalStandardDeviation(self):
-    with self.test_session():
+    with self.cached_session():
       scale = np.array([7., 7., 7.])
       halfnorm = hn_lib.HalfNormal(scale=scale)
       expected_variance = scale ** 2.0 * (1.0 - 2.0 / np.pi)
@@ -241,7 +241,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllEqual(np.sqrt(expected_variance), halfnorm.stddev().eval())
 
   def testHalfNormalSample(self):
-    with self.test_session():
+    with self.cached_session():
       scale = constant_op.constant(3.0)
       n = constant_op.constant(100000)
       halfnorm = hn_lib.HalfNormal(scale=scale)
@@ -263,7 +263,7 @@ class HalfNormalTest(test.TestCase):
       self.assertAllEqual(expected_shape_static, sample.eval().shape)
 
   def testHalfNormalSampleMultiDimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 2
       scale = constant_op.constant([[2.0, 3.0]] * batch_size)
       n = constant_op.constant(100000)
@@ -287,13 +287,13 @@ class HalfNormalTest(test.TestCase):
       self.assertAllEqual(expected_shape_static, sample.eval().shape)
 
   def testNegativeSigmaFails(self):
-    with self.test_session():
+    with self.cached_session():
       halfnorm = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G")
       with self.assertRaisesOpError("Condition x > 0 did not hold"):
         halfnorm.mean().eval()
 
   def testHalfNormalShape(self):
-    with self.test_session():
+    with self.cached_session():
       scale = constant_op.constant([6.0] * 5)
       halfnorm = hn_lib.HalfNormal(scale=scale)
 
@@ -306,7 +306,7 @@ class HalfNormalTest(test.TestCase):
     scale = array_ops.placeholder(dtype=dtypes.float32)
     halfnorm = hn_lib.HalfNormal(scale=scale)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # get_batch_shape should return an "<unknown>" tensor.
       self.assertEqual(halfnorm.batch_shape, tensor_shape.TensorShape(None))
       self.assertEqual(halfnorm.event_shape, ())
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/independent_test.py b/tensorflow/contrib/distributions/python/kernel_tests/independent_test.py
index 6a69f9e60b..ecf27289d7 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/independent_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/independent_test.py
@@ -52,7 +52,7 @@ class ProductDistributionTest(test.TestCase):
   def testSampleAndLogProbUnivariate(self):
     loc = np.float32([-1., 1])
     scale = np.float32([0.1, 0.5])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       ind = independent_lib.Independent(
           distribution=normal_lib.Normal(loc=loc, scale=scale),
           reinterpreted_batch_ndims=1)
@@ -73,7 +73,7 @@ class ProductDistributionTest(test.TestCase):
   def testSampleAndLogProbMultivariate(self):
     loc = np.float32([[-1., 1], [1, -1]])
     scale = np.float32([1., 0.5])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       ind = independent_lib.Independent(
           distribution=mvn_diag_lib.MultivariateNormalDiag(
               loc=loc,
@@ -98,7 +98,7 @@ class ProductDistributionTest(test.TestCase):
     loc = np.float32([[-1., 1], [1, -1]])
     scale = np.float32([1., 0.5])
     n_samp = 1e4
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       ind = independent_lib.Independent(
           distribution=mvn_diag_lib.MultivariateNormalDiag(
               loc=loc,
@@ -231,7 +231,7 @@ class ProductDistributionTest(test.TestCase):
     def expected_log_prob(x, logits):
       return (x * logits - np.log1p(np.exp(logits))).sum(-1).sum(-1).sum(-1)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       logits_ph = array_ops.placeholder(
           dtypes.float32, shape=logits.shape if static_shape else None)
       ind = independent_lib.Independent(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py b/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py
index 6eb96ea9ff..70551d89d9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py
@@ -30,7 +30,7 @@ from tensorflow.python.platform import test
 class InverseGammaTest(test.TestCase):
 
   def testInverseGammaShape(self):
-    with self.test_session():
+    with self.cached_session():
       alpha = constant_op.constant([3.0] * 5)
       beta = constant_op.constant(11.0)
       inv_gamma = inverse_gamma.InverseGamma(concentration=alpha, rate=beta)
@@ -43,7 +43,7 @@ class InverseGammaTest(test.TestCase):
           []))
 
   def testInverseGammaLogPDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       alpha = constant_op.constant([2.0] * batch_size)
       beta = constant_op.constant([3.0] * batch_size)
@@ -61,7 +61,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf))
 
   def testInverseGammaLogPDFMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       alpha = constant_op.constant([[2.0, 4.0]] * batch_size)
       beta = constant_op.constant([[3.0, 4.0]] * batch_size)
@@ -81,7 +81,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testInverseGammaLogPDFMultidimensionalBroadcasting(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       alpha = constant_op.constant([[2.0, 4.0]] * batch_size)
       beta = constant_op.constant(3.0)
@@ -101,7 +101,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testInverseGammaCDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       alpha_v = 2.0
       beta_v = 3.0
@@ -117,7 +117,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testInverseGammaMode(self):
-    with self.test_session():
+    with self.cached_session():
       alpha_v = np.array([5.5, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       inv_gamma = inverse_gamma.InverseGamma(concentration=alpha_v, rate=beta_v)
@@ -126,7 +126,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(inv_gamma.mode().eval(), expected_modes)
 
   def testInverseGammaMeanAllDefined(self):
-    with self.test_session():
+    with self.cached_session():
       alpha_v = np.array([5.5, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       inv_gamma = inverse_gamma.InverseGamma(concentration=alpha_v, rate=beta_v)
@@ -135,7 +135,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(inv_gamma.mean().eval(), expected_means)
 
   def testInverseGammaMeanAllowNanStats(self):
-    with self.test_session():
+    with self.cached_session():
       # Mean will not be defined for the first entry.
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
@@ -145,7 +145,7 @@ class InverseGammaTest(test.TestCase):
         inv_gamma.mean().eval()
 
   def testInverseGammaMeanNanStats(self):
-    with self.test_session():
+    with self.cached_session():
       # Mode will not be defined for the first two entries.
       alpha_v = np.array([0.5, 1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 2.0, 4.0, 5.0])
@@ -158,7 +158,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(inv_gamma.mean().eval(), expected_means)
 
   def testInverseGammaVarianceAllDefined(self):
-    with self.test_session():
+    with self.cached_session():
       alpha_v = np.array([7.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       inv_gamma = inverse_gamma.InverseGamma(concentration=alpha_v, rate=beta_v)
@@ -167,7 +167,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(inv_gamma.variance().eval(), expected_variances)
 
   def testInverseGammaVarianceAllowNanStats(self):
-    with self.test_session():
+    with self.cached_session():
       alpha_v = np.array([1.5, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       inv_gamma = inverse_gamma.InverseGamma(
@@ -176,7 +176,7 @@ class InverseGammaTest(test.TestCase):
         inv_gamma.variance().eval()
 
   def testInverseGammaVarianceNanStats(self):
-    with self.test_session():
+    with self.cached_session():
       alpha_v = np.array([1.5, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       inv_gamma = inverse_gamma.InverseGamma(
@@ -187,7 +187,7 @@ class InverseGammaTest(test.TestCase):
       self.assertAllClose(inv_gamma.variance().eval(), expected_variances)
 
   def testInverseGammaEntropy(self):
-    with self.test_session():
+    with self.cached_session():
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       expected_entropy = stats.invgamma.entropy(alpha_v, scale=beta_v)
@@ -292,7 +292,7 @@ class InverseGammaTest(test.TestCase):
     self.assertNear(1., total, err=err)
 
   def testInverseGammaNonPositiveInitializationParamsRaises(self):
-    with self.test_session():
+    with self.cached_session():
       alpha_v = constant_op.constant(0.0, name="alpha")
       beta_v = constant_op.constant(1.0, name="beta")
       inv_gamma = inverse_gamma.InverseGamma(
@@ -307,7 +307,7 @@ class InverseGammaTest(test.TestCase):
         inv_gamma.mean().eval()
 
   def testInverseGammaWithSoftplusConcentrationRate(self):
-    with self.test_session():
+    with self.cached_session():
       alpha = constant_op.constant([-0.1, -2.9], name="alpha")
       beta = constant_op.constant([1.0, -4.8], name="beta")
       inv_gamma = inverse_gamma.InverseGammaWithSoftplusConcentrationRate(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
index 2980e2bfe9..e39db51728 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
@@ -77,7 +77,7 @@ def _kumaraswamy_pdf(a, b, x):
 class KumaraswamyTest(test.TestCase):
 
   def testSimpleShapes(self):
-    with self.test_session():
+    with self.cached_session():
       a = np.random.rand(3)
       b = np.random.rand(3)
       dist = kumaraswamy_lib.Kumaraswamy(a, b)
@@ -87,7 +87,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([3]), dist.batch_shape)
 
   def testComplexShapes(self):
-    with self.test_session():
+    with self.cached_session():
       a = np.random.rand(3, 2, 2)
       b = np.random.rand(3, 2, 2)
       dist = kumaraswamy_lib.Kumaraswamy(a, b)
@@ -97,7 +97,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
 
   def testComplexShapesBroadcast(self):
-    with self.test_session():
+    with self.cached_session():
       a = np.random.rand(3, 2, 2)
       b = np.random.rand(2, 2)
       dist = kumaraswamy_lib.Kumaraswamy(a, b)
@@ -109,7 +109,7 @@ class KumaraswamyTest(test.TestCase):
   def testAProperty(self):
     a = [[1., 2, 3]]
     b = [[2., 4, 3]]
-    with self.test_session():
+    with self.cached_session():
       dist = kumaraswamy_lib.Kumaraswamy(a, b)
       self.assertEqual([1, 3], dist.concentration1.get_shape())
       self.assertAllClose(a, dist.concentration1.eval())
@@ -117,7 +117,7 @@ class KumaraswamyTest(test.TestCase):
   def testBProperty(self):
     a = [[1., 2, 3]]
     b = [[2., 4, 3]]
-    with self.test_session():
+    with self.cached_session():
       dist = kumaraswamy_lib.Kumaraswamy(a, b)
       self.assertEqual([1, 3], dist.concentration0.get_shape())
       self.assertAllClose(b, dist.concentration0.eval())
@@ -125,7 +125,7 @@ class KumaraswamyTest(test.TestCase):
   def testPdfXProper(self):
     a = [[1., 2, 3]]
     b = [[2., 4, 3]]
-    with self.test_session():
+    with self.cached_session():
       dist = kumaraswamy_lib.Kumaraswamy(a, b, validate_args=True)
       dist.prob([.1, .3, .6]).eval()
       dist.prob([.2, .3, .5]).eval()
@@ -136,7 +136,7 @@ class KumaraswamyTest(test.TestCase):
         dist.prob([.1, .2, 1.2]).eval()
 
   def testPdfTwoBatches(self):
-    with self.test_session():
+    with self.cached_session():
       a = [1., 2]
       b = [1., 2]
       x = [.5, .5]
@@ -147,7 +147,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertEqual((2,), pdf.get_shape())
 
   def testPdfTwoBatchesNontrivialX(self):
-    with self.test_session():
+    with self.cached_session():
       a = [1., 2]
       b = [1., 2]
       x = [.3, .7]
@@ -158,7 +158,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertEqual((2,), pdf.get_shape())
 
   def testPdfUniformZeroBatch(self):
-    with self.test_session():
+    with self.cached_session():
       # This is equivalent to a uniform distribution
       a = 1.
       b = 1.
@@ -170,7 +170,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertEqual((5,), pdf.get_shape())
 
   def testPdfAStretchedInBroadcastWhenSameRank(self):
-    with self.test_session():
+    with self.cached_session():
       a = [[1., 2]]
       b = [[1., 2]]
       x = [[.5, .5], [.3, .7]]
@@ -181,7 +181,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertEqual((2, 2), pdf.get_shape())
 
   def testPdfAStretchedInBroadcastWhenLowerRank(self):
-    with self.test_session():
+    with self.cached_session():
       a = [1., 2]
       b = [1., 2]
       x = [[.5, .5], [.2, .8]]
@@ -191,7 +191,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertEqual((2, 2), pdf.get_shape())
 
   def testPdfXStretchedInBroadcastWhenSameRank(self):
-    with self.test_session():
+    with self.cached_session():
       a = [[1., 2], [2., 3]]
       b = [[1., 2], [2., 3]]
       x = [[.5, .5]]
@@ -201,7 +201,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertEqual((2, 2), pdf.get_shape())
 
   def testPdfXStretchedInBroadcastWhenLowerRank(self):
-    with self.test_session():
+    with self.cached_session():
       a = [[1., 2], [2., 3]]
       b = [[1., 2], [2., 3]]
       x = [.5, .5]
@@ -289,7 +289,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertAllClose(expected_entropy, dist.entropy().eval())
 
   def testKumaraswamySample(self):
-    with self.test_session():
+    with self.cached_session():
       a = 1.
       b = 2.
       kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b)
@@ -316,7 +316,7 @@ class KumaraswamyTest(test.TestCase):
 
   # Test that sampling with the same seed twice gives the same results.
   def testKumaraswamySampleMultipleTimes(self):
-    with self.test_session():
+    with self.cached_session():
       a_val = 1.
       b_val = 2.
       n_val = 100
@@ -334,7 +334,7 @@ class KumaraswamyTest(test.TestCase):
       self.assertAllClose(samples1, samples2)
 
   def testKumaraswamySampleMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       a = np.random.rand(3, 2, 2).astype(np.float32)
       b = np.random.rand(3, 2, 2).astype(np.float32)
       kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b)
@@ -351,7 +351,7 @@ class KumaraswamyTest(test.TestCase):
           atol=1e-1)
 
   def testKumaraswamyCdf(self):
-    with self.test_session():
+    with self.cached_session():
       shape = (30, 40, 50)
       for dt in (np.float32, np.float64):
         a = 10. * np.random.random(shape).astype(dt)
@@ -366,7 +366,7 @@ class KumaraswamyTest(test.TestCase):
             _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0)
 
   def testKumaraswamyLogCdf(self):
-    with self.test_session():
+    with self.cached_session():
       shape = (30, 40, 50)
       for dt in (np.float32, np.float64):
         a = 10. * np.random.random(shape).astype(dt)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py b/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
index 251be9ed4f..12a2d4f8ec 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
@@ -39,7 +39,7 @@ class LogisticTest(test.TestCase):
         dist.reparameterization_type == distribution.FULLY_REPARAMETERIZED)
 
   def testLogisticLogProb(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       np_loc = np.array([2.0] * batch_size, dtype=np.float32)
       loc = constant_op.constant(np_loc)
@@ -57,7 +57,7 @@ class LogisticTest(test.TestCase):
       self.assertAllClose(prob.eval(), np.exp(expected_log_prob))
 
   def testLogisticCDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       np_loc = np.array([2.0] * batch_size, dtype=np.float32)
       loc = constant_op.constant(np_loc)
@@ -72,7 +72,7 @@ class LogisticTest(test.TestCase):
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testLogisticLogCDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       np_loc = np.array([2.0] * batch_size, dtype=np.float32)
       loc = constant_op.constant(np_loc)
@@ -87,7 +87,7 @@ class LogisticTest(test.TestCase):
       self.assertAllClose(logcdf.eval(), expected_logcdf)
 
   def testLogisticSurvivalFunction(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       np_loc = np.array([2.0] * batch_size, dtype=np.float32)
       loc = constant_op.constant(np_loc)
@@ -102,7 +102,7 @@ class LogisticTest(test.TestCase):
       self.assertAllClose(survival_function.eval(), expected_survival_function)
 
   def testLogisticLogSurvivalFunction(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       np_loc = np.array([2.0] * batch_size, dtype=np.float32)
       loc = constant_op.constant(np_loc)
@@ -118,7 +118,7 @@ class LogisticTest(test.TestCase):
                           expected_logsurvival_function)
 
   def testLogisticMean(self):
-    with self.test_session():
+    with self.cached_session():
       loc = [2.0, 1.5, 1.0]
       scale = 1.5
       expected_mean = stats.logistic.mean(loc, scale)
@@ -126,7 +126,7 @@ class LogisticTest(test.TestCase):
       self.assertAllClose(dist.mean().eval(), expected_mean)
 
   def testLogisticVariance(self):
-    with self.test_session():
+    with self.cached_session():
       loc = [2.0, 1.5, 1.0]
       scale = 1.5
       expected_variance = stats.logistic.var(loc, scale)
@@ -134,7 +134,7 @@ class LogisticTest(test.TestCase):
       self.assertAllClose(dist.variance().eval(), expected_variance)
 
   def testLogisticEntropy(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 3
       np_loc = np.array([2.0] * batch_size, dtype=np.float32)
       loc = constant_op.constant(np_loc)
@@ -144,7 +144,7 @@ class LogisticTest(test.TestCase):
       self.assertAllClose(dist.entropy().eval(), expected_entropy)
 
   def testLogisticSample(self):
-    with self.test_session():
+    with self.cached_session():
       loc = [3.0, 4.0, 2.0]
       scale = 1.0
       dist = logistic.Logistic(loc, scale)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
index ff6092fc26..faff42d243 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
@@ -35,7 +35,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
                             test.TestCase):
 
   def testSampleAndLogProbUnivariateShapes(self):
-    with self.test_session():
+    with self.cached_session():
       gm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=[0.3, 0.7]),
           components_distribution=normal_lib.Normal(
@@ -46,7 +46,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
       self.assertEqual([4, 5], log_prob_x.shape)
 
   def testSampleAndLogProbBatch(self):
-    with self.test_session():
+    with self.cached_session():
       gm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=[[0.3, 0.7]]),
           components_distribution=normal_lib.Normal(
@@ -59,7 +59,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
   def testSampleAndLogProbShapesBroadcastMix(self):
     mix_probs = np.float32([.3, .7])
     bern_probs = np.float32([[.4, .6], [.25, .75]])
-    with self.test_session():
+    with self.cached_session():
       bm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=mix_probs),
           components_distribution=bernoulli_lib.Bernoulli(probs=bern_probs))
@@ -72,7 +72,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
           np.ones_like(x_, dtype=np.bool), np.logical_or(x_ == 0., x_ == 1.))
 
   def testSampleAndLogProbMultivariateShapes(self):
-    with self.test_session():
+    with self.cached_session():
       gm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=[0.3, 0.7]),
           components_distribution=mvn_diag_lib.MultivariateNormalDiag(
@@ -83,7 +83,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
       self.assertEqual([4, 5], log_prob_x.shape)
 
   def testSampleAndLogProbBatchMultivariateShapes(self):
-    with self.test_session():
+    with self.cached_session():
       gm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=[0.3, 0.7]),
           components_distribution=mvn_diag_lib.MultivariateNormalDiag(
@@ -98,7 +98,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
       self.assertEqual([4, 5, 2], log_prob_x.shape)
 
   def testSampleConsistentLogProb(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       gm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=[0.3, 0.7]),
           components_distribution=mvn_diag_lib.MultivariateNormalDiag(
@@ -111,7 +111,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
           sess.run, gm, radius=1., center=[1., -1], rtol=0.02)
 
   def testLogCdf(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       gm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=[0.3, 0.7]),
           components_distribution=normal_lib.Normal(
@@ -128,7 +128,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
                           rtol=1e-6, atol=0.0)
 
   def testSampleConsistentMeanCovariance(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       gm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=[0.3, 0.7]),
           components_distribution=mvn_diag_lib.MultivariateNormalDiag(
@@ -136,7 +136,7 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
       self.run_test_sample_consistent_mean_covariance(sess.run, gm)
 
   def testVarianceConsistentCovariance(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       gm = mixture_same_family_lib.MixtureSameFamily(
           mixture_distribution=categorical_lib.Categorical(probs=[0.3, 0.7]),
           components_distribution=mvn_diag_lib.MultivariateNormalDiag(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
index 0206489175..f8dbd34d02 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
@@ -152,7 +152,7 @@ class MixtureTest(test.TestCase):
   use_static_graph = False
 
   def testShapes(self):
-    with self.test_session():
+    with self.cached_session():
       for batch_shape in ([], [1], [2, 3, 4]):
         dist = make_univariate_mixture(batch_shape, num_components=10,
                                        use_static_graph=self.use_static_graph)
@@ -200,7 +200,7 @@ class MixtureTest(test.TestCase):
           use_static_graph=self.use_static_graph)
 
   def testBrokenShapesDynamic(self):
-    with self.test_session():
+    with self.cached_session():
       d0_param = array_ops.placeholder(dtype=dtypes.float32)
       d1_param = array_ops.placeholder(dtype=dtypes.float32)
       d = ds.Mixture(
@@ -246,7 +246,7 @@ class MixtureTest(test.TestCase):
     # mixture are checked for equivalence.
 
   def testMeanUnivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_univariate_mixture(
             batch_shape=batch_shape, num_components=2,
@@ -268,7 +268,7 @@ class MixtureTest(test.TestCase):
         self.assertAllClose(true_mean, mean_value)
 
   def testMeanMultivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
             batch_shape=batch_shape, num_components=2, event_shape=(4,),
@@ -296,7 +296,7 @@ class MixtureTest(test.TestCase):
   def testStddevShapeUnivariate(self):
     num_components = 2
     # This is the same shape test which is done in 'testMeanUnivariate'.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_univariate_mixture(
             batch_shape=batch_shape, num_components=num_components,
@@ -337,7 +337,7 @@ class MixtureTest(test.TestCase):
     num_components = 2
 
     # This is the same shape test which is done in 'testMeanMultivariate'.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
             batch_shape=batch_shape,
@@ -392,12 +392,12 @@ class MixtureTest(test.TestCase):
         ],
         use_static_graph=self.use_static_graph)
     mix_dev = mixture_dist.stddev()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_stddev = sess.run(mix_dev)
     self.assertAllClose(actual_stddev, ground_truth_stddev)
 
   def testProbScalarUnivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = make_univariate_mixture(batch_shape=[], num_components=2,
                                      use_static_graph=self.use_static_graph)
       for x in [
@@ -423,7 +423,7 @@ class MixtureTest(test.TestCase):
         self.assertAllClose(total_prob, p_x_value)
 
   def testProbScalarMultivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = make_multivariate_mixture(
           batch_shape=[], num_components=2, event_shape=[3],
           use_static_graph=self.use_static_graph)
@@ -452,7 +452,7 @@ class MixtureTest(test.TestCase):
         self.assertAllClose(total_prob, p_x_value)
 
   def testProbBatchUnivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2,
                                      use_static_graph=self.use_static_graph)
 
@@ -479,7 +479,7 @@ class MixtureTest(test.TestCase):
         self.assertAllClose(total_prob, p_x_value)
 
   def testProbBatchMultivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = make_multivariate_mixture(
           batch_shape=[2, 3], num_components=2, event_shape=[4],
           use_static_graph=self.use_static_graph)
@@ -506,7 +506,7 @@ class MixtureTest(test.TestCase):
         self.assertAllClose(total_prob, p_x_value)
 
   def testSampleScalarBatchUnivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       num_components = 3
       batch_shape = []
       dist = make_univariate_mixture(
@@ -539,7 +539,7 @@ class MixtureTest(test.TestCase):
     mus = [-5.0, 0.0, 5.0, 4.0, 20.0]
     sigmas = [0.1, 5.0, 3.0, 0.2, 4.0]
 
-    with self.test_session():
+    with self.cached_session():
       n = 100
 
       random_seed.set_random_seed(654321)
@@ -567,7 +567,7 @@ class MixtureTest(test.TestCase):
       self.assertAllClose(samples1, samples2)
 
   def testSampleScalarBatchMultivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       num_components = 3
       dist = make_multivariate_mixture(
           batch_shape=[], num_components=num_components, event_shape=[2],
@@ -592,7 +592,7 @@ class MixtureTest(test.TestCase):
         self.assertAllClose(which_dist_samples, sample_values[which_c, :])
 
   def testSampleBatchUnivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       num_components = 3
       dist = make_univariate_mixture(
           batch_shape=[2, 3], num_components=num_components,
@@ -620,7 +620,7 @@ class MixtureTest(test.TestCase):
                             sample_values[which_c_s, which_c_b0, which_c_b1])
 
   def _testSampleBatchMultivariate(self, fully_known_batch_shape):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       num_components = 3
       if fully_known_batch_shape:
         batch_shape = [2, 3]
@@ -672,7 +672,7 @@ class MixtureTest(test.TestCase):
     self._testSampleBatchMultivariate(fully_known_batch_shape=False)
 
   def testEntropyLowerBoundMultivariate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
             batch_shape=batch_shape, num_components=2, event_shape=(4,),
@@ -732,7 +732,7 @@ class MixtureTest(test.TestCase):
     x_cdf_tf = mixture_tf.cdf(x_tensor)
     x_log_cdf_tf = mixture_tf.log_cdf(x_tensor)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for x_feed in xs_to_check:
         x_cdf_tf_result, x_log_cdf_tf_result = sess.run(
             [x_cdf_tf, x_log_cdf_tf], feed_dict={x_tensor: x_feed})
@@ -778,7 +778,7 @@ class MixtureTest(test.TestCase):
     x_cdf_tf = mixture_tf.cdf(x_tensor)
     x_log_cdf_tf = mixture_tf.log_cdf(x_tensor)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for x_feed in xs_to_check:
         x_cdf_tf_result, x_log_cdf_tf_result = sess.run(
             [x_cdf_tf, x_log_cdf_tf],
@@ -802,7 +802,7 @@ class MixtureTest(test.TestCase):
     Mixture's use of dynamic partition requires `random_gamma` correctly returns
     an empty `Tensor`.
     """
-    with self.test_session():
+    with self.cached_session():
       gm = ds.Mixture(
           cat=ds.Categorical(probs=[.3, .7]),
           components=[ds.Gamma(1., 2.),
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py
index 509fc66c05..3c988dad8a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/moving_stats_test.py
@@ -36,7 +36,7 @@ class MovingReduceMeanVarianceTest(test.TestCase):
     shape = [1, 2]
     true_mean = np.array([[0., 3.]])
     true_stddev = np.array([[1.1, 0.5]])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Start "x" out with this mean.
       mean_var = variables.Variable(array_ops.zeros_like(true_mean))
       variance_var = variables.Variable(array_ops.ones_like(true_stddev))
@@ -84,7 +84,7 @@ class MovingReduceMeanVarianceTest(test.TestCase):
     shape = [1, 2]
     true_mean = np.array([[0., 3.]])
     true_stddev = np.array([[1.1, 0.5]])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Start "x" out with this mean.
       x = random_ops.random_normal(shape, dtype=np.float64, seed=0)
       x = true_stddev * x + true_mean
@@ -111,7 +111,7 @@ class MovingLogExponentialMovingMeanExpTest(test.TestCase):
     true_mean = np.array([[0., 3.]])
     true_stddev = np.array([[1.1, 0.5]])
     decay = 0.99
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Start "x" out with this mean.
       x = random_ops.random_normal(shape, dtype=np.float64, seed=0)
       x = true_stddev * x + true_mean
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
index a924d2e383..88d0d346a4 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
@@ -39,7 +39,7 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     diag = np.array([[1., 2], [3, 4], [5, 6]])
     # batch_shape: [1], event_shape: []
     identity_multiplier = np.array([5.])
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiagPlusLowRank(
           scale_diag=diag,
           scale_identity_multiplier=identity_multiplier,
@@ -61,7 +61,7 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     diag = np.array([[1., 2], [3, 4], [5, 6]])
     # batch_shape: [3, 1], event_shape: []
     identity_multiplier = np.array([[5.], [4], [3]])
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiagPlusLowRank(
           scale_diag=diag,
           scale_identity_multiplier=identity_multiplier,
@@ -75,7 +75,7 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     diag = np.array([[1., 2], [3, 4], [5, 6]])
     # batch_shape: [3], event_shape: []
     identity_multiplier = np.array([5., 4, 3])
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiagPlusLowRank(
           scale_diag=diag,
           scale_identity_multiplier=identity_multiplier,
@@ -94,7 +94,7 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     loc = np.array([1., 0, -1])
     # batch_shape: [3], event_shape: []
     identity_multiplier = np.array([5., 4, 3])
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiagPlusLowRank(
           loc=loc,
           scale_identity_multiplier=identity_multiplier,
@@ -116,7 +116,7 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     diag_large = [1.0, 5.0]
     v = [[2.0], [3.0]]
     diag_small = [3.0]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiagPlusLowRank(
           loc=mu,
           scale_diag=diag_large,
@@ -146,7 +146,7 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     true_variance = np.diag(true_covariance)
     true_stddev = np.sqrt(true_variance)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = ds.MultivariateNormalDiagPlusLowRank(
           loc=mu,
           scale_diag=diag_large,
@@ -380,7 +380,7 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     cov = np.stack([np.matmul(scale[0], scale[0].T),
                     np.matmul(scale[1], scale[1].T)])
     logging.vlog(2, "expected_cov:\n{}".format(cov))
-    with self.test_session():
+    with self.cached_session():
       mvn = ds.MultivariateNormalDiagPlusLowRank(
           loc=mu,
           scale_perturb_factor=u,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
index 9635134b08..6a3d171f6c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
@@ -45,14 +45,14 @@ class MultivariateNormalDiagTest(test.TestCase):
   def testScalarParams(self):
     mu = -1.
     diag = -5.
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesRegexp(ValueError, "at least 1 dimension"):
         ds.MultivariateNormalDiag(mu, diag)
 
   def testVectorParams(self):
     mu = [-1.]
     diag = [-5.]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
       self.assertAllEqual([3, 1], dist.sample(3).get_shape())
 
@@ -63,7 +63,7 @@ class MultivariateNormalDiagTest(test.TestCase):
     # Batch shape = [1], event shape = [3]
     mu = array_ops.zeros((1, 3))
     diag = array_ops.ones((1, 3))
-    with self.test_session():
+    with self.cached_session():
       base_dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
       dist = ds.TransformedDistribution(
           base_dist,
@@ -75,14 +75,14 @@ class MultivariateNormalDiagTest(test.TestCase):
   def testMean(self):
     mu = [-1., 1]
     diag = [1., -5]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
       self.assertAllEqual(mu, dist.mean().eval())
 
   def testMeanWithBroadcastLoc(self):
     mu = [-1.]
     diag = [1., -5]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
       self.assertAllEqual([-1., -1.], dist.mean().eval())
 
@@ -91,14 +91,14 @@ class MultivariateNormalDiagTest(test.TestCase):
     diag = [-1., 5]
     diag_mat = np.diag(diag)
     scipy_mvn = stats.multivariate_normal(mean=mu, cov=diag_mat**2)
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
       self.assertAllClose(scipy_mvn.entropy(), dist.entropy().eval(), atol=1e-4)
 
   def testSample(self):
     mu = [-1., 1]
     diag = [1., -2]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
       samps = dist.sample(int(1e3), seed=0).eval()
       cov_mat = array_ops.matrix_diag(diag).eval()**2
@@ -111,7 +111,7 @@ class MultivariateNormalDiagTest(test.TestCase):
   def testSingularScaleRaises(self):
     mu = [-1., 1]
     diag = [1., 0]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
       with self.assertRaisesOpError("Singular"):
         dist.sample().eval()
@@ -123,7 +123,7 @@ class MultivariateNormalDiagTest(test.TestCase):
     # diag corresponds to no batches of 3-variate normals
     diag = np.ones([3])
 
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
 
       mean = dist.mean()
@@ -142,7 +142,7 @@ class MultivariateNormalDiagTest(test.TestCase):
                           atol=0.10, rtol=0.05)
 
   def testCovariance(self):
-    with self.test_session():
+    with self.cached_session():
       mvn = ds.MultivariateNormalDiag(
           loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
@@ -178,7 +178,7 @@ class MultivariateNormalDiagTest(test.TestCase):
           mvn.covariance().eval())
 
   def testVariance(self):
-    with self.test_session():
+    with self.cached_session():
       mvn = ds.MultivariateNormalDiag(
           loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
@@ -203,7 +203,7 @@ class MultivariateNormalDiagTest(test.TestCase):
           mvn.variance().eval())
 
   def testStddev(self):
-    with self.test_session():
+    with self.cached_session():
       mvn = ds.MultivariateNormalDiag(
           loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
@@ -229,7 +229,7 @@ class MultivariateNormalDiagTest(test.TestCase):
   def testMultivariateNormalDiagWithSoftplusScale(self):
     mu = [-1.0, 1.0]
     diag = [-1.0, -2.0]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.MultivariateNormalDiagWithSoftplusScale(
           mu, diag, validate_args=True)
       samps = dist.sample(1000, seed=0).eval()
@@ -241,7 +241,7 @@ class MultivariateNormalDiagTest(test.TestCase):
   def testMultivariateNormalDiagNegLogLikelihood(self):
     num_draws = 50
     dims = 3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x_pl = array_ops.placeholder(dtype=dtypes.float32,
                                    shape=[None, dims],
                                    name="x")
@@ -291,7 +291,7 @@ class MultivariateNormalDiagTest(test.TestCase):
 
   def testKLDivIdenticalGradientDefined(self):
     dims = 3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       loc = array_ops.zeros([dims], dtype=dtypes.float32)
       mvn = ds.MultivariateNormalDiag(
           loc=loc,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
index b003526392..bbf803f045 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
@@ -40,7 +40,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
     return math_ops.matmul(chol, chol, adjoint_b=True).eval()
 
   def testRaisesIfInitializedWithNonSymmetricMatrix(self):
-    with self.test_session():
+    with self.cached_session():
       mu = [1., 2.]
       sigma = [[1., 0.], [1., 1.]]  # Nonsingular, but not symmetric
       mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
@@ -48,14 +48,14 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
         mvn.covariance().eval()
 
   def testNamePropertyIsSetByInitArg(self):
-    with self.test_session():
+    with self.cached_session():
       mu = [1., 2.]
       sigma = [[1., 0.], [0., 1.]]
       mvn = ds.MultivariateNormalFullCovariance(mu, sigma, name="Billy")
       self.assertEqual(mvn.name, "Billy/")
 
   def testDoesNotRaiseIfInitializedWithSymmetricMatrix(self):
-    with self.test_session():
+    with self.cached_session():
       mu = rng.rand(10)
       sigma = self._random_pd_matrix(10, 10)
       mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
@@ -63,7 +63,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
       mvn.covariance().eval()
 
   def testLogPDFScalarBatch(self):
-    with self.test_session():
+    with self.cached_session():
       mu = rng.rand(2)
       sigma = self._random_pd_matrix(2, 2)
       mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
@@ -82,7 +82,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
       self.assertAllClose(expected_pdf, pdf.eval())
 
   def testLogPDFScalarBatchCovarianceNotProvided(self):
-    with self.test_session():
+    with self.cached_session():
       mu = rng.rand(2)
       mvn = ds.MultivariateNormalFullCovariance(
           mu, covariance_matrix=None, validate_args=True)
@@ -102,7 +102,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
       self.assertAllClose(expected_pdf, pdf.eval())
 
   def testShapes(self):
-    with self.test_session():
+    with self.cached_session():
       mu = rng.rand(3, 5, 2)
       covariance = self._random_pd_matrix(3, 5, 2, 2)
 
@@ -133,7 +133,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
   def testKLBatch(self):
     batch_shape = [2]
     event_shape = [3]
-    with self.test_session():
+    with self.cached_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
       mvn_a = ds.MultivariateNormalFullCovariance(
@@ -159,7 +159,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase):
   def testKLBatchBroadcast(self):
     batch_shape = [2]
     event_shape = [3]
-    with self.test_session():
+    with self.cached_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       # No batch shape.
       mu_b, sigma_b = self._random_mu_and_sigma([], event_shape)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
index b556d06123..776fc2ca9d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
@@ -45,7 +45,7 @@ class MultivariateNormalTriLTest(test.TestCase):
     return chol.eval(), sigma.eval()
 
   def testLogPDFScalarBatch(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(2)
       chol, sigma = self._random_chol(2, 2)
       chol[1, 1] = -chol[1, 1]
@@ -65,7 +65,7 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(expected_pdf, pdf.eval())
 
   def testLogPDFXIsHigherRank(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(2)
       chol, sigma = self._random_chol(2, 2)
       chol[0, 0] = -chol[0, 0]
@@ -85,7 +85,7 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(expected_pdf, pdf.eval(), atol=0., rtol=0.03)
 
   def testLogPDFXLowerDimension(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(3, 2)
       chol, sigma = self._random_chol(3, 2, 2)
       chol[0, 0, 0] = -chol[0, 0, 0]
@@ -108,7 +108,7 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(expected_pdf, pdf.eval()[1])
 
   def testEntropy(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(2)
       chol, sigma = self._random_chol(2, 2)
       chol[0, 0] = -chol[0, 0]
@@ -121,7 +121,7 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(expected_entropy, entropy.eval())
 
   def testEntropyMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(3, 5, 2)
       chol, sigma = self._random_chol(3, 5, 2, 2)
       chol[1, 0, 0, 0] = -chol[1, 0, 0, 0]
@@ -136,7 +136,7 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(expected_entropy, entropy.eval()[1, 1])
 
   def testSample(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(2)
       chol, sigma = self._random_chol(2, 2)
       chol[0, 0] = -chol[0, 0]
@@ -152,7 +152,7 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(np.cov(sample_values, rowvar=0), sigma, atol=0.06)
 
   def testSingularScaleRaises(self):
-    with self.test_session():
+    with self.cached_session():
       mu = None
       chol = [[1., 0.], [0., 0.]]
       mvn = ds.MultivariateNormalTriL(mu, chol, validate_args=True)
@@ -160,7 +160,7 @@ class MultivariateNormalTriLTest(test.TestCase):
         mvn.sample().eval()
 
   def testSampleWithSampleShape(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(3, 5, 2)
       chol, sigma = self._random_chol(3, 5, 2, 2)
       chol[1, 0, 0, 0] = -chol[1, 0, 0, 0]
@@ -185,7 +185,7 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(expected_log_pdf, x_log_pdf)
 
   def testSampleMultiDimensional(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(3, 5, 2)
       chol, sigma = self._random_chol(3, 5, 2, 2)
       chol[1, 0, 0, 0] = -chol[1, 0, 0, 0]
@@ -205,7 +205,7 @@ class MultivariateNormalTriLTest(test.TestCase):
           atol=1e-1)
 
   def testShapes(self):
-    with self.test_session():
+    with self.cached_session():
       mu = self._rng.rand(3, 5, 2)
       chol, _ = self._random_chol(3, 5, 2, 2)
       chol[1, 0, 0, 0] = -chol[1, 0, 0, 0]
@@ -237,7 +237,7 @@ class MultivariateNormalTriLTest(test.TestCase):
   def testKLNonBatch(self):
     batch_shape = []
     event_shape = [2]
-    with self.test_session():
+    with self.cached_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
       mvn_a = ds.MultivariateNormalTriL(
@@ -259,7 +259,7 @@ class MultivariateNormalTriLTest(test.TestCase):
   def testKLBatch(self):
     batch_shape = [2]
     event_shape = [3]
-    with self.test_session():
+    with self.cached_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
       mvn_a = ds.MultivariateNormalTriL(
@@ -285,7 +285,7 @@ class MultivariateNormalTriLTest(test.TestCase):
   def testKLBatchBroadcast(self):
     batch_shape = [2]
     event_shape = [3]
-    with self.test_session():
+    with self.cached_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       # No batch shape.
       mu_b, sigma_b = self._random_mu_and_sigma([], event_shape)
@@ -312,7 +312,7 @@ class MultivariateNormalTriLTest(test.TestCase):
   def testKLTwoIdenticalDistributionsIsZero(self):
     batch_shape = [2]
     event_shape = [3]
-    with self.test_session():
+    with self.cached_session():
       mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
       mvn_a = ds.MultivariateNormalTriL(
           loc=mu_a,
@@ -336,7 +336,7 @@ class MultivariateNormalTriLTest(test.TestCase):
     true_variance = np.diag(true_covariance)
     true_stddev = np.sqrt(true_variance)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = ds.MultivariateNormalTriL(
           loc=mu,
           scale_tril=scale_tril,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py
index 37edaa42cd..a46b81af35 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py
@@ -34,7 +34,7 @@ from tensorflow.python.platform import test
 class NegativeBinomialTest(test.TestCase):
 
   def testNegativeBinomialShape(self):
-    with self.test_session():
+    with self.cached_session():
       probs = [.1] * 5
       total_count = [2.0] * 5
       negbinom = negative_binomial.NegativeBinomial(
@@ -46,7 +46,7 @@ class NegativeBinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([]), negbinom.event_shape)
 
   def testNegativeBinomialShapeBroadcast(self):
-    with self.test_session():
+    with self.cached_session():
       probs = [[.1, .2, .3]] * 5
       total_count = [[2.]] * 5
       negbinom = negative_binomial.NegativeBinomial(
@@ -60,7 +60,7 @@ class NegativeBinomialTest(test.TestCase):
 
   def testLogits(self):
     logits = [[0., 9., -0.5]]
-    with self.test_session():
+    with self.cached_session():
       negbinom = negative_binomial.NegativeBinomial(
           total_count=3., logits=logits)
       self.assertEqual([1, 3], negbinom.probs.get_shape())
@@ -69,14 +69,14 @@ class NegativeBinomialTest(test.TestCase):
 
   def testInvalidP(self):
     invalid_ps = [-.01, 0., -2.,]
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("Condition x >= 0"):
         negbinom = negative_binomial.NegativeBinomial(
             5., probs=invalid_ps, validate_args=True)
         negbinom.probs.eval()
 
     invalid_ps = [1.01, 2., 1.001,]
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("probs has components greater than 1."):
         negbinom = negative_binomial.NegativeBinomial(
             5., probs=invalid_ps, validate_args=True)
@@ -84,14 +84,14 @@ class NegativeBinomialTest(test.TestCase):
 
   def testInvalidNegativeCount(self):
     invalid_rs = [-.01, 0., -2.,]
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("Condition x > 0"):
         negbinom = negative_binomial.NegativeBinomial(
             total_count=invalid_rs, probs=0.1, validate_args=True)
         negbinom.total_count.eval()
 
   def testNegativeBinomialLogCdf(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = [.2] * batch_size
       probs_v = .2
@@ -109,7 +109,7 @@ class NegativeBinomialTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_cdf), cdf.eval())
 
   def testNegativeBinomialLogCdfValidateArgs(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = [.9] * batch_size
       total_count = 5.
@@ -119,7 +119,7 @@ class NegativeBinomialTest(test.TestCase):
         negbinom.log_cdf(-1.).eval()
 
   def testNegativeBinomialLogPmf(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = [.2] * batch_size
       probs_v = .2
@@ -137,7 +137,7 @@ class NegativeBinomialTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_pmf), pmf.eval())
 
   def testNegativeBinomialLogPmfValidateArgs(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = [.9] * batch_size
       total_count = 5.
@@ -162,7 +162,7 @@ class NegativeBinomialTest(test.TestCase):
       self.assertEqual([6], pmf.get_shape())
 
   def testNegativeBinomialLogPmfMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       probs = constant_op.constant([[.2, .3, .5]] * batch_size)
       probs_v = np.array([.2, .3, .5])
@@ -183,7 +183,7 @@ class NegativeBinomialTest(test.TestCase):
       self.assertAllClose(np.exp(expected_log_pmf), pmf_values)
 
   def testNegativeBinomialMean(self):
-    with self.test_session():
+    with self.cached_session():
       total_count = 5.
       probs = np.array([.1, .3, .25], dtype=np.float32)
       negbinom = negative_binomial.NegativeBinomial(
@@ -193,7 +193,7 @@ class NegativeBinomialTest(test.TestCase):
       self.assertAllClose(expected_means, negbinom.mean().eval())
 
   def testNegativeBinomialVariance(self):
-    with self.test_session():
+    with self.cached_session():
       total_count = 5.
       probs = np.array([.1, .3, .25], dtype=np.float32)
       negbinom = negative_binomial.NegativeBinomial(
@@ -203,7 +203,7 @@ class NegativeBinomialTest(test.TestCase):
       self.assertAllClose(expected_vars, negbinom.variance().eval())
 
   def testNegativeBinomialStddev(self):
-    with self.test_session():
+    with self.cached_session():
       total_count = 5.
       probs = np.array([.1, .3, .25], dtype=np.float32)
       negbinom = negative_binomial.NegativeBinomial(
@@ -213,7 +213,7 @@ class NegativeBinomialTest(test.TestCase):
       self.assertAllClose(expected_stds, negbinom.stddev().eval())
 
   def testNegativeBinomialSample(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       probs = [.3, .9]
       total_count = [4., 11.]
       n = int(100e3)
@@ -242,7 +242,7 @@ class NegativeBinomialTest(test.TestCase):
                             rtol=.02)
 
   def testLogProbOverflow(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       logits = np.float32([20., 30., 40.])
       total_count = np.float32(1.)
       x = np.float32(0.)
@@ -253,7 +253,7 @@ class NegativeBinomialTest(test.TestCase):
                           np.isfinite(log_prob_))
 
   def testLogProbUnderflow(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       logits = np.float32([-90, -100, -110])
       total_count = np.float32(1.)
       x = np.float32(0.)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py b/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
index 111f88eeb5..84ee19123c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
@@ -44,7 +44,7 @@ class OneHotCategoricalTest(test.TestCase):
   def testP(self):
     p = [0.2, 0.8]
     dist = onehot_categorical.OneHotCategorical(probs=p)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(p, dist.probs.eval())
       self.assertAllEqual([2], dist.logits.get_shape())
 
@@ -52,14 +52,14 @@ class OneHotCategoricalTest(test.TestCase):
     p = np.array([0.2, 0.8], dtype=np.float32)
     logits = np.log(p) - 50.
     dist = onehot_categorical.OneHotCategorical(logits=logits)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([2], dist.probs.get_shape())
       self.assertAllEqual([2], dist.logits.get_shape())
       self.assertAllClose(dist.probs.eval(), p)
       self.assertAllClose(dist.logits.eval(), logits)
 
   def testShapes(self):
-    with self.test_session():
+    with self.cached_session():
       for batch_shape in ([], [1], [2, 3, 4]):
         dist = make_onehot_categorical(batch_shape, 10)
         self.assertAllEqual(batch_shape, dist.batch_shape.as_list())
@@ -97,7 +97,7 @@ class OneHotCategoricalTest(test.TestCase):
         np.array([1]+[0]*4, dtype=np.int64)).dtype)
 
   def testUnknownShape(self):
-    with self.test_session():
+    with self.cached_session():
       logits = array_ops.placeholder(dtype=dtypes.float32)
       dist = onehot_categorical.OneHotCategorical(logits)
       sample = dist.sample()
@@ -112,7 +112,7 @@ class OneHotCategoricalTest(test.TestCase):
   def testEntropyNoBatch(self):
     logits = np.log([0.2, 0.8]) - 50.
     dist = onehot_categorical.OneHotCategorical(logits)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(
           dist.entropy().eval(),
           -(0.2 * np.log(0.2) + 0.8 * np.log(0.8)))
@@ -120,7 +120,7 @@ class OneHotCategoricalTest(test.TestCase):
   def testEntropyWithBatch(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = onehot_categorical.OneHotCategorical(logits)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(dist.entropy().eval(), [
           -(0.2 * np.log(0.2) + 0.8 * np.log(0.8)),
           -(0.6 * np.log(0.6) + 0.4 * np.log(0.4))
@@ -128,7 +128,7 @@ class OneHotCategoricalTest(test.TestCase):
 
   def testPmf(self):
     # check that probability of samples correspond to their class probabilities
-    with self.test_session():
+    with self.cached_session():
       logits = self._rng.random_sample(size=(8, 2, 10))
       prob = np.exp(logits)/np.sum(np.exp(logits), axis=-1, keepdims=True)
       dist = onehot_categorical.OneHotCategorical(logits=logits)
@@ -138,7 +138,7 @@ class OneHotCategoricalTest(test.TestCase):
       self.assertAllClose(expected_prob, np_prob.flatten())
 
   def testSample(self):
-    with self.test_session():
+    with self.cached_session():
       probs = [[[0.2, 0.8], [0.4, 0.6]]]
       dist = onehot_categorical.OneHotCategorical(math_ops.log(probs) - 50.)
       n = 100
@@ -150,7 +150,7 @@ class OneHotCategoricalTest(test.TestCase):
       self.assertFalse(np.any(sample_values > 1))
 
   def testSampleWithSampleShape(self):
-    with self.test_session():
+    with self.cached_session():
       probs = [[[0.2, 0.8], [0.4, 0.6]]]
       dist = onehot_categorical.OneHotCategorical(math_ops.log(probs) - 50.)
       samples = dist.sample((100, 100), seed=123)
@@ -166,7 +166,7 @@ class OneHotCategoricalTest(test.TestCase):
       exp_logits = np.exp(logits)
       return exp_logits / exp_logits.sum(axis=-1, keepdims=True)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for categories in [2, 10]:
         for batch_size in [1, 2]:
           p_logits = self._rng.random_sample((batch_size, categories))
@@ -193,7 +193,7 @@ class OneHotCategoricalTest(test.TestCase):
           self.assertAllClose(kl_sample_, kl_expected, atol=1e-2, rtol=0.)
 
   def testSampleUnbiasedNonScalarBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       logits = self._rng.rand(4, 3, 2).astype(np.float32)
       dist = onehot_categorical.OneHotCategorical(logits=logits)
       n = int(3e3)
@@ -221,7 +221,7 @@ class OneHotCategoricalTest(test.TestCase):
           actual_covariance_, sample_covariance_, atol=0., rtol=0.10)
 
   def testSampleUnbiasedScalarBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       logits = self._rng.rand(3).astype(np.float32)
       dist = onehot_categorical.OneHotCategorical(logits=logits)
       n = int(1e4)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
index 1035cb00f7..e2d04c9c27 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
@@ -29,7 +29,7 @@ class _PoissonLogNormalQuadratureCompoundTest(
   """Tests the PoissonLogNormalQuadratureCompoundTest distribution."""
 
   def testSampleProbConsistent(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
           loc=array_ops.placeholder_with_default(
               -2.,
@@ -43,7 +43,7 @@ class _PoissonLogNormalQuadratureCompoundTest(
           sess.run, pln, batch_size=1, rtol=0.1)
 
   def testMeanVariance(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
           loc=array_ops.placeholder_with_default(
               0.,
@@ -57,7 +57,7 @@ class _PoissonLogNormalQuadratureCompoundTest(
           sess.run, pln, rtol=0.02)
 
   def testSampleProbConsistentBroadcastScalar(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
           loc=array_ops.placeholder_with_default(
               [0., -0.5],
@@ -71,7 +71,7 @@ class _PoissonLogNormalQuadratureCompoundTest(
           sess.run, pln, batch_size=2, rtol=0.1, atol=0.01)
 
   def testMeanVarianceBroadcastScalar(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
           loc=array_ops.placeholder_with_default(
               [0., -0.5],
@@ -85,7 +85,7 @@ class _PoissonLogNormalQuadratureCompoundTest(
           sess.run, pln, rtol=0.1, atol=0.01)
 
   def testSampleProbConsistentBroadcastBoth(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
           loc=array_ops.placeholder_with_default(
               [[0.], [-0.5]],
@@ -99,7 +99,7 @@ class _PoissonLogNormalQuadratureCompoundTest(
           sess.run, pln, batch_size=4, rtol=0.1, atol=0.08)
 
   def testMeanVarianceBroadcastBoth(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
           loc=array_ops.placeholder_with_default(
               [[0.], [-0.5]],
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py b/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
index 19a7472d91..29eba5afca 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
@@ -35,7 +35,7 @@ class PoissonTest(test.TestCase):
     return poisson_lib.Poisson(rate=rate, validate_args=validate_args)
 
   def testPoissonShape(self):
-    with self.test_session():
+    with self.cached_session():
       lam = constant_op.constant([3.0] * 5)
       poisson = self._make_poisson(rate=lam)
 
@@ -47,13 +47,13 @@ class PoissonTest(test.TestCase):
   def testInvalidLam(self):
     invalid_lams = [-.01, 0., -2.]
     for lam in invalid_lams:
-      with self.test_session():
+      with self.cached_session():
         with self.assertRaisesOpError("Condition x > 0"):
           poisson = self._make_poisson(rate=lam, validate_args=True)
           poisson.rate.eval()
 
   def testPoissonLogPmf(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       lam = constant_op.constant([3.0] * batch_size)
       lam_v = 3.0
@@ -68,7 +68,7 @@ class PoissonTest(test.TestCase):
       self.assertAllClose(pmf.eval(), stats.poisson.pmf(x, lam_v))
 
   def testPoissonLogPmfValidateArgs(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       lam = constant_op.constant([3.0] * batch_size)
       x = array_ops.placeholder(dtypes.float32, shape=[6])
@@ -91,7 +91,7 @@ class PoissonTest(test.TestCase):
       self.assertEqual(pmf.get_shape(), (6,))
 
   def testPoissonLogPmfMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       lam = constant_op.constant([[2.0, 4.0, 5.0]] * batch_size)
       lam_v = [2.0, 4.0, 5.0]
@@ -107,7 +107,7 @@ class PoissonTest(test.TestCase):
       self.assertAllClose(pmf.eval(), stats.poisson.pmf(x, lam_v))
 
   def testPoissonCDF(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       lam = constant_op.constant([3.0] * batch_size)
       lam_v = 3.0
@@ -123,7 +123,7 @@ class PoissonTest(test.TestCase):
       self.assertAllClose(cdf.eval(), stats.poisson.cdf(x, lam_v))
 
   def testPoissonCDFNonIntegerValues(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       lam = constant_op.constant([3.0] * batch_size)
       lam_v = 3.0
@@ -142,7 +142,7 @@ class PoissonTest(test.TestCase):
         poisson_validate.cdf(x).eval()
 
   def testPoissonCdfMultidimensional(self):
-    with self.test_session():
+    with self.cached_session():
       batch_size = 6
       lam = constant_op.constant([[2.0, 4.0, 5.0]] * batch_size)
       lam_v = [2.0, 4.0, 5.0]
@@ -158,7 +158,7 @@ class PoissonTest(test.TestCase):
       self.assertAllClose(cdf.eval(), stats.poisson.cdf(x, lam_v))
 
   def testPoissonMean(self):
-    with self.test_session():
+    with self.cached_session():
       lam_v = [1.0, 3.0, 2.5]
       poisson = self._make_poisson(rate=lam_v)
       self.assertEqual(poisson.mean().get_shape(), (3,))
@@ -166,7 +166,7 @@ class PoissonTest(test.TestCase):
       self.assertAllClose(poisson.mean().eval(), lam_v)
 
   def testPoissonVariance(self):
-    with self.test_session():
+    with self.cached_session():
       lam_v = [1.0, 3.0, 2.5]
       poisson = self._make_poisson(rate=lam_v)
       self.assertEqual(poisson.variance().get_shape(), (3,))
@@ -174,7 +174,7 @@ class PoissonTest(test.TestCase):
       self.assertAllClose(poisson.variance().eval(), lam_v)
 
   def testPoissonStd(self):
-    with self.test_session():
+    with self.cached_session():
       lam_v = [1.0, 3.0, 2.5]
       poisson = self._make_poisson(rate=lam_v)
       self.assertEqual(poisson.stddev().get_shape(), (3,))
@@ -182,14 +182,14 @@ class PoissonTest(test.TestCase):
       self.assertAllClose(poisson.stddev().eval(), np.sqrt(lam_v))
 
   def testPoissonMode(self):
-    with self.test_session():
+    with self.cached_session():
       lam_v = [1.0, 3.0, 2.5, 3.2, 1.1, 0.05]
       poisson = self._make_poisson(rate=lam_v)
       self.assertEqual(poisson.mode().get_shape(), (6,))
       self.assertAllClose(poisson.mode().eval(), np.floor(lam_v))
 
   def testPoissonMultipleMode(self):
-    with self.test_session():
+    with self.cached_session():
       lam_v = [1.0, 3.0, 2.0, 4.0, 5.0, 10.0]
       poisson = self._make_poisson(rate=lam_v)
       # For the case where lam is an integer, the modes are: lam and lam - 1.
@@ -198,7 +198,7 @@ class PoissonTest(test.TestCase):
       self.assertAllClose(lam_v, poisson.mode().eval())
 
   def testPoissonSample(self):
-    with self.test_session():
+    with self.cached_session():
       lam_v = 4.0
       lam = constant_op.constant(lam_v)
       # Choosing `n >= (k/rtol)**2, roughly ensures our sample mean should be
@@ -215,7 +215,7 @@ class PoissonTest(test.TestCase):
           sample_values.var(), stats.poisson.var(lam_v), rtol=.01)
 
   def testPoissonSampleMultidimensionalMean(self):
-    with self.test_session():
+    with self.cached_session():
       lam_v = np.array([np.arange(1, 51, dtype=np.float32)])  # 1 x 50
       poisson = self._make_poisson(rate=lam_v)
       # Choosing `n >= (k/rtol)**2, roughly ensures our sample mean should be
@@ -232,7 +232,7 @@ class PoissonTest(test.TestCase):
           atol=0)
 
   def testPoissonSampleMultidimensionalVariance(self):
-    with self.test_session():
+    with self.cached_session():
       lam_v = np.array([np.arange(5, 15, dtype=np.float32)])  # 1 x 10
       poisson = self._make_poisson(rate=lam_v)
       # Choosing `n >= 2 * lam * (k/rtol)**2, roughly ensures our sample
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
index 6a7ee3a8bf..07528cafaf 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
@@ -38,7 +38,7 @@ class QuantizedDistributionTest(test.TestCase):
     self.assertTrue(np.isfinite(array).all())
 
   def testQuantizationOfUniformWithCutoffsHavingNoEffect(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # The Quantized uniform with cutoffs == None divides the real line into:
       # R = ...(-1, 0](0, 1](1, 2](2, 3](3, 4]...
       # j = ...     0     1     2     3     4 ...
@@ -93,7 +93,7 @@ class QuantizedDistributionTest(test.TestCase):
         self.assertAllClose(3 / 3, cdf_5)
 
   def testQuantizationOfUniformWithCutoffsInTheMiddle(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # The uniform is supported on [-3, 3]
       # Consider partitions the real line in intervals
       # ...(-3, -2](-2, -1](-1, 0](0, 1](1, 2](2, 3] ...
@@ -131,7 +131,7 @@ class QuantizedDistributionTest(test.TestCase):
 
   def testQuantizationOfBatchOfUniforms(self):
     batch_shape = (5, 5)
-    with self.test_session():
+    with self.cached_session():
       # The uniforms are supported on [0, 10].  The qdist considers the
       # intervals
       # ... (0, 1](1, 2]...(9, 10]...
@@ -165,7 +165,7 @@ class QuantizedDistributionTest(test.TestCase):
 
   def testSamplingFromBatchOfNormals(self):
     batch_shape = (2,)
-    with self.test_session():
+    with self.cached_session():
       normal = distributions.Normal(
           loc=array_ops.zeros(
               batch_shape, dtype=dtypes.float32),
@@ -199,7 +199,7 @@ class QuantizedDistributionTest(test.TestCase):
     # pretend that the cdf F is a bijection, and hence F(X) is uniform.
     # Note that F cannot be bijection since it is constant between the
     # integers.  Hence, F(X) (see below) will not be uniform exactly.
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Exponential(rate=0.01))
       # X ~ QuantizedExponential
@@ -222,7 +222,7 @@ class QuantizedDistributionTest(test.TestCase):
     # it makes sure the bin edges are consistent.
 
     # Make an exponential with mean 5.
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Exponential(rate=0.2))
       # Standard error should be less than 1 / (2 * sqrt(n_samples))
@@ -243,7 +243,7 @@ class QuantizedDistributionTest(test.TestCase):
     batch_shape = (3, 3)
     mu = rng.randn(*batch_shape)
     sigma = rng.rand(*batch_shape) + 1.0
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(
               loc=mu, scale=sigma))
@@ -260,7 +260,7 @@ class QuantizedDistributionTest(test.TestCase):
     batch_shape = (3, 3)
     mu = rng.randn(*batch_shape)
     sigma = rng.rand(*batch_shape) + 1.0
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(
               loc=mu, scale=sigma))
@@ -275,7 +275,7 @@ class QuantizedDistributionTest(test.TestCase):
 
   def testNormalProbWithCutoffs(self):
     # At integer values, the result should be the same as the standard normal.
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(loc=0., scale=1.),
           low=-2.,
@@ -297,7 +297,7 @@ class QuantizedDistributionTest(test.TestCase):
 
   def testNormalLogProbWithCutoffs(self):
     # At integer values, the result should be the same as the standard normal.
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(loc=0., scale=1.),
           low=-2.,
@@ -335,14 +335,14 @@ class QuantizedDistributionTest(test.TestCase):
         x = np.arange(-100, 100, 2).astype(dtype)
         proba = qdist.log_prob(x)
         grads = gradients_impl.gradients(proba, [mu, sigma])
-        with self.test_session(graph=g):
+        with self.session(graph=g):
           variables.global_variables_initializer().run()
           self._assert_all_finite(proba.eval())
           self._assert_all_finite(grads[0].eval())
           self._assert_all_finite(grads[1].eval())
 
   def testProbAndGradGivesFiniteResultsForCommonEvents(self):
-    with self.test_session():
+    with self.cached_session():
       mu = variables.Variable(0.0, name="mu")
       sigma = variables.Variable(1.0, name="sigma")
       qdist = distributions.QuantizedDistribution(
@@ -360,7 +360,7 @@ class QuantizedDistributionTest(test.TestCase):
       self._assert_all_finite(grads[1].eval())
 
   def testLowerCutoffMustBeBelowUpperCutoffOrWeRaise(self):
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(loc=0., scale=1.),
           low=1.,  # not strictly less than high.
@@ -372,7 +372,7 @@ class QuantizedDistributionTest(test.TestCase):
         qdist.sample().eval()
 
   def testCutoffsMustBeIntegerValuedIfValidateArgsTrue(self):
-    with self.test_session():
+    with self.cached_session():
       low = array_ops.placeholder(dtypes.float32)
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(loc=0., scale=1.),
@@ -385,7 +385,7 @@ class QuantizedDistributionTest(test.TestCase):
         qdist.sample().eval(feed_dict={low: 1.5})
 
   def testCutoffsCanBeFloatValuedIfValidateArgsFalse(self):
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(
               loc=0., scale=1., validate_args=False),
@@ -399,7 +399,7 @@ class QuantizedDistributionTest(test.TestCase):
 
   def testDtypeAndShapeInheritedFromBaseDist(self):
     batch_shape = (2, 3)
-    with self.test_session():
+    with self.cached_session():
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(
               loc=array_ops.zeros(batch_shape),
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py
index 2cf12bbe50..fec2374928 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py
@@ -34,29 +34,29 @@ class RelaxedBernoulliTest(test.TestCase):
     temperature = 1.0
     p = [0.1, 0.4]
     dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(p, dist.probs.eval())
 
   def testLogits(self):
     temperature = 2.0
     logits = [-42., 42.]
     dist = relaxed_bernoulli.RelaxedBernoulli(temperature, logits=logits)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(logits, dist.logits.eval())
 
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(scipy.special.expit(logits), dist.probs.eval())
 
     p = [0.01, 0.99, 0.42]
     dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(scipy.special.logit(p), dist.logits.eval())
 
   def testInvalidP(self):
     temperature = 1.0
     invalid_ps = [1.01, 2.]
     for p in invalid_ps:
-      with self.test_session():
+      with self.cached_session():
         with self.assertRaisesOpError("probs has components greater than 1"):
           dist = relaxed_bernoulli.RelaxedBernoulli(temperature,
                                                     probs=p,
@@ -65,7 +65,7 @@ class RelaxedBernoulliTest(test.TestCase):
 
     invalid_ps = [-0.01, -3.]
     for p in invalid_ps:
-      with self.test_session():
+      with self.cached_session():
         with self.assertRaisesOpError("Condition x >= 0"):
           dist = relaxed_bernoulli.RelaxedBernoulli(temperature,
                                                     probs=p,
@@ -74,13 +74,13 @@ class RelaxedBernoulliTest(test.TestCase):
 
     valid_ps = [0.0, 0.5, 1.0]
     for p in valid_ps:
-      with self.test_session():
+      with self.cached_session():
         dist = relaxed_bernoulli.RelaxedBernoulli(temperature,
                                                   probs=p)
         self.assertEqual(p, dist.probs.eval())
 
   def testShapes(self):
-    with self.test_session():
+    with self.cached_session():
       for batch_shape in ([], [1], [2, 3, 4]):
         temperature = 1.0
         p = np.random.random(batch_shape).astype(np.float32)
@@ -96,7 +96,7 @@ class RelaxedBernoulliTest(test.TestCase):
     p = constant_op.constant([0.1, 0.4])
     dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p,
                                               validate_args=True)
-    with self.test_session():
+    with self.cached_session():
       sample = dist.sample()
       with self.assertRaises(errors_impl.InvalidArgumentError):
         sample.eval()
@@ -117,7 +117,7 @@ class RelaxedBernoulliTest(test.TestCase):
     self.assertEqual(dist64.dtype, dist64.sample(5).dtype)
 
   def testLogProb(self):
-    with self.test_session():
+    with self.cached_session():
       t = np.array(1.0, dtype=np.float64)
       p = np.array(0.1, dtype=np.float64)  # P(x=1)
       dist = relaxed_bernoulli.RelaxedBernoulli(t, probs=p)
@@ -131,7 +131,7 @@ class RelaxedBernoulliTest(test.TestCase):
       self.assertAllClose(expected_log_pdf, log_pdf)
 
   def testBoundaryConditions(self):
-    with self.test_session():
+    with self.cached_session():
       temperature = 1e-2
       dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=1.0)
       self.assertAllClose(np.nan, dist.log_prob(0.0).eval())
@@ -139,7 +139,7 @@ class RelaxedBernoulliTest(test.TestCase):
 
   def testSampleN(self):
     """mean of quantized samples still approximates the Bernoulli mean."""
-    with self.test_session():
+    with self.cached_session():
       temperature = 1e-2
       p = [0.2, 0.6, 0.5]
       dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_onehot_categorical_test.py b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_onehot_categorical_test.py
index faae9da6ad..ff13c2decc 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_onehot_categorical_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_onehot_categorical_test.py
@@ -46,7 +46,7 @@ class ExpRelaxedOneHotCategoricalTest(test.TestCase):
     dist = relaxed_onehot_categorical.ExpRelaxedOneHotCategorical(temperature,
                                                                   logits)
     expected_p = np.exp(logits)/np.sum(np.exp(logits))
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(expected_p, dist.probs.eval())
       self.assertAllEqual([3], dist.probs.get_shape())
 
@@ -57,7 +57,7 @@ class ExpRelaxedOneHotCategoricalTest(test.TestCase):
     p = np.exp(logits)/np.sum(np.exp(logits))
     dist = relaxed_onehot_categorical.ExpRelaxedOneHotCategorical(temperature,
                                                                   logits)
-    with self.test_session():
+    with self.cached_session():
       x = dist.sample().eval()
       # analytical ExpConcrete density presented in Maddison et al. 2016
       prod_term = p*np.exp(-temperature * x)
@@ -74,14 +74,14 @@ class RelaxedOneHotCategoricalTest(test.TestCase):
     logits = [2.0, 3.0, -4.0]
     dist = relaxed_onehot_categorical.RelaxedOneHotCategorical(temperature,
                                                                logits)
-    with self.test_session():
+    with self.cached_session():
       # check p for ExpRelaxed base distribution
       self.assertAllClose(logits, dist._distribution.logits.eval())
       self.assertAllEqual([3], dist._distribution.logits.get_shape())
 
   def testSample(self):
     temperature = 1.4
-    with self.test_session():
+    with self.cached_session():
       # single logit
       logits = [.3, .1, .4]
       dist = relaxed_onehot_categorical.RelaxedOneHotCategorical(temperature,
@@ -115,7 +115,7 @@ class RelaxedOneHotCategoricalTest(test.TestCase):
       expected_pdf = term1*np.power(term2, -k)*term3
       return expected_pdf
 
-    with self.test_session():
+    with self.cached_session():
       temperature = .4
       logits = np.array([[.3, .1, .4]]).astype(np.float32)
       dist = relaxed_onehot_categorical.RelaxedOneHotCategorical(temperature,
@@ -136,7 +136,7 @@ class RelaxedOneHotCategoricalTest(test.TestCase):
       self.assertAllClose(expected_pdf.flatten(), pdf, rtol=1e-4)
 
   def testShapes(self):
-    with self.test_session():
+    with self.cached_session():
       for batch_shape in ([], [1], [2, 3, 4]):
         dist = make_relaxed_categorical(batch_shape, 10)
         self.assertAllEqual(batch_shape, dist.batch_shape.as_list())
@@ -153,12 +153,12 @@ class RelaxedOneHotCategoricalTest(test.TestCase):
         self.assertAllEqual([10], dist.event_shape_tensor().eval())
 
   def testUnknownShape(self):
-    with self.test_session():
+    with self.cached_session():
       logits_pl = array_ops.placeholder(dtypes.float32)
       temperature = 1.0
       dist = relaxed_onehot_categorical.ExpRelaxedOneHotCategorical(temperature,
                                                                     logits_pl)
-      with self.test_session():
+      with self.cached_session():
         feed_dict = {logits_pl: [.3, .1, .4]}
         self.assertAllEqual([3], dist.sample().eval(feed_dict=feed_dict).shape)
         self.assertAllEqual([5, 3],
@@ -166,7 +166,7 @@ class RelaxedOneHotCategoricalTest(test.TestCase):
 
   def testDTypes(self):
     # check that sampling and log_prob work for a range of dtypes
-    with self.test_session():
+    with self.cached_session():
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         logits = random_ops.random_uniform(shape=[3, 3], dtype=dtype)
         dist = relaxed_onehot_categorical.RelaxedOneHotCategorical(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
index ea04e8c29a..d6020e7866 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
@@ -47,7 +47,7 @@ class _AutoCorrelationTest(object):
         input=x_,
         shape=x_.shape if self.use_static_shape else None)
     with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         # Setting normalize = True means we divide by zero.
         auto_corr = sample_stats.auto_correlation(
             x_ph, axis=1, center=False, normalize=False)
@@ -65,7 +65,7 @@ class _AutoCorrelationTest(object):
         input=x_,
         shape=x_.shape if self.use_static_shape else None)
     with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         # Setting normalize = True means we divide by zero.
         auto_corr = sample_stats.auto_correlation(
             x_ph, axis=1, normalize=False, center=True)
@@ -100,7 +100,7 @@ class _AutoCorrelationTest(object):
     x_ph = array_ops.placeholder_with_default(
         x, shape=x.shape if self.use_static_shape else None)
     with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session():
+      with self.cached_session():
         auto_corr = sample_stats.auto_correlation(
             x_ph, axis=axis, max_lags=max_lags, center=center,
             normalize=normalize)
@@ -167,7 +167,7 @@ class _AutoCorrelationTest(object):
     x_ph = array_ops.placeholder_with_default(
         x, shape=(l,) if self.use_static_shape else None)
     with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session():
+      with self.cached_session():
         rxx = sample_stats.auto_correlation(
             x_ph, max_lags=l // 2, center=True, normalize=False)
         if self.use_static_shape:
@@ -188,7 +188,7 @@ class _AutoCorrelationTest(object):
     x_ph = array_ops.placeholder_with_default(
         x, shape=(1000 * 10,) if self.use_static_shape else None)
     with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session():
+      with self.cached_session():
         rxx = sample_stats.auto_correlation(
             x_ph, max_lags=1000 * 10 // 2, center=True, normalize=False)
         if self.use_static_shape:
@@ -209,7 +209,7 @@ class _AutoCorrelationTest(object):
     x_ph = array_ops.placeholder_with_default(
         x, shape=(l,) if self.use_static_shape else None)
     with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session():
+      with self.cached_session():
         rxx = sample_stats.auto_correlation(
             x_ph, max_lags=l // 2, center=True, normalize=True)
         if self.use_static_shape:
@@ -271,7 +271,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
     for q in [0, 10, 25, 49.9, 50, 50.01, 90, 95, 100]:
       expected_percentile = np.percentile(
           x, q=q, interpolation=self._interpolation, axis=0)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(
             x, q=q, interpolation=self._interpolation, axis=[0])
         self.assertAllEqual((), pct.get_shape())
@@ -282,7 +282,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
     for q in [0, 10, 25, 49.9, 50, 50.01, 90, 95, 100]:
       expected_percentile = np.percentile(
           x, q=q, interpolation=self._interpolation)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(x, q=q, interpolation=self._interpolation)
         self.assertAllEqual((), pct.get_shape())
         self.assertAllClose(expected_percentile, pct.eval())
@@ -292,7 +292,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
     for q in [0, 10, 25, 49.9, 50, 50.01, 90, 95, 100]:
       expected_percentile = np.percentile(
           x, q=q, interpolation=self._interpolation, axis=0)
-      with self.test_session():
+      with self.cached_session():
         # Get dim 1 with negative and positive indices.
         pct_neg_index = sample_stats.percentile(
             x, q=q, interpolation=self._interpolation, axis=[0])
@@ -308,7 +308,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
     for q in [0, 10, 25, 49.9, 50, 50.01, 90, 95, 100]:
       expected_percentile = np.percentile(
           x, q=q, interpolation=self._interpolation, axis=0)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(
             x, q=q, interpolation=self._interpolation, axis=[0])
         self.assertAllEqual((2,), pct.get_shape())
@@ -319,7 +319,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
     for q in [0, 10, 25, 49.9, 50, 50.01, 90, 95, 100]:
       expected_percentile = np.percentile(
           x, q=q, interpolation=self._interpolation, keepdims=True, axis=0)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(
             x,
             q=q,
@@ -334,7 +334,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
     for axis in [None, 0, 1, -2, (0,), (-1,), (-1, 1), (3, 1), (-3, 0)]:
       expected_percentile = np.percentile(
           x, q=0.77, interpolation=self._interpolation, axis=axis)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(
             x,
             q=0.77,
@@ -352,7 +352,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
           interpolation=self._interpolation,
           axis=axis,
           keepdims=True)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(
             x,
             q=0.77,
@@ -368,7 +368,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
     for axis in [None, 0, 1, -2, (0,), (-1,), (-1, 1), (3, 1), (-3, 0)]:
       expected_percentile = np.percentile(
           x, q=0.77, interpolation=self._interpolation, axis=axis)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(
             x_ph,
             q=0.77,
@@ -386,7 +386,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
           interpolation=self._interpolation,
           axis=axis,
           keepdims=True)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(
             x_ph,
             q=0.77,
@@ -400,7 +400,7 @@ class PercentileTestWithLowerInterpolation(test.TestCase):
     for q in [0, 10, 25, 49.9, 50, 50.01, 90, 95, 100]:
       expected_percentile = np.percentile(
           x, q=q, interpolation=self._interpolation)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(x, q=q, interpolation=self._interpolation)
         self.assertEqual(dtypes.int32, pct.dtype)
         self.assertAllEqual((), pct.get_shape())
@@ -423,7 +423,7 @@ class PercentileTestWithNearestInterpolation(test.TestCase):
     for q in [0, 10.1, 25.1, 49.9, 50.1, 50.01, 89, 100]:
       expected_percentile = np.percentile(
           x, q=q, interpolation=self._interpolation)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(x, q=q, interpolation=self._interpolation)
         self.assertAllEqual((), pct.get_shape())
         self.assertAllClose(expected_percentile, pct.eval())
@@ -433,7 +433,7 @@ class PercentileTestWithNearestInterpolation(test.TestCase):
     for q in [0, 10.1, 25.1, 49.9, 50.1, 50.01, 89, 100]:
       expected_percentile = np.percentile(
           x, q=q, interpolation=self._interpolation)
-      with self.test_session():
+      with self.cached_session():
         pct = sample_stats.percentile(x, q=q, interpolation=self._interpolation)
         self.assertAllEqual((), pct.get_shape())
         self.assertAllClose(expected_percentile, pct.eval())
@@ -452,7 +452,7 @@ class PercentileTestWithNearestInterpolation(test.TestCase):
     x = [1., 5., 3., 2., 4.]
     q_ph = array_ops.placeholder(dtypes.float32)
     pct = sample_stats.percentile(x, q=q_ph, validate_args=True)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("rank"):
         pct.eval(feed_dict={q_ph: [0.5]})
 
@@ -462,7 +462,7 @@ class PercentileTestWithNearestInterpolation(test.TestCase):
     # If float is used, it fails with InvalidArgumentError about an index out of
     # bounds.
     x = math_ops.linspace(0., 3e7, num=int(3e7))
-    with self.test_session():
+    with self.cached_session():
       minval = sample_stats.percentile(x, q=0, validate_args=True)
       self.assertAllEqual(0, minval.eval())
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
index 243b5a0348..a4d2aa381c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py
@@ -73,7 +73,7 @@ class MakeBatchReadyTest(test.TestCase):
     return y, sample_shape, should_be_x_value
 
   def _test_dynamic(self, x, batch_ndims, event_ndims, expand_batch_dim=True):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x_pl = array_ops.placeholder(x.dtype)
       batch_ndims_pl = array_ops.placeholder(dtypes.int32)
       event_ndims_pl = array_ops.placeholder(dtypes.int32)
@@ -91,7 +91,7 @@ class MakeBatchReadyTest(test.TestCase):
     self.assertAllEqual(x, should_be_x_value_)
 
   def _test_static(self, x, batch_ndims, event_ndims, expand_batch_dim):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       [y_, sample_shape_, should_be_x_value_] = sess.run(
           self._build_graph(x, batch_ndims, event_ndims, expand_batch_dim))
     expected_y, expected_sample_shape = self._get_expected(
@@ -544,7 +544,7 @@ class DistributionShapeTest(test.TestCase):
       self.assertAllEqual(expected_item, next(actual_item))
 
   def testDistributionShapeGetNdimsStatic(self):
-    with self.test_session():
+    with self.cached_session():
       shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       x = 1
       self.assertEqual(0, shaper.get_sample_ndims(x).eval())
@@ -572,7 +572,7 @@ class DistributionShapeTest(test.TestCase):
       self.assertEqual(1, shaper.event_ndims.eval())
 
   def testDistributionShapeGetNdimsDynamic(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_ndims = array_ops.placeholder(dtypes.int32)
       event_ndims = array_ops.placeholder(dtypes.int32)
       shaper = _DistributionShape(
@@ -583,7 +583,7 @@ class DistributionShapeTest(test.TestCase):
       self.assertEqual(2, sess.run(shaper.get_ndims(y), feed_dict=feed_dict))
 
   def testDistributionShapeGetDimsStatic(self):
-    with self.test_session():
+    with self.cached_session():
       shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       x = 1
       self.assertAllEqual((_empty_shape, _empty_shape, _empty_shape),
@@ -597,7 +597,7 @@ class DistributionShapeTest(test.TestCase):
                                _constant(shaper.get_dims(x)))
 
   def testDistributionShapeGetDimsDynamic(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Works for static {batch,event}_ndims despite unfed input.
       shaper = _DistributionShape(batch_ndims=1, event_ndims=2)
       y = array_ops.placeholder(dtypes.float32, shape=(10, None, 5, 5))
@@ -615,7 +615,7 @@ class DistributionShapeTest(test.TestCase):
           ([0], [1], [2, 3]), sess.run(shaper.get_dims(y), feed_dict=feed_dict))
 
   def testDistributionShapeGetShapeStatic(self):
-    with self.test_session():
+    with self.cached_session():
       shaper = _DistributionShape(batch_ndims=0, event_ndims=0)
       self.assertAllEqual((_empty_shape, _empty_shape, _empty_shape),
                           _constant(shaper.get_shape(1.)))
@@ -657,7 +657,7 @@ class DistributionShapeTest(test.TestCase):
                                _constant(shaper.get_shape(np.ones((3, 2, 1)))))
 
   def testDistributionShapeGetShapeDynamic(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Works for static ndims despite unknown static shape.
       shaper = _DistributionShape(batch_ndims=1, event_ndims=1)
       y = array_ops.placeholder(dtypes.int32, shape=(None, None, 2))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/sinh_arcsinh_test.py b/tensorflow/contrib/distributions/python/kernel_tests/sinh_arcsinh_test.py
index 88b48736dd..1811d85b7e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/sinh_arcsinh_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/sinh_arcsinh_test.py
@@ -34,7 +34,7 @@ class SinhArcsinhTest(test.TestCase):
     b = 10
     scale = rng.rand(b) + 0.5
     loc = rng.randn(b)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       norm = ds.Normal(
           loc=loc,
           scale=scale,
@@ -58,7 +58,7 @@ class SinhArcsinhTest(test.TestCase):
           norm_samps.std(axis=0), sasnorm_samps.std(axis=0), atol=0.1)
 
   def test_broadcast_params_dynamic(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       loc = array_ops.placeholder(dtypes.float64)
       scale = array_ops.placeholder(dtypes.float64)
       skewness = array_ops.placeholder(dtypes.float64)
@@ -78,7 +78,7 @@ class SinhArcsinhTest(test.TestCase):
     b = 10
     scale = rng.rand(b) + 0.5
     loc = rng.randn(b)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       lap = ds.Laplace(
           loc=loc,
           scale=scale,
@@ -106,7 +106,7 @@ class SinhArcsinhTest(test.TestCase):
     batch_size = 10
     scale = rng.rand(batch_size) + 0.5
     loc = 0.1 * rng.randn(batch_size)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       norm = ds.Normal(
           loc=loc,
           scale=scale,
@@ -148,7 +148,7 @@ class SinhArcsinhTest(test.TestCase):
     batch_size = 10
     scale = rng.rand(batch_size) + 0.5
     loc = np.float64(0.)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       norm = ds.Normal(
           loc=loc,
           scale=scale,
@@ -190,7 +190,7 @@ class SinhArcsinhTest(test.TestCase):
     batch_size = 10
     scale = rng.rand(batch_size) + 0.5
     loc = rng.randn(batch_size)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sasnorm = ds.SinhArcsinh(
           loc=loc,
           scale=scale,
@@ -201,7 +201,7 @@ class SinhArcsinhTest(test.TestCase):
       np.testing.assert_array_less(loc, sasnorm_samps.mean(axis=0))
 
   def test_pdf_reflected_for_negative_skewness(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sas_pos_skew = ds.SinhArcsinh(
           loc=0.,
           scale=1.,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
index 5fe1331d2c..196cc41335 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
@@ -91,7 +91,7 @@ class TransformedDistributionTest(test.TestCase):
       # sample
       sample = log_normal.sample(100000, seed=235)
       self.assertAllEqual([], log_normal.event_shape)
-      with self.test_session(graph=g):
+      with self.session(graph=g):
         self.assertAllEqual([], log_normal.event_shape_tensor().eval())
         self.assertAllClose(
             sp_dist.mean(), np.mean(sample.eval()), atol=0.0, rtol=0.05)
@@ -107,7 +107,7 @@ class TransformedDistributionTest(test.TestCase):
                    [log_normal.log_survival_function, sp_dist.logsf]]:
         actual = func[0](test_vals)
         expected = func[1](test_vals)
-        with self.test_session(graph=g):
+        with self.session(graph=g):
           self.assertAllClose(expected, actual.eval(), atol=0, rtol=0.01)
 
   def testNonInjectiveTransformedDistribution(self):
@@ -123,7 +123,7 @@ class TransformedDistributionTest(test.TestCase):
       # sample
       sample = abs_normal.sample(100000, seed=235)
       self.assertAllEqual([], abs_normal.event_shape)
-      with self.test_session(graph=g):
+      with self.session(graph=g):
         sample_ = sample.eval()
         self.assertAllEqual([], abs_normal.event_shape_tensor().eval())
 
@@ -147,7 +147,7 @@ class TransformedDistributionTest(test.TestCase):
             abs_normal.log_prob(2.13).eval())
 
   def testQuantile(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       logit_normal = self._cls()(
           distribution=ds.Normal(loc=0., scale=1.),
           bijector=bs.Sigmoid(),
@@ -169,7 +169,7 @@ class TransformedDistributionTest(test.TestCase):
     exp_forward_only._inverse_log_det_jacobian = self._make_unimplemented(
         "inverse_log_det_jacobian ")
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       mu = 3.0
       sigma = 0.02
       log_normal = self._cls()(
@@ -195,7 +195,7 @@ class TransformedDistributionTest(test.TestCase):
 
     log_forward_only = bs.Invert(exp_inverse_only)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # The log bijector isn't defined over the whole real line, so we make
       # sigma sufficiently small so that the draws are positive.
       mu = 2.
@@ -211,7 +211,7 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllClose(expected_log_pdf, log_pdf_val, atol=0.)
 
   def testShapeChangingBijector(self):
-    with self.test_session():
+    with self.cached_session():
       softmax = bs.SoftmaxCentered()
       standard_normal = ds.Normal(loc=0., scale=1.)
       multi_logit_normal = self._cls()(
@@ -235,7 +235,7 @@ class TransformedDistributionTest(test.TestCase):
   def testCastLogDetJacobian(self):
     """Test log_prob when Jacobian and log_prob dtypes do not match."""
 
-    with self.test_session():
+    with self.cached_session():
       # Create an identity bijector whose jacobians have dtype int32
       int_identity = bs.Inline(
           forward_fn=array_ops.identity,
@@ -257,7 +257,7 @@ class TransformedDistributionTest(test.TestCase):
       normal.entropy().eval()
 
   def testEntropy(self):
-    with self.test_session():
+    with self.cached_session():
       shift = np.array([[-1, 0, 1], [-1, -2, -3]], dtype=np.float32)
       diag = np.array([[1, 2, 3], [2, 3, 2]], dtype=np.float32)
       actual_mvn_entropy = np.concatenate([
@@ -277,7 +277,7 @@ class TransformedDistributionTest(test.TestCase):
                           fake_mvn.entropy().eval())
 
   def testScalarBatchScalarEventIdentityScale(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       exp2 = self._cls()(
           ds.Exponential(rate=0.25),
           bijector=ds.bijectors.AffineScalar(scale=2.)
@@ -310,7 +310,7 @@ class ScalarToMultiTest(test.TestCase):
                batch_shape=(),
                event_shape=(),
                not_implemented_message=None):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Overriding shapes must be compatible w/bijector; most bijectors are
       # batch_shape agnostic and only care about event_ndims.
       # In the case of `Affine`, if we got it wrong then it would fire an
@@ -428,7 +428,7 @@ class ScalarToMultiTest(test.TestCase):
         batch_shape=[2],
         not_implemented_message="not implemented")
 
-    with self.test_session():
+    with self.cached_session():
       # Can't override event_shape for scalar batch, non-scalar event.
       with self.assertRaisesRegexp(ValueError, "base distribution not scalar"):
         self._cls()(
@@ -445,7 +445,7 @@ class ScalarToMultiTest(test.TestCase):
         event_shape=[3],
         not_implemented_message="not implemented when overriding event_shape")
 
-    with self.test_session():
+    with self.cached_session():
       # Can't override batch_shape for non-scalar batch, scalar event.
       with self.assertRaisesRegexp(ValueError, "base distribution not scalar"):
         self._cls()(
@@ -456,7 +456,7 @@ class ScalarToMultiTest(test.TestCase):
             validate_args=True)
 
   def testNonScalarBatchNonScalarEvent(self):
-    with self.test_session():
+    with self.cached_session():
       # Can't override event_shape and/or batch_shape for non_scalar batch,
       # non-scalar event.
       with self.assertRaisesRegexp(ValueError, "base distribution not scalar"):
@@ -469,7 +469,7 @@ class ScalarToMultiTest(test.TestCase):
             validate_args=True)
 
   def testMatrixEvent(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_shape = [2]
       event_shape = [2, 3, 3]
       batch_shape_pl = array_ops.placeholder(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
index 04f047aa0c..856579da32 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
@@ -35,7 +35,7 @@ class VectorDiffeomixtureTest(
   """Tests the VectorDiffeomixture distribution."""
 
   def testSampleProbConsistentBroadcastMixNoBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 4
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
@@ -64,7 +64,7 @@ class VectorDiffeomixtureTest(
           sess.run, vdm, radius=4., center=2., rtol=0.015)
 
   def testSampleProbConsistentBroadcastMixNonStandardBase(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 4
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
@@ -93,7 +93,7 @@ class VectorDiffeomixtureTest(
           sess.run, vdm, radius=4., center=3., rtol=0.01)
 
   def testSampleProbConsistentBroadcastMixBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 4
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
@@ -128,7 +128,7 @@ class VectorDiffeomixtureTest(
     dims = 4
     loc_1 = rng.randn(2, 3, dims).astype(np.float32)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=(rng.rand(2, 3, 1) - 0.5).astype(np.float32),
           temperature=[1.],
@@ -152,7 +152,7 @@ class VectorDiffeomixtureTest(
           sess.run, vdm, radius=3., center=loc_1, rtol=0.02)
 
   def testMeanCovarianceNoBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 3
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
@@ -179,7 +179,7 @@ class VectorDiffeomixtureTest(
   def testTemperatureControlsHowMuchThisLooksLikeDiscreteMixture(self):
     # As temperature decreases, this should approach a mixture of normals, with
     # components at -2, 2.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 1
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[0.],
@@ -216,7 +216,7 @@ class VectorDiffeomixtureTest(
           sess.run, vdm, rtol=0.02, cov_rtol=0.08)
 
   def testConcentrationLocControlsHowMuchWeightIsOnEachComponent(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 1
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[-1.], [0.], [1.]],
@@ -259,7 +259,7 @@ class VectorDiffeomixtureTest(
           sess.run, vdm, rtol=0.02, cov_rtol=0.08)
 
   def testMeanCovarianceNoBatchUncenteredNonStandardBase(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 3
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
@@ -284,7 +284,7 @@ class VectorDiffeomixtureTest(
           sess.run, vdm, num_samples=int(1e6), rtol=0.01, cov_atol=0.025)
 
   def testMeanCovarianceBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 3
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
@@ -312,7 +312,7 @@ class VectorDiffeomixtureTest(
           sess.run, vdm, rtol=0.02, cov_rtol=0.07)
 
   def testSampleProbConsistentQuadrature(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dims = 4
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[0.],
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_exponential_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_exponential_diag_test.py
index fd05bd207f..db8186b79a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_exponential_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_exponential_diag_test.py
@@ -37,42 +37,42 @@ class VectorExponentialDiagTest(test.TestCase):
   def testScalarParams(self):
     mu = -1.
     diag = -5.
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesRegexp(ValueError, "at least 1 dimension"):
         ds.VectorExponentialDiag(mu, diag)
 
   def testVectorParams(self):
     mu = [-1.]
     diag = [-5.]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
       self.assertAllEqual([3, 1], dist.sample(3).get_shape())
 
   def testMean(self):
     mu = [-1., 1]
     diag = [1., -5]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
       self.assertAllEqual([-1. + 1., 1. - 5.], dist.mean().eval())
 
   def testMode(self):
     mu = [-1.]
     diag = [1., -5]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
       self.assertAllEqual([-1., -1.], dist.mode().eval())
 
   def testMeanWithBroadcastLoc(self):
     mu = [-1.]
     diag = [1., -5]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
       self.assertAllEqual([-1. + 1, -1. - 5], dist.mean().eval())
 
   def testSample(self):
     mu = [-2., 1]
     diag = [1., -2]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
       samps = dist.sample(int(1e4), seed=0).eval()
       cov_mat = array_ops.matrix_diag(diag).eval()**2
@@ -85,7 +85,7 @@ class VectorExponentialDiagTest(test.TestCase):
   def testSingularScaleRaises(self):
     mu = [-1., 1]
     diag = [1., 0]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
       with self.assertRaisesOpError("Singular"):
         dist.sample().eval()
@@ -97,7 +97,7 @@ class VectorExponentialDiagTest(test.TestCase):
     # diag corresponds to no batches of 3-variate normals
     diag = np.ones([3])
 
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorExponentialDiag(mu, diag, validate_args=True)
 
       mean = dist.mean()
@@ -117,7 +117,7 @@ class VectorExponentialDiagTest(test.TestCase):
                           atol=0.10, rtol=0.05)
 
   def testCovariance(self):
-    with self.test_session():
+    with self.cached_session():
       vex = ds.VectorExponentialDiag(
           loc=array_ops.ones([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
@@ -153,7 +153,7 @@ class VectorExponentialDiagTest(test.TestCase):
           vex.covariance().eval())
 
   def testVariance(self):
-    with self.test_session():
+    with self.cached_session():
       vex = ds.VectorExponentialDiag(
           loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
@@ -178,7 +178,7 @@ class VectorExponentialDiagTest(test.TestCase):
           vex.variance().eval())
 
   def testStddev(self):
-    with self.test_session():
+    with self.cached_session():
       vex = ds.VectorExponentialDiag(
           loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
index 1226c66113..9ee19b7e93 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
@@ -38,14 +38,14 @@ class VectorLaplaceDiagTest(test.TestCase):
   def testScalarParams(self):
     mu = -1.
     diag = -5.
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesRegexp(ValueError, "at least 1 dimension"):
         ds.VectorLaplaceDiag(mu, diag)
 
   def testVectorParams(self):
     mu = [-1.]
     diag = [-5.]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
       self.assertAllEqual([3, 1], dist.sample(3).get_shape())
 
@@ -56,7 +56,7 @@ class VectorLaplaceDiagTest(test.TestCase):
     # Batch shape = [1], event shape = [3]
     mu = array_ops.zeros((1, 3))
     diag = array_ops.ones((1, 3))
-    with self.test_session():
+    with self.cached_session():
       base_dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
       dist = ds.TransformedDistribution(
           base_dist,
@@ -68,21 +68,21 @@ class VectorLaplaceDiagTest(test.TestCase):
   def testMean(self):
     mu = [-1., 1]
     diag = [1., -5]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
       self.assertAllEqual(mu, dist.mean().eval())
 
   def testMeanWithBroadcastLoc(self):
     mu = [-1.]
     diag = [1., -5]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
       self.assertAllEqual([-1., -1.], dist.mean().eval())
 
   def testSample(self):
     mu = [-1., 1]
     diag = [1., -2]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
       samps = dist.sample(int(1e4), seed=0).eval()
       cov_mat = 2. * array_ops.matrix_diag(diag).eval()**2
@@ -95,7 +95,7 @@ class VectorLaplaceDiagTest(test.TestCase):
   def testSingularScaleRaises(self):
     mu = [-1., 1]
     diag = [1., 0]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
       with self.assertRaisesOpError("Singular"):
         dist.sample().eval()
@@ -107,7 +107,7 @@ class VectorLaplaceDiagTest(test.TestCase):
     # diag corresponds to no batches of 3-variate normals
     diag = np.ones([3])
 
-    with self.test_session():
+    with self.cached_session():
       dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
 
       mean = dist.mean()
@@ -126,7 +126,7 @@ class VectorLaplaceDiagTest(test.TestCase):
                           atol=0.10, rtol=0.05)
 
   def testCovariance(self):
-    with self.test_session():
+    with self.cached_session():
       vla = ds.VectorLaplaceDiag(
           loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
@@ -162,7 +162,7 @@ class VectorLaplaceDiagTest(test.TestCase):
           vla.covariance().eval())
 
   def testVariance(self):
-    with self.test_session():
+    with self.cached_session():
       vla = ds.VectorLaplaceDiag(
           loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
@@ -187,7 +187,7 @@ class VectorLaplaceDiagTest(test.TestCase):
           vla.variance().eval())
 
   def testStddev(self):
-    with self.test_session():
+    with self.cached_session():
       vla = ds.VectorLaplaceDiag(
           loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
       self.assertAllClose(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_sinh_arcsinh_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_sinh_arcsinh_diag_test.py
index 2bc6a926dd..0dd7d23eb0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_sinh_arcsinh_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_sinh_arcsinh_diag_test.py
@@ -35,7 +35,7 @@ class VectorSinhArcsinhDiagTest(test_util.VectorDistributionTestHelpers,
     scale_diag = rng.rand(d)
     scale_identity_multiplier = np.float64(1.0)
     loc = rng.randn(d)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       norm = ds.MultivariateNormalDiag(
           loc=loc,
           scale_diag=scale_diag,
@@ -65,7 +65,7 @@ class VectorSinhArcsinhDiagTest(test_util.VectorDistributionTestHelpers,
     scale_diag = rng.rand(d)
     scale_identity_multiplier = np.float64(1.2)
     loc = rng.randn(d)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       vlap = ds.VectorLaplaceDiag(
           loc=loc,
           scale_diag=scale_diag,
@@ -96,7 +96,7 @@ class VectorSinhArcsinhDiagTest(test_util.VectorDistributionTestHelpers,
     scale_diag = rng.rand(d)
     scale_identity_multiplier = np.float64(0.9)
     loc = rng.randn(d)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       norm = ds.MultivariateNormalDiag(
           loc=loc,
           scale_diag=scale_diag,
@@ -141,7 +141,7 @@ class VectorSinhArcsinhDiagTest(test_util.VectorDistributionTestHelpers,
     scale_diag = rng.rand(d)
     scale_identity_multiplier = np.float64(1.0)
     loc = rng.randn(d)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       norm = ds.MultivariateNormalDiag(
           loc=loc,
           scale_diag=scale_diag,
@@ -186,7 +186,7 @@ class VectorSinhArcsinhDiagTest(test_util.VectorDistributionTestHelpers,
     scale_diag = rng.rand(d)
     scale_identity_multiplier = np.float64(1.0)
     loc = rng.randn(d)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sasnorm = ds.VectorSinhArcsinhDiag(
           loc=loc,
           scale_diag=scale_diag,
@@ -201,7 +201,7 @@ class VectorSinhArcsinhDiagTest(test_util.VectorDistributionTestHelpers,
     b, d = 5, 2
     scale_diag = rng.rand(b, d)
     scale_identity_multiplier = np.float64(1.1)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sasnorm = ds.VectorSinhArcsinhDiag(
           scale_diag=scale_diag,
           scale_identity_multiplier=scale_identity_multiplier,
@@ -228,7 +228,7 @@ class VectorSinhArcsinhDiagTest(test_util.VectorDistributionTestHelpers,
     d = 3
     scale_diag = rng.rand(d)
     scale_identity_multiplier = np.float64(1.1)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sasnorm = ds.VectorSinhArcsinhDiag(
           scale_diag=scale_diag,
           scale_identity_multiplier=scale_identity_multiplier,
@@ -252,7 +252,7 @@ class VectorSinhArcsinhDiagTest(test_util.VectorDistributionTestHelpers,
           rtol=0.1)
 
   def test_pdf_reflected_for_negative_skewness(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sas_pos_skew = ds.VectorSinhArcsinhDiag(
           loc=[0.],
           scale_identity_multiplier=1.,
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py
index b8a3a262ce..aaec1f09d9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py
@@ -75,7 +75,7 @@ class VectorStudentTTest(test.TestCase):
     self._rng = np.random.RandomState(42)
 
   def testProbStaticScalar(self):
-    with self.test_session():
+    with self.cached_session():
       # Scalar batch_shape.
       df = np.asarray(3., dtype=np.float32)
       # Scalar batch_shape.
@@ -116,7 +116,7 @@ class VectorStudentTTest(test.TestCase):
     expected_mst = _FakeVectorStudentT(
         df=df, loc=loc, scale_tril=scale_tril)
 
-    with self.test_session():
+    with self.cached_session():
       actual_mst = _VectorStudentT(df=df, loc=loc, scale_diag=scale_diag,
                                    validate_args=True)
       self.assertAllClose(expected_mst.log_prob(x),
@@ -145,7 +145,7 @@ class VectorStudentTTest(test.TestCase):
     expected_mst = _FakeVectorStudentT(
         df=df, loc=loc, scale_tril=scale_tril)
 
-    with self.test_session():
+    with self.cached_session():
       df_pl = array_ops.placeholder(dtypes.float32, name="df")
       loc_pl = array_ops.placeholder(dtypes.float32, name="loc")
       scale_diag_pl = array_ops.placeholder(dtypes.float32, name="scale_diag")
@@ -180,7 +180,7 @@ class VectorStudentTTest(test.TestCase):
         loc=loc,
         scale_tril=scale_tril)
 
-    with self.test_session():
+    with self.cached_session():
       actual_mst = _VectorStudentT(df=df, loc=loc, scale_diag=scale_diag,
                                    validate_args=True)
       self.assertAllClose(expected_mst.log_prob(x),
@@ -211,7 +211,7 @@ class VectorStudentTTest(test.TestCase):
         loc=loc,
         scale_tril=scale_tril)
 
-    with self.test_session():
+    with self.cached_session():
       df_pl = array_ops.placeholder(dtypes.float32, name="df")
       loc_pl = array_ops.placeholder(dtypes.float32, name="loc")
       scale_diag_pl = array_ops.placeholder(dtypes.float32, name="scale_diag")
@@ -240,7 +240,7 @@ class VectorStudentTTest(test.TestCase):
         scale_tril=np.tile(scale_tril[array_ops.newaxis, :, :],
                            reps=[len(df), 1, 1]))
 
-    with self.test_session():
+    with self.cached_session():
       actual_mst = _VectorStudentT(df=df, loc=loc, scale_diag=scale_diag,
                                    validate_args=True)
       self.assertAllClose(expected_mst.log_prob(x),
@@ -266,7 +266,7 @@ class VectorStudentTTest(test.TestCase):
         scale_tril=np.tile(scale_tril[array_ops.newaxis, :, :],
                            reps=[len(df), 1, 1]))
 
-    with self.test_session():
+    with self.cached_session():
       df_pl = array_ops.placeholder(dtypes.float32, name="df")
       loc_pl = array_ops.placeholder(dtypes.float32, name="loc")
       scale_diag_pl = array_ops.placeholder(dtypes.float32, name="scale_diag")
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
index dcecce981f..a60056c444 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
@@ -52,7 +52,7 @@ def wishart_var(df, x):
 class WishartCholeskyTest(test.TestCase):
 
   def testEntropy(self):
-    with self.test_session():
+    with self.cached_session():
       scale = make_pd(1., 2)
       df = 4
       w = distributions.WishartCholesky(df, chol(scale))
@@ -64,7 +64,7 @@ class WishartCholeskyTest(test.TestCase):
       self.assertAllClose(0.78375711047393404, w.entropy().eval())
 
   def testMeanLogDetAndLogNormalizingConstant(self):
-    with self.test_session():
+    with self.cached_session():
 
       def entropy_alt(w):
         return (
@@ -80,35 +80,35 @@ class WishartCholeskyTest(test.TestCase):
       self.assertAllClose(w.entropy().eval(), entropy_alt(w))
 
   def testMean(self):
-    with self.test_session():
+    with self.cached_session():
       scale = make_pd(1., 2)
       df = 4
       w = distributions.WishartCholesky(df, chol(scale))
       self.assertAllEqual(df * scale, w.mean().eval())
 
   def testMode(self):
-    with self.test_session():
+    with self.cached_session():
       scale = make_pd(1., 2)
       df = 4
       w = distributions.WishartCholesky(df, chol(scale))
       self.assertAllEqual((df - 2. - 1.) * scale, w.mode().eval())
 
   def testStd(self):
-    with self.test_session():
+    with self.cached_session():
       scale = make_pd(1., 2)
       df = 4
       w = distributions.WishartCholesky(df, chol(scale))
       self.assertAllEqual(chol(wishart_var(df, scale)), w.stddev().eval())
 
   def testVariance(self):
-    with self.test_session():
+    with self.cached_session():
       scale = make_pd(1., 2)
       df = 4
       w = distributions.WishartCholesky(df, chol(scale))
       self.assertAllEqual(wishart_var(df, scale), w.variance().eval())
 
   def testSample(self):
-    with self.test_session():
+    with self.cached_session():
       scale = make_pd(1., 2)
       df = 4
 
@@ -161,7 +161,7 @@ class WishartCholeskyTest(test.TestCase):
 
   # Test that sampling with the same seed twice gives the same results.
   def testSampleMultipleTimes(self):
-    with self.test_session():
+    with self.cached_session():
       df = 4.
       n_val = 100
 
@@ -184,7 +184,7 @@ class WishartCholeskyTest(test.TestCase):
       self.assertAllClose(samples1, samples2)
 
   def testProb(self):
-    with self.test_session():
+    with self.cached_session():
       # Generate some positive definite (pd) matrices and their Cholesky
       # factorizations.
       x = np.array(
@@ -271,7 +271,7 @@ class WishartCholeskyTest(test.TestCase):
                             w.log_prob(np.reshape(x, (2, 2, 2, 2))).get_shape())
 
   def testBatchShape(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       scale = make_pd(1., 2)
       chol_scale = chol(scale)
 
@@ -295,7 +295,7 @@ class WishartCholeskyTest(test.TestCase):
                    feed_dict={scale_deferred: [chol_scale, chol_scale]}))
 
   def testEventShape(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       scale = make_pd(1., 2)
       chol_scale = chol(scale)
 
@@ -320,7 +320,7 @@ class WishartCholeskyTest(test.TestCase):
                    feed_dict={scale_deferred: [chol_scale, chol_scale]}))
 
   def testValidateArgs(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       df_deferred = array_ops.placeholder(dtypes.float32)
       chol_scale_deferred = array_ops.placeholder(dtypes.float32)
       x = make_pd(1., 3)
@@ -374,7 +374,7 @@ class WishartCholeskyTest(test.TestCase):
                           chol_scale_deferred: np.ones((3, 3))})
 
   def testStaticAsserts(self):
-    with self.test_session():
+    with self.cached_session():
       x = make_pd(1., 3)
       chol_scale = chol(x)
 
@@ -404,7 +404,7 @@ class WishartCholeskyTest(test.TestCase):
                        batch_shape + [dims, dims])
     wishart = distributions.WishartFull(df=5, scale=scale)
     x = wishart.sample(sample_shape, seed=42)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x_ = sess.run(x)
     expected_shape = sample_shape + batch_shape + [dims, dims]
     self.assertAllEqual(expected_shape, x.shape)
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index ad853ee293..affc64a14f 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -152,6 +152,9 @@ class _BaseDeterministic(distribution.Distribution):
     """Relative tolerance for comparing points to `self.loc`."""
     return self._rtol
 
+  def _entropy(self):
+    return array_ops.zeros(self.batch_shape_tensor(), dtype=self.dtype)
+
   def _mean(self):
     return array_ops.identity(self.loc)
 
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index f7933639a0..84517b57c7 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -14,6 +14,7 @@ py_library(
         ":datasets",
         ":metrics",
         ":network",
+        ":remote",
         ":saver",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
@@ -104,7 +105,6 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python/eager:graph_callable",
         "//tensorflow/python/eager:test",
         "//tensorflow/python:variables",
     ],
@@ -224,11 +224,24 @@ py_test(
     ],
 )
 
+py_library(
+    name = "remote",
+    srcs = ["remote.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_test(
     name = "remote_test",
     srcs = ["remote_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":remote",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
diff --git a/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py b/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py
index 0736ed02b7..e5058bfd94 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py
+++ b/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py
@@ -218,7 +218,7 @@ class DensenetBenchmark(tf.test.Benchmark):
     tf.constant(1.).cpu()
 
   def _benchmark_eager_apply(self, label, device_and_format, defun=False,
-                             execution_mode=None, compiled=False):
+                             execution_mode=None):
     with tfe.execution_mode(execution_mode):
       device, data_format = device_and_format
       model = densenet.DenseNet(self.depth, self.growth_rate, self.num_blocks,
@@ -228,7 +228,7 @@ class DensenetBenchmark(tf.test.Benchmark):
                                 weight_decay=1e-4, dropout_rate=0,
                                 pool_initial=True, include_top=True)
       if defun:
-        model.call = tfe.defun(model.call, compiled=compiled)
+        model.call = tfe.defun(model.call)
       batch_size = 64
       num_burn = 5
       num_iters = 30
@@ -264,8 +264,7 @@ class DensenetBenchmark(tf.test.Benchmark):
                              make_iterator,
                              device_and_format,
                              defun=False,
-                             execution_mode=None,
-                             compiled=False):
+                             execution_mode=None):
     with tfe.execution_mode(execution_mode):
       device, data_format = device_and_format
       for batch_size in self._train_batch_sizes():
@@ -279,8 +278,8 @@ class DensenetBenchmark(tf.test.Benchmark):
         optimizer = tf.train.GradientDescentOptimizer(0.1)
         apply_grads = apply_gradients
         if defun:
-          model.call = tfe.defun(model.call, compiled=compiled)
-          apply_grads = tfe.defun(apply_gradients, compiled=compiled)
+          model.call = tfe.defun(model.call)
+          apply_grads = tfe.defun(apply_gradients)
 
         num_burn = 3
         num_iters = 10
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
index 1a5a186e7a..315d7a4893 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
@@ -1056,7 +1056,7 @@
         "\n",
         "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
         "\n",
-        "        predicted_id = tf.multinomial(tf.exp(predictions), num_samples=1)[0][0].numpy()\n",
+        "        predicted_id = tf.multinomial(predictions, num_samples=1)[0][0].numpy()\n",
         "        result.append(index_word[predicted_id])\n",
         "\n",
         "        if index_word[predicted_id] == '<end>':\n",
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
index 027097908f..40bc098724 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
@@ -610,7 +610,7 @@
         "\n",
         "    # using a multinomial distribution to predict the word returned by the model\n",
         "    predictions = predictions / temperature\n",
-        "    predicted_id = tf.multinomial(tf.exp(predictions), num_samples=1)[0][0].numpy()\n",
+        "    predicted_id = tf.multinomial(predictions, num_samples=1)[0][0].numpy()\n",
         "    \n",
         "    # We pass the predicted word as the next input to the model\n",
         "    # along with the previous hidden state\n",
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
index 08d8364978..f1e1f99c57 100644
--- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -466,10 +466,10 @@
         "        # passing the concatenated vector to the GRU\n",
         "        output, state = self.gru(x)\n",
         "        \n",
-        "        # output shape == (batch_size * max_length, hidden_size)\n",
+        "        # output shape == (batch_size * 1, hidden_size)\n",
         "        output = tf.reshape(output, (-1, output.shape[2]))\n",
         "        \n",
-        "        # output shape == (batch_size * max_length, vocab)\n",
+        "        # output shape == (batch_size * 1, vocab)\n",
         "        x = self.fc(output)\n",
         "        \n",
         "        return x, state, attention_weights\n",
@@ -677,7 +677,7 @@
         "        attention_weights = tf.reshape(attention_weights, (-1, ))\n",
         "        attention_plot[t] = attention_weights.numpy()\n",
         "\n",
-        "        predicted_id = tf.multinomial(tf.exp(predictions), num_samples=1)[0][0].numpy()\n",
+        "        predicted_id = tf.multinomial(predictions, num_samples=1)[0][0].numpy()\n",
         "\n",
         "        result += targ_lang.idx2word[predicted_id] + ' '\n",
         "\n",
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
index a28bc8a43d..3f70f573b1 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
@@ -272,8 +272,8 @@ class ResNet50(tf.keras.Model):
       else:
         self.global_pooling = None
 
-  def call(self, input_tensor, training):
-    x = self.conv1(input_tensor)
+  def call(self, inputs, training=True):
+    x = self.conv1(inputs)
     x = self.bn_conv1(x, training=training)
     x = tf.nn.relu(x)
     x = self.max_pool(x)
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 07d8788882..d265169b5e 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -216,12 +216,12 @@ class ResNet50Benchmarks(tf.test.Benchmark):
     tf.constant(1.).cpu()
 
   def _benchmark_eager_apply(self, label, device_and_format, defun=False,
-                             execution_mode=None, compiled=False):
+                             execution_mode=None):
     with tfe.execution_mode(execution_mode):
       device, data_format = device_and_format
       model = resnet50.ResNet50(data_format)
       if defun:
-        model.call = tfe.defun(model.call, compiled=compiled)
+        model.call = tfe.defun(model.call)
       batch_size = 64
       num_burn = 5
       num_iters = 30
@@ -257,8 +257,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
                              make_iterator,
                              device_and_format,
                              defun=False,
-                             execution_mode=None,
-                             compiled=False):
+                             execution_mode=None):
     with tfe.execution_mode(execution_mode):
       device, data_format = device_and_format
       for batch_size in self._train_batch_sizes():
@@ -267,8 +266,8 @@ class ResNet50Benchmarks(tf.test.Benchmark):
         optimizer = tf.train.GradientDescentOptimizer(0.1)
         apply_grads = apply_gradients
         if defun:
-          model.call = tfe.defun(model.call, compiled=compiled)
-          apply_grads = tfe.defun(apply_gradients, compiled=compiled)
+          model.call = tfe.defun(model.call)
+          apply_grads = tfe.defun(apply_gradients)
 
         num_burn = 3
         num_iters = 10
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
index 84b2ddf0de..6a921e1997 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
@@ -226,14 +226,13 @@ class RevNetBenchmark(tf.test.Benchmark):
                              label,
                              device_and_format,
                              defun=False,
-                             execution_mode=None,
-                             compiled=False):
+                             execution_mode=None):
     config = config_.get_hparams_imagenet_56()
     with tfe.execution_mode(execution_mode):
       device, data_format = device_and_format
       model = revnet.RevNet(config=config)
       if defun:
-        model.call = tfe.defun(model.call, compiled=compiled)
+        model.call = tfe.defun(model.call)
       batch_size = 64
       num_burn = 5
       num_iters = 10
@@ -271,8 +270,7 @@ class RevNetBenchmark(tf.test.Benchmark):
                              make_iterator,
                              device_and_format,
                              defun=False,
-                             execution_mode=None,
-                             compiled=False):
+                             execution_mode=None):
     config = config_.get_hparams_imagenet_56()
     with tfe.execution_mode(execution_mode):
       device, data_format = device_and_format
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 6efafccd6b..930e62b680 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -336,9 +336,27 @@ class Mean(Metric):
       return values
     return values, weights
 
-  def result(self):
+  def result(self, write_summary=True):
+    """Returns the result of the Metric.
+
+    Args:
+      write_summary: bool indicating whether to feed the result to the summary
+        before returning.
+    Returns:
+      aggregated metric as float.
+    Raises:
+      ValueError: if the optional argument is not bool
+    """
+     # Convert the boolean to tensor for tf.cond, if it is not.
+    if not isinstance(write_summary, ops.Tensor):
+      write_summary = ops.convert_to_tensor(write_summary)
     t = self.numer / self.denom
-    summary_ops.scalar(name=self.name, tensor=t)
+    def write_summary_f():
+      summary_ops.scalar(name=self.name, tensor=t)
+      return t
+    control_flow_ops.cond(write_summary,
+                          write_summary_f,
+                          lambda: t)
     return t
 
 
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 20d938d492..aa99616810 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -46,6 +46,18 @@ class MetricsTest(test.TestCase):
     self.assertEqual(dtypes.float64, m.dtype)
     self.assertEqual(dtypes.float64, m.result().dtype)
 
+  def testSummaryArg(self):
+    m = metrics.Mean()
+    m([1, 10, 100])
+    m(1000)
+    m([10000.0, 100000.0])
+    self.assertEqual(111111.0/6, m.result(write_summary=True).numpy())
+    self.assertEqual(111111.0/6, m.result(write_summary=False).numpy())
+    with self.assertRaises(ValueError):
+      m.result(write_summary=5)
+    with self.assertRaises(ValueError):
+      m.result(write_summary=[True])
+
   def testVariableCollections(self):
     with context.graph_mode(), ops.Graph().as_default():
       m = metrics.Mean()
@@ -93,6 +105,16 @@ class MetricsTest(test.TestCase):
     self.assertEqual(len(events), 2)
     self.assertEqual(events[1].summary.value[0].simple_value, 37.0)
 
+    # Get result without saving the summary.
+    logdir = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logdir, max_queue=0,
+        name="t0").as_default(), summary_ops.always_record_summaries():
+      m.result(write_summary=False)  # As a side-effect will write summaries.
+      # events_from_logdir(_) asserts the directory exists.
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(len(events), 1)
+
   def testWeightedMean(self):
     m = metrics.Mean()
     m([1, 100, 100000], weights=[1, 0.2, 0.3])
diff --git a/tensorflow/contrib/eager/python/remote.py b/tensorflow/contrib/eager/python/remote.py
new file mode 100644
index 0000000000..b74cf394f6
--- /dev/null
+++ b/tensorflow/contrib/eager/python/remote.py
@@ -0,0 +1,73 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers to connect to remote servers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.core.protobuf.cluster_pb2 import ClusterDef
+from tensorflow.core.protobuf.tensorflow_server_pb2 import ServerDef
+from tensorflow.python.eager import context
+
+
+def connect_to_remote_host(remote_host=None, job_name="worker"):
+  """Connects to a single machine to enable remote execution on it.
+
+  Will make devices on the remote host available to use. Note that calling this
+  more than once will work, but will invalidate any tensor handles on the old
+  remote devices.
+
+  Using the default job_name of worker, you can schedule ops to run remotely as
+  follows:
+  ```python
+  # Enable eager execution, and connect to the remote host.
+  tf.enable_eager_execution()
+  tf.contrib.eager.connect_to_remote_host("exampleaddr.com:9876")
+
+  with ops.device("job:worker/replica:0/task:1/device:CPU:0"):
+    # The following tensors should be resident on the remote device, and the op
+    # will also execute remotely.
+    x1 = array_ops.ones([2, 2])
+    x2 = array_ops.ones([2, 2])
+    y = math_ops.matmul(x1, x2)
+  ```
+
+  Args:
+    remote_host: The addr of the remote server in host-port format.
+    job_name: The job name under which the new server will be accessible.
+
+  Raises:
+    ValueError: if remote_host is None.
+  """
+  if remote_host is None:
+    raise ValueError("Must provide an remote_host")
+  cluster_def = ClusterDef()
+  job_def = cluster_def.job.add()
+  job_def.name = job_name
+  job_def.tasks[0] = "127.0.0.1:0"
+  job_def.tasks[1] = remote_host
+
+  server_def = ServerDef(
+      cluster=cluster_def,
+      job_name=job_name,
+      task_index=0,
+      protocol="grpc")
+
+  # TODO(nareshmodi): Make this default since it works in more situations.
+  os.environ["TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC"] = "1"
+  context.set_server_def(server_def)
diff --git a/tensorflow/contrib/eager/python/remote_test.py b/tensorflow/contrib/eager/python/remote_test.py
index 76f48eeb1c..13029db975 100644
--- a/tensorflow/contrib/eager/python/remote_test.py
+++ b/tensorflow/contrib/eager/python/remote_test.py
@@ -23,6 +23,7 @@ import os
 
 import numpy as np
 
+from tensorflow.contrib.eager.python import remote
 from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python.eager import backprop
@@ -85,6 +86,7 @@ class RemoteExecutionTest(test.TestCase):
     self._cached_server1_target = self._cached_server1.target[len("grpc://"):]
     self._cached_server2_target = self._cached_server2.target[len("grpc://"):]
 
+  def setUp(self):
     # Start the local server.
     context.set_server_def(
         server_def=get_server_def(
@@ -172,6 +174,17 @@ class RemoteExecutionTest(test.TestCase):
     y = math_ops.matmul(x1, x1)
     np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
 
+  @run_sync_and_async
+  def testConnectToRemoteServer(self):
+    """Basic server connection."""
+    remote.connect_to_remote_host(self._cached_server1_target)
+
+    with ops.device("job:worker/replica:0/task:1/device:CPU:0"):
+      x1 = array_ops.ones([2, 2])
+      x2 = array_ops.ones([2, 2])
+      y = math_ops.matmul(x1, x2)
+    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
+
 
 if __name__ == "__main__":
   ops.enable_eager_execution()
diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py
index 90a3711475..91bc75213c 100644
--- a/tensorflow/contrib/eager/python/saver_test.py
+++ b/tensorflow/contrib/eager/python/saver_test.py
@@ -21,15 +21,11 @@ import os
 
 from tensorflow.contrib.eager.python import saver as _saver
 from tensorflow.python.eager import context
-from tensorflow.python.eager import graph_callable
 from tensorflow.python.eager import test
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import momentum
@@ -142,53 +138,6 @@ class SaverTest(test.TestCase):
         with _saver.restore_variables_on_create(ckpt_prefix):
           _ = model(resource_variable_ops.ResourceVariable(1.0, name='v2'))
 
-  def testSaveRestoreGraphCallable(self):
-    with ops.device(self._dev()):
-      @graph_callable.graph_callable(
-          [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)])
-      def model(x):
-        v = variable_scope.get_variable(
-            'v', initializer=init_ops.zeros_initializer(), shape=())
-        return v + x
-
-      # Default 2 + 0 = 2
-      self.assertEqual(
-          2, model(array_ops.constant(2, dtype=dtypes.float32)).numpy())
-
-      # Save the variable value 0.
-      ckpt_prefix = os.path.join(test.get_temp_dir(), 'ckpt')
-      _saver.Saver(model.variables).save(ckpt_prefix)
-
-      # update variable to 1, so that 2 + 1 = 3
-      model.variables[0].assign(1.)
-      self.assertEqual(
-          3, model(array_ops.constant(2, dtype=dtypes.float32)).numpy())
-
-      # load the variable value 0, so that 2 + 0 = 2
-      _saver.Saver(model.variables).restore(ckpt_prefix)
-      self.assertEqual(
-          2, model(array_ops.constant(2, dtype=dtypes.float32)).numpy())
-
-      # update checkpoint variable to 1 and memory value to 2.
-      model.variables[0].assign(1.)
-      _saver.Saver(model.variables).save(ckpt_prefix)
-      model.variables[0].assign(2.)
-      self.assertEqual(
-          4, model(array_ops.constant(2, dtype=dtypes.float32)).numpy())
-
-      # reset the graph and reload on create, so that 1 + 2 = 3
-      ops.reset_default_graph()
-      with _saver.restore_variables_on_create(ckpt_prefix):
-        @graph_callable.graph_callable(
-            [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)])
-        def model2(x):
-          v = variable_scope.get_variable(
-              'v', initializer=init_ops.zeros_initializer(), shape=())
-          return v + x
-
-        self.assertEqual(
-            3, model2(array_ops.constant(2, dtype=dtypes.float32)).numpy())
-
 
 class GetOptimizerTests(test.TestCase):
 
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index de11d00a1a..fe7f1b72fc 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -16,7 +16,7 @@
 
 EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
-To use, at program startup, call `tfe.enable_eager_execution()`.
+To use, at program startup, call `tf.enable_eager_execution()`.
 
 @@metrics
 
@@ -67,12 +67,15 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@execution_mode
 @@async_wait
 @@async_clear_error
+@@set_server_def
 
 @@run_test_in_graph_and_eager_modes
 @@run_all_tests_in_graph_and_eager_modes
 
 @@TensorSpec
 
+@@connect_to_cloud_tpu
+
 @@DEVICE_PLACEMENT_EXPLICIT
 @@DEVICE_PLACEMENT_WARN
 @@DEVICE_PLACEMENT_SILENT
@@ -93,6 +96,7 @@ from tensorflow.contrib.eager.python.network import Network
 from tensorflow.contrib.eager.python.network import Sequential
 from tensorflow.contrib.eager.python.network import save_network_checkpoint
 from tensorflow.contrib.eager.python.network import restore_network_checkpoint
+from tensorflow.contrib.eager.python.remote import connect_to_remote_host
 from tensorflow.contrib.eager.python.saver import get_optimizer_variables
 from tensorflow.contrib.eager.python.saver import restore_variables_on_create
 from tensorflow.contrib.eager.python.saver import Saver
@@ -110,6 +114,7 @@ from tensorflow.python.eager.context import async_clear_error
 from tensorflow.python.eager.context import SYNC
 from tensorflow.python.eager.context import ASYNC
 from tensorflow.python.eager.context import num_gpus
+from tensorflow.python.eager.context import set_server_def
 from tensorflow.python.eager.execution_callbacks import add_execution_callback
 from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks
 from tensorflow.python.eager.execution_callbacks import inf_callback
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 82272bf120..77f62df99d 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -20,6 +20,7 @@ py_library(
         ":dnn_linear_combined",
         ":early_stopping",
         ":export",
+        ":exporter",
         ":extenders",
         ":head",
         ":hooks",
@@ -220,6 +221,33 @@ py_test(
 )
 
 py_library(
+    name = "exporter",
+    srcs = [
+        "python/estimator/exporter.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python/estimator:exporter",
+    ],
+)
+
+py_test(
+    name = "exporter_test",
+    size = "medium",
+    srcs = ["python/estimator/exporter_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":exporter",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:exporter",
+    ],
+)
+
+py_library(
     name = "head",
     srcs = [
         "python/estimator/head.py",
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index e1453ae1d0..258860f263 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -45,6 +45,7 @@ _allowed_symbols = [
     'clip_gradients_by_norm',
     'forward_features',
     'InMemoryEvaluatorHook',
+    'make_stop_at_checkpoint_step_hook',
     'logistic_regression_head',
     'multi_class_head',
     'multi_head',
diff --git a/tensorflow/contrib/estimator/python/estimator/export.py b/tensorflow/contrib/estimator/python/estimator/export.py
index 03cf6f107c..b0deb9b494 100644
--- a/tensorflow/contrib/estimator/python/estimator/export.py
+++ b/tensorflow/contrib/estimator/python/estimator/export.py
@@ -31,8 +31,8 @@ def export_saved_model_for_mode(
   # pylint: disable=line-too-long
   """Exports a single train/eval/predict graph as a SavedModel.
 
-  For a detailed guide, see
-  @{$saved_model#using_savedmodel_with_estimators$Using SavedModel with Estimators}.
+  For a detailed guide, see [Using SavedModel with Estimators](
+  https://tensorflow.org/guide/saved_model#using_savedmodel_with_estimators).
 
   Sample usage:
   ```python
diff --git a/tensorflow/contrib/estimator/python/estimator/exporter.py b/tensorflow/contrib/estimator/python/estimator/exporter.py
new file mode 100644
index 0000000000..09d7440605
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/exporter.py
@@ -0,0 +1,280 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements StepsExporter to export the model in user specified steps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.estimator import exporter
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.summary import summary_iterator
+
+DEFAULT_GLOBAL_STEP_KEY = ops.GraphKeys.GLOBAL_STEP
+
+
+class StepsExporter(exporter.Exporter):
+  """This class exports the model in user specified steps.
+
+  This class exports the model at the steps given by the `steps_to_keep`
+  argument. Each number in the list is treated as a lower bound for model
+  exports, to handle the case when evaluation is performed at different steps.
+
+  Consider this example:
+
+  ```
+  steps_to_keep = [1, 2, 3, 6, 7, 10, 12, 25]
+  ```
+
+  The model is evaluated at step increments of 5: `[5, 10, 15, 20, 25, 30]`.
+  The `StepsExporter` will export the model when it has reached steps
+  `[5, 10, 15, 25]`.
+
+  This example illustrates the two cases when the model is exported:
+
+  1. Model is evaluated on a step defined in the list `steps_to_keep`.
+
+     In the example, the model is exported on step `10` and `25`.
+
+  2. Model is evaluated on a step not defined in the list `steps_to_keep`, but
+     is still exported because a step in `steps_to_keep` was missed.
+
+     In the example, when the model reaches step `5`, the model is exported even
+     though  `steps_to_keep` does not contain `5`. Step `5` is exported to make
+     up for step `3`, which was missed. Steps `1` and `2` in `steps_to_keep` are
+     skipped completely (e.g. say the model is evaluated at step `6`. It will
+     **not** be exported to make up for step `2`).
+
+  Using the `steps_to_keep` list as a lower bound allows users to define
+  approximate step boundaries for exporting their models, and avoid frustrating
+  off-by-one calculation errors.
+
+  Sample Use Cases:
+    There are specific points during the training when having a saved version of
+    the model would be useful. One example is at the end of each training phase
+    when the set of freezed weights is changed.
+    Another good use case is saving the model at the end of each epoch for
+    visualization or retraining.
+  """
+
+  def __init__(self,
+               steps_to_keep,
+               name='steps_exporter',
+               serving_input_receiver_fn=None,
+               event_file_pattern='eval/*.tfevents.*',
+               assets_extra=None,
+               as_text=False):
+    """Create an `StepsExporter` to use with `tf.estimator.EvalSpec`.
+
+    Example of creating a StepsExporter for training and evaluation:
+
+    ```python
+    categorical_feature_a = categorical_column_with_hash_bucket(...)
+    categorical_feature_b = categorical_column_with_hash_bucket(...)
+
+    categorical_feature_a_emb = embedding_column(
+        categorical_column=categorical_feature_a, ...)
+    categorical_feature_b_emb = embedding_column(
+        categorical_column=categorical_feature_b, ...)
+
+    estimator = tf.estimator.DNNClassifier(
+        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
+        hidden_units=[1024, 512, 256])
+
+    # Input pipeline for train and evaluate.
+    def train_input_fn: # returns x, y
+      # please shuffle the data.
+      pass
+    def eval_input_fn_eval: # returns x, y
+      pass
+
+    exporter = tf.contrib.estimator.exporter.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=serving_input_receiver_fn,
+        event_file_pattern='eval/*.tfevents.*'
+        steps_to_keep=[...])
+
+    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=1000)
+
+    eval_spec = [tf.estimator.EvalSpec(
+      input_fn=eval_input_fn,
+      steps=1,
+      exporters=exporter,
+      start_delay_secs=0,
+      throttle_secs=5)]
+
+    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
+
+    # Models will be exported to estimator.model_dir in timestamped directories,
+    # which can be used for serving, analysis with TFMA, or directly loaded in.
+    # For example:
+    export_dir = os.path.join(estimator.model_dir,
+                              <timestamped directory name>)
+
+    with ops.Graph().as_default() as graph:
+      with session.Session(graph=graph) as sess:
+        tf.saved_model.loader.load(
+            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
+
+    ```
+
+    Args:
+      steps_to_keep: Non-empty list of positive integers containing
+        the step numbers at which the model should be exported. All the exports
+        will be kept, so there is no garbage collection.
+      name: Unique name of this `Exporter` that is going to be used in the
+        export path.
+      serving_input_receiver_fn: A function that takes no arguments and returns
+        a `ServingInputReceiver`.
+      event_file_pattern: Event file name pattern relative to model_dir. If
+        None, however, the exporter would not be preemption-safe. To be
+        preemption-safe, event_file_pattern should be specified.
+      assets_extra: An optional dict specifying how to populate the assets.extra
+        directory within the exported SavedModel.  Each key should give the
+        destination path (including the filename) relative to the assets.extra
+        directory.  The corresponding value gives the full path of the source
+        file to be copied.  For example, the simple case of copying a single
+        file without renaming it is specified as `{'my_asset_file.txt':
+        '/path/to/my_asset_file.txt'}`.
+      as_text: Whether to write the SavedModel proto in text format. Defaults to
+        `False`.
+
+    Raises:
+      ValueError: If any arguments is invalid.
+    """
+    # pylint: disable=protected-access
+    self._saved_model_exporter = exporter._SavedModelExporter(
+        name, serving_input_receiver_fn, assets_extra, as_text)
+    # pylint: enable=protected-access
+
+    self._event_file_pattern = event_file_pattern
+    self._model_dir = None
+
+    self._input_steps_to_keep = steps_to_keep
+    steps_to_keep = [step for step in steps_to_keep if isinstance(step, int)]
+    steps_to_keep = [step for step in steps_to_keep if step > 0]
+    if not steps_to_keep:
+      raise ValueError(
+          '`steps_to_keep` list must have at least one positive integer')
+    elif self._input_steps_to_keep != steps_to_keep:
+      tf_logging.warn('Changed `steps_to_keep`, by omitting non-integer or'
+                      ' less than 1 elements, to [%s]',
+                      ', '.join(str(step) for step in steps_to_keep))
+    self._steps_to_keep = sorted(steps_to_keep)
+    self._steps_kept = []
+
+  @property
+  def name(self):
+    return self._saved_model_exporter.name
+
+  def export(self, estimator, export_path, checkpoint_path, eval_result,
+             is_the_final_export):
+    """Exports the given Estimator to a specific format.
+
+    Args:
+      estimator: A `tf.estimator.Estimator` instance to export.
+      export_path: A string containing a directory where to write the export.
+      checkpoint_path: The checkpoint path to export.
+      eval_result: The output of Estimator.evaluate on this checkpoint.
+      is_the_final_export: This boolean is True when this is an export in the
+        end of training. It is False for the intermediate exports during the
+        training. When passing Exporter to tf.estimator.train_and_evaluate
+        is_the_final_export is always False if TrainSpec.max_steps is None.
+
+    Returns:
+      The string path to the exported directory or None if export is skipped.
+
+    Raises:
+      ValueError: If `eval_result` is None or doesn't have
+        `ops.GraphKeys.GLOBAL_STEP` as a key.
+    """
+    export_result = None
+
+    if not eval_result or DEFAULT_GLOBAL_STEP_KEY not in eval_result:
+      raise ValueError(
+          '`eval_result` is empty, or does not have global step. This'
+          ' should never happen as Estimator always sets the global step in '
+          '`eval_result`. Please file a bug report. Got eval_result: %s'
+          % str(eval_result))
+
+    if self._model_dir != estimator.model_dir and self._event_file_pattern:
+      tf_logging.info('Loads the steps that the model was already evaluated at,'
+                      'from event files')
+      self._model_dir = estimator.model_dir
+      full_event_file_pattern = os.path.join(self._model_dir,
+                                             self._event_file_pattern)
+      self._steps_kept = self._get_kept_steps(full_event_file_pattern)
+
+      if self._steps_kept:
+        self._steps_kept = sorted(self._steps_kept)
+        self._steps_to_keep = [step for step in self._steps_to_keep if
+                               step > self._steps_kept[-1]]
+    # It is assumed that the model is exported at any evaluated step 'n' if
+    # there is any `steps_missed` lower than 'n'. As a result, all the steps in
+    # `_steps_to_keep` lower than the last evaluated step will be removed.
+    steps_missed = [step for step in self._steps_to_keep
+                    if step <= eval_result[DEFAULT_GLOBAL_STEP_KEY]]
+
+    if steps_missed:
+      # update the `_steps_to_keep` list by omitting all steps smaller than the
+      # current global step which are missed to be exported
+      export_result = self._saved_model_exporter.export(estimator, export_path,
+                                                        checkpoint_path,
+                                                        eval_result,
+                                                        is_the_final_export)
+      self._steps_to_keep = [step for step in self._steps_to_keep if step
+                             not in steps_missed]
+      # contains all the steps in which export has happened.
+      self._steps_kept.append(eval_result[DEFAULT_GLOBAL_STEP_KEY])
+      # Show warning for all the missed steps except the last one
+      if steps_missed[:-1]:
+        tf_logging.warn('Missed steps [%s] for exporting, as no evaluation'
+                        ' took place at them.', ', '.join(str(step) for step in
+                                                          steps_missed[:-1]))
+      # Log model export if the last missed step is the same as the current step
+      if steps_missed[-1] == eval_result[DEFAULT_GLOBAL_STEP_KEY]:
+        tf_logging.info('Performing model export at step %d.',
+                        eval_result[DEFAULT_GLOBAL_STEP_KEY])
+      # Show warning for exporting model at another step instead of the user
+      #   specified one
+      else:
+        tf_logging.warn('Performing model export at step %d instead of %d, as'
+                        ' no evaluation took place at step %d.',
+                        eval_result[DEFAULT_GLOBAL_STEP_KEY], steps_missed[-1],
+                        steps_missed[-1])
+    return export_result
+
+  def _get_kept_steps(self, event_files):
+    """Get the steps that the model was evaluated at, from event files.
+
+    Args:
+      event_files: Absolute pattern of event files.
+
+    Returns:
+      steps_kept: A list of steps in which the model was evaluated.
+    """
+    if not event_files:
+      return None
+
+    steps_kept = []
+    for event_file in gfile.Glob(os.path.join(event_files)):
+      for event in summary_iterator.summary_iterator(event_file):
+        if event.step not in steps_kept:
+          steps_kept.append(event.step)
+    return steps_kept
diff --git a/tensorflow/contrib/estimator/python/estimator/exporter_test.py b/tensorflow/contrib/estimator/python/estimator/exporter_test.py
new file mode 100644
index 0000000000..0d009b945e
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/exporter_test.py
@@ -0,0 +1,206 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `StepsExporter`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+from tensorflow.contrib.estimator.python.estimator import exporter as exporter_lib
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+class StepsExporterTest(test.TestCase):
+
+  def test_error_out_if_steps_to_keep_has_no_positive_integers(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    with self.assertRaisesRegexp(ValueError, "positive integer"):
+      exporter = exporter_lib.StepsExporter(
+          name="specified_steps_exporter",
+          serving_input_receiver_fn=_serving_input_receiver_fn,
+          steps_to_keep=[-1, 0, 1.1])
+      self.assertEqual("specified_steps_exporter", exporter.name)
+
+  def test_steps_exporter(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    exporter = exporter_lib.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        steps_to_keep=[1])
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.export_savedmodel.return_value = "export_result_path"
+    estimator.model_dir = export_dir_base
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 1},
+                                    False)
+
+    self.assertEqual("export_result_path", export_result)
+    estimator.export_savedmodel.assert_called_with(
+        export_dir_base,
+        _serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        checkpoint_path="checkpoint_path",
+        strip_default_attrs=True)
+
+    shutil.rmtree(export_dir_base, ignore_errors=True)
+
+  def test_steps_exporter_with_preemption(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    eval_dir_base = os.path.join(export_dir_base, "eval_continuous")
+    estimator_lib._write_dict_to_summary(eval_dir_base, {}, 1)
+    estimator_lib._write_dict_to_summary(eval_dir_base, {}, 2)
+
+    exporter = exporter_lib.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        event_file_pattern="eval_continuous/*.tfevents.*",
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        steps_to_keep=[1, 2, 6, 8])
+
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.model_dir = export_dir_base
+    estimator.export_savedmodel.return_value = "export_result_path"
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 3},
+                                    False)
+    self.assertEqual(None, export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 6},
+                                    False)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 7},
+                                    False)
+    self.assertEqual(None, export_result)
+
+    shutil.rmtree(export_dir_base, ignore_errors=True)
+
+  def test_specified_step_is_saved(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    exporter = exporter_lib.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        steps_to_keep=[1, 5, 8, 10, 11])
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.export_savedmodel.return_value = "export_result_path"
+    estimator.model_dir = export_dir_base
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 1},
+                                    False)
+
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 2},
+                                    False)
+    self.assertEqual(None, export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 5},
+                                    False)
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 10},
+                                    False)
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 15},
+                                    False)
+    self.assertTrue(estimator.export_savedmodel.called)
+    self.assertEqual("export_result_path", export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"global_step": 20},
+                                    False)
+    self.assertEqual(None, export_result)
+
+    shutil.rmtree(export_dir_base, ignore_errors=True)
+
+  def test_steps_exporter_with_no_global_step_key(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    exporter = exporter_lib.StepsExporter(
+        name="steps_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        steps_to_keep=[1])
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.export_savedmodel.return_value = "export_result_path"
+    estimator.model_dir = export_dir_base
+
+    with self.assertRaisesRegexp(ValueError, "does not have global step"):
+      exporter.export(estimator, export_dir_base, "checkpoint_path", {}, False)
+
+    shutil.rmtree(export_dir_base, ignore_errors=True)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 26449b4651..e3c44bea66 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -26,6 +26,7 @@ from tensorflow.python.estimator.export.export_output import PredictOutput
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.util import function_utils
 
@@ -140,7 +141,7 @@ def clip_gradients_by_norm(optimizer, clip_norm):
       name='ClipByNorm' + optimizer.get_name())
 
 
-def forward_features(estimator, keys=None):
+def forward_features(estimator, keys=None, sparse_default_values=None):
   """Forward features to predictions dictionary.
 
   In some cases, user wants to see some of the features in estimators prediction
@@ -148,39 +149,36 @@ def forward_features(estimator, keys=None):
   runs inference on the users graph and returns the results. Keys are essential
   because there is no order guarantee on the outputs so they need to be rejoined
   to the inputs via keys or transclusion of the inputs in the outputs.
-
   Example:
-
   ```python
     def input_fn():
       features, labels = ...
       features['unique_example_id'] = ...
       features, labels
-
     estimator = tf.estimator.LinearClassifier(...)
     estimator = tf.contrib.estimator.forward_features(
         estimator, 'unique_example_id')
     estimator.train(...)
     assert 'unique_example_id' in estimator.predict(...)
   ```
-
   Args:
     estimator: A `tf.estimator.Estimator` object.
-    keys: a `string` or a `list` of `string`. If it is `None`, all of the
+    keys: A `string` or a `list` of `string`. If it is `None`, all of the
       `features` in `dict` is forwarded to the `predictions`. If it is a
       `string`, only given key is forwarded. If it is a `list` of strings, all
       the given `keys` are forwarded.
+    sparse_default_values: A dict of `str` keys mapping the name of the sparse
+      features to be converted to dense, to the default value to use. Only
+      sparse features indicated in the dictionary are converted to dense and the
+      provided default value is used.
 
   Returns:
       A new `tf.estimator.Estimator` which forwards features to predictions.
-
   Raises:
     ValueError:
       * if `keys` is already part of `predictions`. We don't allow
         override.
       * if 'keys' does not exist in `features`.
-      * if feature key refers to a `SparseTensor`, since we don't support
-        `SparseTensor` in `predictions`. `SparseTensor` is common in `features`.
     TypeError: if `keys` type is not one of `string` or list/tuple of `string`.
   """
 
@@ -231,11 +229,18 @@ def forward_features(estimator, keys=None):
     for key in get_keys(features):
       feature = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
           features[key])
+      if sparse_default_values and (key in sparse_default_values):
+        if not isinstance(feature, sparse_tensor_lib.SparseTensor):
+          raise ValueError(
+              'Feature ({}) is expected to be a `SparseTensor`.'.format(key))
+        feature = sparse_ops.sparse_tensor_to_dense(
+            feature, default_value=sparse_default_values[key])
       if not isinstance(feature, ops.Tensor):
         raise ValueError(
-            'Forwarded feature ({}) should be a Tensor. Please use keys '
-            'argument of forward_features to filter unwanted features. Type of '
-            'features[{}] is {}.'.format(key, key, type(feature)))
+            'Feature ({}) should be a Tensor. Please use `keys` '
+            'argument of forward_features to filter unwanted features, or'
+            'add key to argument `sparse_default_values`.'
+            'Type of features[{}] is {}.'.format(key, key, type(feature)))
       predictions[key] = feature
     spec = spec._replace(predictions=predictions)
     if spec.export_outputs:
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders_test.py b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
index 407af2deaf..c8fdaa8791 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """extenders tests."""
 
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -23,6 +24,7 @@ import tempfile
 import numpy as np
 
 from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.predictor import from_saved_model
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator_lib
@@ -170,19 +172,53 @@ class ClipGradientsByNormTest(test.TestCase):
 class ForwardFeaturesTest(test.TestCase):
   """Tests forward_features."""
 
-  def test_forward_single_key(self):
-
-    def input_fn():
-      return {'x': [[3.], [5.]], 'id': [[101], [102]]}, [[1.], [2.]]
+  def _export_estimator(self, estimator, serving_input_fn):
+    tmpdir = tempfile.mkdtemp()
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir = estimator.export_savedmodel(export_dir_base, serving_input_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+    return export_dir, tmpdir
 
+  def make_dummy_input_fn(self):
+    def _input_fn():
+      dataset = dataset_ops.Dataset.from_tensors({
+          'x': [[3.], [5.]],
+          'id': [[101], [102]],
+          'sparse_id': sparse_tensor.SparseTensor(
+              values=[1, 2, 3],
+              indices=[[0, 0], [1, 0], [1, 1]],
+              dense_shape=[2, 2]),
+          'labels': [[1.], [2.]]
+      })
+      def _split(x):
+        labels = x.pop('labels')
+        return x, labels
+      dataset = dataset.map(_split)
+      return dataset
+    return _input_fn
+
+  def test_forward_keys(self):
+
+    input_fn = self.make_dummy_input_fn()
     estimator = linear.LinearRegressor([fc.numeric_column('x')])
     estimator.train(input_fn=input_fn, steps=1)
 
-    self.assertNotIn('id', next(estimator.predict(input_fn=input_fn)))
-    estimator = extenders.forward_features(estimator, 'id')
-    predictions = next(estimator.predict(input_fn=input_fn))
-    self.assertIn('id', predictions)
-    self.assertEqual(101, predictions['id'])
+    forwarded_keys = ['id', 'sparse_id']
+
+    for key in forwarded_keys:
+      self.assertNotIn(key, next(estimator.predict(input_fn=input_fn)))
+
+    estimator = extenders.forward_features(
+        estimator, forwarded_keys, sparse_default_values={'sparse_id': 1})
+
+    expected_results = [101, 2, 102, 5]
+    predictions = estimator.predict(input_fn=input_fn)
+    for _ in range(2):
+      prediction = next(predictions)
+      for key in forwarded_keys:
+        self.assertIn(key, prediction)
+        self.assertEqual(expected_results.pop(0), sum(prediction[key]))
 
   def test_forward_in_exported(self):
 
@@ -205,11 +241,7 @@ class ForwardFeaturesTest(test.TestCase):
     estimator = extenders.forward_features(estimator, 'id')
 
     # export saved model
-    tmpdir = tempfile.mkdtemp()
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = estimator.export_savedmodel(export_dir_base, serving_input_fn)
-    self.assertTrue(gfile.Exists(export_dir))
+    export_dir, tmpdir = self._export_estimator(estimator, serving_input_fn)
 
     # restore model
     predict_fn = from_saved_model(export_dir, signature_def_key='predict')
@@ -222,6 +254,47 @@ class ForwardFeaturesTest(test.TestCase):
     # Clean up.
     gfile.DeleteRecursively(tmpdir)
 
+  def test_forward_in_exported_sparse(self):
+    features_columns = [fc.indicator_column(
+        fc.categorical_column_with_vocabulary_list('x', range(10)))]
+
+    classifier = linear.LinearClassifier(feature_columns=features_columns)
+
+    def train_input_fn():
+      dataset = dataset_ops.Dataset.from_tensors({
+          'x': sparse_tensor.SparseTensor(
+              values=[1, 2, 3],
+              indices=[[0, 0], [1, 0], [1, 1]],
+              dense_shape=[2, 2]),
+          'labels': [[0], [1]]
+      })
+      def _split(x):
+        labels = x.pop('labels')
+        return x, labels
+      dataset = dataset.map(_split)
+      return dataset
+
+    classifier.train(train_input_fn, max_steps=1)
+
+    classifier = extenders.forward_features(
+        classifier, keys=['x'], sparse_default_values={'x': 0})
+
+    def serving_input_fn():
+      features_ph = array_ops.placeholder(dtype=dtypes.int32, name='x',
+                                          shape=[None])
+      features = {'x': layers.dense_to_sparse(features_ph)}
+      return estimator_lib.export.ServingInputReceiver(features,
+                                                       {'x': features_ph})
+    export_dir, tmpdir = self._export_estimator(classifier, serving_input_fn)
+    prediction_fn = from_saved_model(export_dir, signature_def_key='predict')
+
+    features = (0, 2)
+    prediction = prediction_fn({'x': features})
+
+    self.assertIn('x', prediction)
+    self.assertEqual(features, tuple(prediction['x']))
+    gfile.DeleteRecursively(tmpdir)
+
   def test_forward_list(self):
 
     def input_fn():
@@ -266,7 +339,6 @@ class ForwardFeaturesTest(test.TestCase):
       extenders.forward_features(estimator, ['x', estimator])
 
   def test_key_should_be_in_features(self):
-
     def input_fn():
       return {'x': [[3.], [5.]], 'id': [[101], [102]]}, [[1.], [2.]]
 
@@ -279,27 +351,36 @@ class ForwardFeaturesTest(test.TestCase):
       next(estimator.predict(input_fn=input_fn))
 
   def test_forwarded_feature_should_not_be_a_sparse_tensor(self):
-
     def input_fn():
       return {
           'x': [[3.], [5.]],
-          'id':
-              sparse_tensor.SparseTensor(
-                  values=['1', '2'],
-                  indices=[[0, 0], [1, 0]],
-                  dense_shape=[2, 1])
-      }, [[1.], [2.]]
+          'id': sparse_tensor.SparseTensor(
+              values=['1', '2'],
+              indices=[[0, 0], [1, 0]],
+              dense_shape=[2, 1])
+          }, [[1.], [2.]]
 
     estimator = linear.LinearRegressor([fc.numeric_column('x')])
     estimator.train(input_fn=input_fn, steps=1)
 
     estimator = extenders.forward_features(estimator)
     with self.assertRaisesRegexp(ValueError,
-                                 'Forwarded feature.* should be a Tensor.'):
+                                 'Feature .* should be a Tensor.*'):
       next(estimator.predict(input_fn=input_fn))
 
-  def test_predictions_should_be_dict(self):
+  def test_forwarded_feature_should_be_a_sparse_tensor(self):
+    input_fn = self.make_dummy_input_fn()
+
+    estimator = linear.LinearRegressor([fc.numeric_column('x')])
+    estimator.train(input_fn=input_fn, steps=1)
 
+    estimator = extenders.forward_features(
+        estimator, sparse_default_values={'id': 0, 'sparse_id': 0})
+    with self.assertRaisesRegexp(
+        ValueError, 'Feature .* is expected to be a `SparseTensor`.'):
+      next(estimator.predict(input_fn=input_fn))
+
+  def test_predictions_should_be_dict(self):
     def input_fn():
       return {'x': [[3.], [5.]], 'id': [[101], [102]]}
 
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index 2d367adb47..c6e75f8d46 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -215,7 +215,7 @@ class MultiLabelHead(test.TestCase):
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNone(spec.scaffold.summary_op)
       predictions = sess.run(spec.predictions)
@@ -246,7 +246,7 @@ class MultiLabelHead(test.TestCase):
         mode=model_fn.ModeKeys.PREDICT,
         logits=logits)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertAllEqual(
           expected_export_classes,
@@ -271,7 +271,7 @@ class MultiLabelHead(test.TestCase):
         logits=logits)
 
     # Assert predictions and export_outputs.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNone(spec.scaffold.summary_op)
       predictions = sess.run(spec.predictions)
@@ -297,7 +297,7 @@ class MultiLabelHead(test.TestCase):
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
         labels=labels)[0]
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(expected_training_loss,
                           actual_training_loss.eval())
@@ -321,7 +321,7 @@ class MultiLabelHead(test.TestCase):
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
         labels=labels)[0]
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
           expected_training_loss, actual_training_loss.eval(), atol=1e-4)
@@ -338,7 +338,7 @@ class MultiLabelHead(test.TestCase):
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
         labels=labels_placeholder)[0]
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
@@ -375,7 +375,7 @@ class MultiLabelHead(test.TestCase):
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_input,
         labels=labels_input)[0]
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(np.sum(loss) / 2., actual_training_loss.eval())
 
@@ -394,7 +394,7 @@ class MultiLabelHead(test.TestCase):
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
         labels=labels)[0]
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
@@ -433,7 +433,7 @@ class MultiLabelHead(test.TestCase):
 
     # Assert predictions, loss, and metrics.
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNone(spec.scaffold.summary_op)
       value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
@@ -753,7 +753,7 @@ class MultiLabelHead(test.TestCase):
 
     # Assert predictions, loss, and metrics.
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNone(spec.scaffold.summary_op)
       value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
@@ -791,7 +791,7 @@ class MultiLabelHead(test.TestCase):
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
         labels=labels)
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
           expected_training_loss, training_loss.eval(), atol=1e-4)
@@ -825,7 +825,7 @@ class MultiLabelHead(test.TestCase):
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
         labels=labels)
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
           expected_training_loss, training_loss.eval(), atol=1e-4)
@@ -864,7 +864,7 @@ class MultiLabelHead(test.TestCase):
         logits=logits,
         labels=labels,
         train_op_fn=_train_op_fn)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
@@ -890,7 +890,7 @@ class MultiLabelHead(test.TestCase):
         logits=logits,
         labels=labels,
         train_op_fn=_train_op_fn)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
@@ -919,7 +919,7 @@ class MultiLabelHead(test.TestCase):
 
     # Assert predictions, loss, train_op, and summaries.
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNotNone(spec.scaffold.summary_op)
       loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
@@ -1011,7 +1011,7 @@ class MultiLabelHead(test.TestCase):
         optimizer=_Optimizer())
 
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       loss, train_result = sess.run((spec.loss, spec.train_op))
       self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
@@ -1040,7 +1040,7 @@ class MultiLabelHead(test.TestCase):
           labels=np.array([[1, 0], [1, 1]], dtype=np.int64),
           train_op_fn=_train_op_fn)
 
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         _initialize_variables(self, spec.scaffold)
         sess.run(spec.train_op)
         w_value, t_value = sess.run([w, t])
@@ -1079,7 +1079,7 @@ class MultiLabelHead(test.TestCase):
 
     # Assert predictions, loss, train_op, and summaries.
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNotNone(spec.scaffold.summary_op)
       loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
@@ -1127,7 +1127,7 @@ class MultiLabelHead(test.TestCase):
 
     # Assert predictions, loss, train_op, and summaries.
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNotNone(spec.scaffold.summary_op)
       loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
@@ -1162,7 +1162,7 @@ class MultiLabelHead(test.TestCase):
         logits=logits,
         labels=labels)
     atol = 1.e-3
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
           expected_training_loss, training_loss.eval(), atol=atol)
@@ -1197,7 +1197,7 @@ class MultiLabelHead(test.TestCase):
         train_op_fn=_train_op_fn)
 
     atol = 1.e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, monitored_session.Scaffold())
       loss, train_result = sess.run((spec.loss, spec.train_op))
       self.assertAllClose(expected_loss, loss, atol=atol)
@@ -1224,7 +1224,7 @@ class MultiLabelHead(test.TestCase):
         logits=logits,
         labels=labels,
         train_op_fn=_train_op_fn)
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
@@ -1252,7 +1252,7 @@ class MultiLabelHead(test.TestCase):
         logits=logits,
         labels=labels,
         train_op_fn=_train_op_fn)
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
@@ -1327,7 +1327,7 @@ class PoissonRegressionHead(test.TestCase):
         labels=labels,
         train_op_fn=_train_op_fn)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       loss, train_result = sess.run([spec.loss, spec.train_op])
       self.assertAlmostEqual(expected_loss, loss, delta=atol)
@@ -1352,7 +1352,7 @@ class PoissonRegressionHead(test.TestCase):
     self.assertEqual(dtypes.float32, spec.predictions[keys.LOGITS].dtype)
 
     # Assert predictions.
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, spec.scaffold)
       self.assertAllClose(
           expected_predictions, spec.predictions[keys.PREDICTIONS].eval())
@@ -1395,7 +1395,7 @@ class LogisticRegressionHead(test.TestCase):
         labels=labels,
         train_op_fn=_train_op_fn)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       loss, train_result = sess.run([spec.loss, spec.train_op])
       self.assertAlmostEqual(expected_loss, loss, delta=atol)
@@ -1419,7 +1419,7 @@ class LogisticRegressionHead(test.TestCase):
         labels=labels,
         train_op_fn=_train_op_fn)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
@@ -1444,7 +1444,7 @@ class LogisticRegressionHead(test.TestCase):
         labels=labels,
         train_op_fn=_train_op_fn)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
@@ -1471,7 +1471,7 @@ class LogisticRegressionHead(test.TestCase):
     self.assertEqual(dtypes.float32, spec.predictions[keys.LOGITS].dtype)
 
     # Assert predictions.
-    with self.test_session():
+    with self.cached_session():
       _initialize_variables(self, spec.scaffold)
       self.assertAllClose(
           expected_predictions, spec.predictions[keys.PREDICTIONS].eval())
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks.py b/tensorflow/contrib/estimator/python/estimator/hooks.py
index caadafdfa6..66c46e66b7 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import time
 
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.framework import ops
@@ -26,6 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training
+from tensorflow.python.training import training_util
 
 
 # pylint: disable=protected-access
@@ -72,8 +74,9 @@ class InMemoryEvaluatorHook(training.SessionRunHook):
       estimator: A `tf.estimator.Estimator` instance to call evaluate.
       input_fn:  Equivalent to the `input_fn` arg to `estimator.evaluate`. A
         function that constructs the input data for evaluation.
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
+        See [Createing input functions](
+        https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
         the following:
 
           * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
@@ -210,4 +213,72 @@ class InMemoryEvaluatorHook(training.SessionRunHook):
     self._evaluate(session)
 
 
+class _StopAtCheckpointStepHook(training.SessionRunHook):
+  """Hook that requests stop at a specified step based on checkpoint.
+
+  Note: We recommend using 'make_stop_at_checkpoint_step_hook` to get the proper
+  hook.
+  """
+
+  def __init__(self, model_dir, last_step,
+               wait_after_file_check_secs=30):
+    """Initializes a `StopAtCheckpointStepHook`.
+
+    This hook requests stop after a last step has been reached. It checks latest
+    checkpoint to verify last step is written on disk or not.
+
+    Args:
+      model_dir: Directory to read global step from latest checkpoint.
+      last_step: Step after which to stop.
+      wait_after_file_check_secs: Reading same file by many workers may create
+      I/O issues. To throttle that we will wait given secs after each read of
+      the file.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+    if last_step is None:
+      raise ValueError('last_step must be specified.')
+    if model_dir is None:
+      raise ValueError('model_dir must be specified.')
+
+    self._model_dir = model_dir
+    self._last_step = last_step
+    self._wait_after_file_check_secs = wait_after_file_check_secs
+
+  def begin(self):
+    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          'Global step should be created to use StopAtCheckpointStepHook.')
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return training.SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):
+    global_step = run_values.results + 1
+    if global_step >= self._last_step:
+      # Check latest global step in the checkpoint to ensure that the targeted
+      # last step is written on disk.
+
+      step = estimator_lib._load_global_step_from_checkpoint_dir(
+          self._model_dir)
+      if step >= self._last_step:
+        run_context.request_stop()
+      else:
+        time.sleep(self._wait_after_file_check_secs)
+
+
+def make_stop_at_checkpoint_step_hook(estimator,
+                                      last_step,
+                                      wait_after_file_check_secs=30):
+  """Creates a proper StopAtCheckpointStepHook based on chief status."""
+
+  if estimator.config.is_chief:
+    return training.StopAtStepHook(last_step=last_step)
+  return _StopAtCheckpointStepHook(
+      model_dir=estimator.model_dir,
+      last_step=last_step,
+      wait_after_file_check_secs=wait_after_file_check_secs)
+
 # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks_test.py b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
index ee88d5ecf5..c6c6cad95a 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
@@ -21,8 +21,11 @@ from __future__ import print_function
 import glob
 import json
 import os
+import tempfile
+import time
 
 from tensorflow.contrib.estimator.python.estimator import hooks as hooks_lib
+from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator import run_config as run_config_lib
@@ -316,5 +319,85 @@ class InMemoryEvaluatorHookTest(test.TestCase):
       estimator.train(input_fn, hooks=[evaluator])
 
 
+class StopAtCheckpointStepHookTest(test.TestCase):
+
+  def test_do_not_stop_if_checkpoint_is_not_there(self):
+    with ops.Graph().as_default():
+      step = training.create_global_step()
+      assign_ten = step.assign(10)
+      no_op = control_flow_ops.no_op()
+      hook = hooks_lib._StopAtCheckpointStepHook(
+          model_dir=tempfile.mkdtemp(), last_step=10)
+      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
+        mon_sess.raw_session().run(assign_ten)
+        with test.mock.patch.object(time, 'sleep') as mock_sleep:
+          mon_sess.run(no_op)
+          self.assertTrue(mock_sleep.called)
+        self.assertFalse(mon_sess.should_stop())
+
+  def test_do_not_stop_if_checkpoint_step_is_smaller(self):
+    model_dir = tempfile.mkdtemp()
+    with ops.Graph().as_default():
+      step = training.create_global_step()
+      assign_nine = step.assign(9)
+      assign_ten = step.assign(10)
+      no_op = control_flow_ops.no_op()
+      hook = hooks_lib._StopAtCheckpointStepHook(
+          model_dir=model_dir, last_step=10)
+      with tf_session.Session() as sess:
+        sess.run(assign_nine)
+        training.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
+      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
+        mon_sess.raw_session().run(assign_ten)
+        with test.mock.patch.object(time, 'sleep') as mock_sleep:
+          mon_sess.run(no_op)
+          self.assertTrue(mock_sleep.called)
+        self.assertFalse(mon_sess.should_stop())
+
+  def test_stop_if_checkpoint_step_is_laststep(self):
+    model_dir = tempfile.mkdtemp()
+    with ops.Graph().as_default():
+      step = training.create_global_step()
+      assign_ten = step.assign(10)
+      no_op = control_flow_ops.no_op()
+      hook = hooks_lib._StopAtCheckpointStepHook(
+          model_dir=model_dir, last_step=10)
+      with tf_session.Session() as sess:
+        sess.run(assign_ten)
+        training.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
+      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
+        mon_sess.raw_session().run(assign_ten)
+        with test.mock.patch.object(time, 'sleep') as mock_sleep:
+          mon_sess.run(no_op)
+          self.assertFalse(mock_sleep.called)
+        self.assertTrue(mon_sess.should_stop())
+
+  def test_creates_regular_stop_at_step_hook_for_chief(self):
+    # by default an estimator is in chief mode
+    dnn = estimator_lib.DNNClassifier(
+        feature_columns=[feature_column_lib.numeric_column('x')],
+        hidden_units=[3, 1])
+    hook = hooks_lib.make_stop_at_checkpoint_step_hook(dnn, 300)
+    self.assertIsInstance(hook, training.StopAtStepHook)
+    self.assertEqual(300, hook._last_step)
+
+  def test_creates_checkpoint_hook_for_workers(self):
+
+    class FakeWorkerConfig(estimator_lib.RunConfig):
+
+      @property
+      def is_chief(self):
+        return False
+
+    dnn = estimator_lib.DNNClassifier(
+        feature_columns=[feature_column_lib.numeric_column('x')],
+        hidden_units=[3, 1],
+        config=FakeWorkerConfig())
+    hook = hooks_lib.make_stop_at_checkpoint_step_hook(dnn, 300)
+    self.assertIsInstance(hook, hooks_lib._StopAtCheckpointStepHook)
+    self.assertEqual(300, hook._last_step)
+    self.assertEqual(dnn.model_dir, hook._model_dir)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index 3d6fccb118..2b4d5f5261 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -132,7 +132,7 @@ class MultiHeadTest(test.TestCase):
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNone(spec.scaffold.summary_op)
       predictions = sess.run(spec.predictions)
@@ -202,7 +202,7 @@ class MultiHeadTest(test.TestCase):
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNone(spec.scaffold.summary_op)
       predictions = sess.run(spec.predictions)
@@ -259,7 +259,7 @@ class MultiHeadTest(test.TestCase):
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNone(spec.scaffold.summary_op)
       predictions = sess.run(spec.predictions)
@@ -336,7 +336,7 @@ class MultiHeadTest(test.TestCase):
 
     # Assert predictions, loss, and metrics.
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNone(spec.scaffold.summary_op)
       value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
@@ -362,7 +362,7 @@ class MultiHeadTest(test.TestCase):
         logits=logits,
         labels=labels)[0]
     tol = 1e-3
-    with self.test_session():
+    with self.cached_session():
       # Unreduced loss of the head is [[(10 + 10) / 2], (15 + 0) / 2]
       # (averaged over classes, averaged over examples).
       self.assertAllClose(8.75, loss.eval(), rtol=tol, atol=tol)
@@ -397,7 +397,7 @@ class MultiHeadTest(test.TestCase):
         logits=logits,
         labels=labels)
     tol = 1e-3
-    with self.test_session():
+    with self.cached_session():
       # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
       # = [10, 7.5]
       # training_loss = (1 * 10 + 2 * 7.5) / 2 = 12.5
@@ -445,7 +445,7 @@ class MultiHeadTest(test.TestCase):
         logits=logits,
         labels=labels)
     tol = 1e-3
-    with self.test_session():
+    with self.cached_session():
       # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
       # = [10, 7.5]
       # training_loss = (1 * 10 + 2 * 7.5) / 2 = 12.5
@@ -498,7 +498,7 @@ class MultiHeadTest(test.TestCase):
         logits=logits,
         labels=labels)[0]
     tol = 1e-3
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(
           expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
 
@@ -535,7 +535,7 @@ class MultiHeadTest(test.TestCase):
 
     # Assert predictions, loss, train_op, and summaries.
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNotNone(spec.scaffold.summary_op)
       loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
@@ -579,7 +579,7 @@ class MultiHeadTest(test.TestCase):
         optimizer=_Optimizer())
 
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       loss, train_result = sess.run((spec.loss, spec.train_op))
       self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
@@ -634,7 +634,7 @@ class MultiHeadTest(test.TestCase):
 
     # Assert predictions, loss, train_op, and summaries.
     tol = 1e-3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       _initialize_variables(self, spec.scaffold)
       self.assertIsNotNone(spec.scaffold.summary_op)
       loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index dd8a3a95f1..65229d67bb 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -209,7 +209,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn,
           loss_reduction=losses.Reduction.SUM,
@@ -233,7 +233,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       # Add another trainable variable that doesn't produce a gradient to
       # verify that None gradients are supported.
       _ = variable_scope.get_variable(
@@ -275,7 +275,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
       # for the second.
       expected_c = 10.0 - 3.0, 7.0 - 4.0
 
-      with self.test_session() as session, variable_scope.variable_scope(
+      with self.cached_session() as session, variable_scope.variable_scope(
           '', reuse=variable_scope.AUTO_REUSE):
         replicated_model_fn = replicate_model_fn.replicate_model_fn(
             self.model_fn,
@@ -299,7 +299,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn,
           loss_reduction=losses.Reduction.SUM,
@@ -330,7 +330,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn, losses.Reduction.MEAN, devices=['/gpu:0', '/gpu:1'])
       estimator_spec = replicated_model_fn(
@@ -359,7 +359,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn, devices=['/gpu:0', '/gpu:1'])
       estimator_spec = replicated_model_fn(
@@ -374,7 +374,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn, devices=['/gpu:0'])
       estimator_spec = replicated_model_fn(
@@ -396,7 +396,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn, devices=['/gpu:0'])
       estimator_spec = replicated_model_fn(
@@ -424,7 +424,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn, devices=['/gpu:0'])
       estimator_spec = replicated_model_fn(
@@ -456,7 +456,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
 
-    with self.test_session():
+    with self.cached_session():
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn, devices=['/GPU:0'])
       _ = replicated_model_fn(
@@ -470,7 +470,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
 
-    with self.test_session():
+    with self.cached_session():
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn, devices=['/gpu:0'])
       _ = replicated_model_fn(
@@ -521,7 +521,7 @@ class ReplicateAcrossASingleDeviceWithoutTowerOptimizer(
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn, devices=['/gpu:0'])
       estimator_spec = replicated_model_fn(
@@ -649,7 +649,7 @@ class ReplicateWithTwoOptimizersTest(test_util.TensorFlowTestCase):
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn,
           loss_reduction=losses.Reduction.SUM,
@@ -746,7 +746,7 @@ class ReplicateWithTwoLossesAndOneOptimizer(test_util.TensorFlowTestCase):
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
           self.model_fn,
           loss_reduction=losses.Reduction.SUM,
@@ -777,7 +777,7 @@ class ReplicateWithTwoLossesAndOneOptimizer(test_util.TensorFlowTestCase):
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session(), ops_lib.Graph().as_default():
+    with self.cached_session(), ops_lib.Graph().as_default():
       with self.assertRaisesRegexp(
           ValueError, '.+was.+supposed.+to.+make.+same.+optimizer.+calls.+'):
         replicated_model_fn = replicate_model_fn.replicate_model_fn(
@@ -819,7 +819,7 @@ class FailToWrapOptimizerInTheModelFn(test_util.TensorFlowTestCase):
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesRegexp(ValueError,
                                    'Please.+wrap.+with.+TowerOptimizer'):
         replicated_model_fn = replicate_model_fn.replicate_model_fn(
@@ -845,7 +845,7 @@ class GetLossTowersTest(test_util.TensorFlowTestCase):
     return model_fn_lib.EstimatorSpec(mode=mode, loss=math_ops.reduce_sum(loss))
 
   def test_gradients_are_computed(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       tower_specs = replicate_model_fn._get_loss_towers(
           self.model_fn,
           mode=None,
@@ -879,7 +879,7 @@ class GetLossTowersTest(test_util.TensorFlowTestCase):
         self.assertEqual(0.25, session.run(c))
 
   def test_gradients_are_computed_with_mean_reduction(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       tower_specs = replicate_model_fn._get_loss_towers(
           self.model_fn,
           mode=model_fn_lib.ModeKeys.EVAL,
@@ -932,7 +932,7 @@ class GetLossTowersTest(test_util.TensorFlowTestCase):
       return model_fn_lib.EstimatorSpec(
           mode=mode, loss=math_ops.reduce_sum(loss))
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       tower_specs = replicate_model_fn._get_loss_towers(
           model_fn,
           mode=None,
@@ -975,7 +975,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
   def test_simple_half_split(self):
-    with self.test_session():
+    with self.cached_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -988,7 +988,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards)
 
   def test_to_each_their_own(self):
-    with self.test_session():
+    with self.cached_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -1001,7 +1001,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards)
 
   def test_one_batch(self):
-    with self.test_session():
+    with self.cached_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -1014,7 +1014,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards)
 
   def test_half_split_in_dictionary(self):
-    with self.test_session():
+    with self.cached_session():
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
       labels = [10.0, 11.0, 12.0, 13.0]
 
@@ -1029,7 +1029,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([12.0, 13.0], label_shards[1].eval())
 
   def test_sparse_tensor_can_be_split_unevenly(self):
-    with self.test_session():
+    with self.cached_session():
       features = {
           'x':
               sparse_tensor.SparseTensor(
@@ -1054,7 +1054,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[2.0]], label_shards[1].eval())
 
   def test_sparse_tensor_can_be_split_unevenly_repeated_row(self):
-    with self.test_session():
+    with self.cached_session():
       features = {
           'x':
               sparse_tensor.SparseTensor(
@@ -1081,7 +1081,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[2.0]], label_shards[1].eval())
 
   def test_one_batch_in_dictionary(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.cached_session() as session:  # pylint: disable=unused-variable
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
       labels = [10.0, 11.0, 12.0, 13.0]
 
@@ -1095,7 +1095,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([10.0, 11.0, 12.0, 13.0], label_shards[0].eval())
 
   def test_feature_and_label_dictionaries(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.cached_session() as session:  # pylint: disable=unused-variable
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
       labels = {'first': [10.0, 11.0], 'second': [12.0, 13.0]}
 
@@ -1127,7 +1127,7 @@ class TrainSpecTest(test_util.TensorFlowTestCase):
     return constant_op.constant(loss_value, dtype=dtypes.float64)
 
   def test_example(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       tower_losses = list(map(self.create_constant_loss, [2, 4, 6]))
       tower_specs = list(map(self.create_estimator_spec, tower_losses))
 
@@ -1161,7 +1161,7 @@ class EvalSpecTest(test_util.TensorFlowTestCase):
     return metrics
 
   def test_example(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       tower_losses = map(self.create_constant_loss, [2, 4, 6])
       tower_metrics = map(self.create_eval_metrics, [0, 0.2, 0.3])
       tower_specs = [
@@ -1187,7 +1187,7 @@ class EvalSpecTest(test_util.TensorFlowTestCase):
       self.assertEqual(2 + 4 + 6, session.run(estimator_spec.loss))
 
   def test_handles_single_tower(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       tower_losses = map(self.create_constant_loss, [5])
       tower_metrics = map(self.create_eval_metrics, [0.2])
       tower_specs = [
@@ -1231,7 +1231,7 @@ class PredictSpecTest(test_util.TensorFlowTestCase):
         })
 
   def test_example(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       tower_specs = replicate_model_fn._get_loss_towers(
           self.model_fn,
           mode=None,
@@ -1273,7 +1273,7 @@ class ReduceMetricVariablesTest(test_util.TensorFlowTestCase):
           np.array([3.3, 3.5, 3.7]) * (tower_id + 1), 'total')
 
   def test_example(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       for tower_id in range(3):
         self.create_tower_metrics(tower_id)
 
@@ -1303,7 +1303,7 @@ class ReduceMetricVariablesTest(test_util.TensorFlowTestCase):
       self.assertAllClose([0.0, 0.0, 0.0], local_metrics[8], 0.01)
 
   def test_reduce_is_idempotent(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       for tower_id in range(3):
         self.create_tower_metrics(tower_id)
 
@@ -1329,7 +1329,7 @@ class ReduceMetricVariablesTest(test_util.TensorFlowTestCase):
       self.assertAllClose([0.0, 0.0, 0.0], local_metrics[8], 0.01)
 
   def test_handles_single_tower(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       self.create_tower_metrics(0)
       session.run(
           variables.variables_initializer(
@@ -1346,7 +1346,7 @@ class ReduceMetricVariablesTest(test_util.TensorFlowTestCase):
       self.assertAllClose([3.3, 3.5, 3.7], local_metrics[2], 0.01)
 
   def test_doesnt_accept_uneven_number_of_variables(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       for tower_id in range(3):
         self.create_tower_metrics(tower_id)
       self.create_metric_variable(-1.0, 'oddball')
@@ -1418,7 +1418,7 @@ class MergeExportOutputsTest(test_util.TensorFlowTestCase):
     return estimator_spec
 
   def test_merge_predict_output(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       estimator_spec = self.replicate_estimator_spec(session)
       self.assertAllClose(
           {
@@ -1428,7 +1428,7 @@ class MergeExportOutputsTest(test_util.TensorFlowTestCase):
               signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs))
 
   def test_merge_classification_output_scores_classes(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       estimator_spec = self.replicate_estimator_spec(session)
       self.assertAllClose(
           [0.1, 0.02],
@@ -1440,7 +1440,7 @@ class MergeExportOutputsTest(test_util.TensorFlowTestCase):
               estimator_spec.export_outputs['classification_output'].classes))
 
   def test_merge_classification_output_scores(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       estimator_spec = self.replicate_estimator_spec(session)
       self.assertAllClose(
           [0.1, 0.02],
@@ -1450,7 +1450,7 @@ class MergeExportOutputsTest(test_util.TensorFlowTestCase):
           None, estimator_spec.export_outputs['classification_scores'].classes)
 
   def test_merge_classification_output_classes(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       estimator_spec = self.replicate_estimator_spec(session)
       self.assertAllEqual(
           [b'split_inputs/split:0', b'split_inputs/split:1'],
@@ -1460,7 +1460,7 @@ class MergeExportOutputsTest(test_util.TensorFlowTestCase):
           None, estimator_spec.export_outputs['classification_classes'].scores)
 
   def test_merge_regression_output(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       estimator_spec = self.replicate_estimator_spec(session)
       self.assertAllClose(
           [0.1, 0.02],
@@ -1548,7 +1548,7 @@ class LocalDeviceSetterTest(test_util.TensorFlowTestCase):
 class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
 
   def test_vectors(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       total = replicate_model_fn._compute_sum_on_device(
           [1.0, 2.0, 3.0, 4.0], device='/device:GPU:0', name='test_sum')
 
@@ -1557,7 +1557,7 @@ class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
       self.assertEqual(10.0, session.run(total))
 
   def test_tensors(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       total = replicate_model_fn._compute_sum_on_device(
           [[1.0, 2.0], [3.0, 4.0]], device='/device:GPU:0', name='test_sum')
 
@@ -1566,7 +1566,7 @@ class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([4.0, 6.0], session.run(total))
 
   def test_indexedslices(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       a = ops_lib.IndexedSlices(
           constant_op.constant([1.0, 2.0]), [0, 1],
           dense_shape=constant_op.constant([2]))
@@ -1580,7 +1580,7 @@ class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
                           session.run(ops_lib.convert_to_tensor(total)))
 
   def test_indexedslices_higher_dimensions(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       a = ops_lib.IndexedSlices(
           constant_op.constant([[1.0, 5.0], [2.0, 6.0]]), [0, 1],
           dense_shape=constant_op.constant([2, 4]))
@@ -1595,7 +1595,7 @@ class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
                           session.run(ops_lib.convert_to_tensor(total)))
 
   def test_indexedslices_some_dont_overlap(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       a = ops_lib.IndexedSlices(
           constant_op.constant([1.0, 2.0]), [0, 3],
           dense_shape=constant_op.constant([4]))
@@ -1637,7 +1637,7 @@ class ConcatTensorDictsTest(test_util.TensorFlowTestCase):
         },
     ]
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       self.assertAllClose({
           'a': np.array([1.0, 2.0, 3.0]),
           'b': np.array([11.0, 12.0, 13.0, 14.0]),
diff --git a/tensorflow/contrib/factorization/python/kernel_tests/clustering_ops_test.py b/tensorflow/contrib/factorization/python/kernel_tests/clustering_ops_test.py
index 1322f7ce5f..db47073fcc 100644
--- a/tensorflow/contrib/factorization/python/kernel_tests/clustering_ops_test.py
+++ b/tensorflow/contrib/factorization/python/kernel_tests/clustering_ops_test.py
@@ -41,7 +41,7 @@ class KmeansPlusPlusInitializationTest(test.TestCase):
                              [-1., -1.]]).astype(np.float32)
 
   def runTestWithSeed(self, seed):
-    with self.test_session():
+    with self.cached_session():
       sampled_points = clustering_ops.kmeans_plus_plus_initialization(
           self._points, 3, seed, (seed % 5) - 1)
       self.assertAllClose(
@@ -58,7 +58,7 @@ class KmeansPlusPlusInitializationTest(test.TestCase):
 class KMC2InitializationTest(test.TestCase):
 
   def runTestWithSeed(self, seed):
-    with self.test_session():
+    with self.cached_session():
       distances = np.zeros(1000).astype(np.float32)
       distances[6] = 10e7
       distances[4] = 10e3
@@ -82,7 +82,7 @@ class KMC2InitializationLargeTest(test.TestCase):
     self._distances[1000] = 50.0
 
   def testBasic(self):
-    with self.test_session():
+    with self.cached_session():
       counts = {}
       seed = 0
       for i in range(50):
@@ -102,7 +102,7 @@ class KMC2InitializationCornercaseTest(test.TestCase):
     self._distances = np.zeros(10)
 
   def runTestWithSeed(self, seed):
-    with self.test_session():
+    with self.cached_session():
       sampled_point = clustering_ops.kmc2_chain_initialization(
           self._distances, seed)
       self.assertEquals(sampled_point.eval(), 0)
@@ -128,14 +128,14 @@ class NearestCentersTest(test.TestCase):
                               [1., 1.]]).astype(np.float32)
 
   def testNearest1(self):
-    with self.test_session():
+    with self.cached_session():
       [indices, distances] = clustering_ops.nearest_neighbors(self._points,
                                                               self._centers, 1)
       self.assertAllClose(indices.eval(), [[0], [0], [1], [4]])
       self.assertAllClose(distances.eval(), [[0.], [5.], [1.], [0.]])
 
   def testNearest2(self):
-    with self.test_session():
+    with self.cached_session():
       [indices, distances] = clustering_ops.nearest_neighbors(self._points,
                                                               self._centers, 2)
       self.assertAllClose(indices.eval(), [[0, 1], [0, 1], [1, 0], [4, 3]])
@@ -180,7 +180,7 @@ class NearestCentersLargeTest(test.TestCase):
                    expected_nearest_neighbor_squared_distances))
 
   def testNearest1(self):
-    with self.test_session():
+    with self.cached_session():
       [indices, distances] = clustering_ops.nearest_neighbors(self._points,
                                                               self._centers, 1)
       self.assertAllClose(indices.eval(),
@@ -190,7 +190,7 @@ class NearestCentersLargeTest(test.TestCase):
           self._expected_nearest_neighbor_squared_distances[:, [0]])
 
   def testNearest5(self):
-    with self.test_session():
+    with self.cached_session():
       [indices, distances] = clustering_ops.nearest_neighbors(self._points,
                                                               self._centers, 5)
       self.assertAllClose(indices.eval(),
diff --git a/tensorflow/contrib/factorization/python/kernel_tests/masked_matmul_ops_test.py b/tensorflow/contrib/factorization/python/kernel_tests/masked_matmul_ops_test.py
index 3a909e2373..dd115735d0 100644
--- a/tensorflow/contrib/factorization/python/kernel_tests/masked_matmul_ops_test.py
+++ b/tensorflow/contrib/factorization/python/kernel_tests/masked_matmul_ops_test.py
@@ -58,7 +58,7 @@ class MaskedProductOpsTest(test.TestCase):
     self._mask_ind, self._mask_shape = MakeMask()
 
   def _runTestMaskedProduct(self, transpose_a, transpose_b):
-    with ops.Graph().as_default(), self.test_session() as sess:
+    with ops.Graph().as_default(), self.cached_session() as sess:
       a = self._a if not transpose_a else array_ops.transpose(self._a)
       b = self._b if not transpose_b else array_ops.transpose(self._b)
 
@@ -78,7 +78,7 @@ class MaskedProductOpsTest(test.TestCase):
       AssertClose(result, true_result)
 
   def _runTestEmptyMaskedProduct(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
+    with ops.Graph().as_default(), self.cached_session() as sess:
       empty_mask = constant_op.constant(0, shape=[0, 2], dtype=dtypes.int64)
       values = gen_factorization_ops.masked_matmul(
           self._a, self._b, empty_mask, False, False)
diff --git a/tensorflow/contrib/factorization/python/kernel_tests/wals_solver_ops_test.py b/tensorflow/contrib/factorization/python/kernel_tests/wals_solver_ops_test.py
index 6c2f1d4608..8a16e22663 100644
--- a/tensorflow/contrib/factorization/python/kernel_tests/wals_solver_ops_test.py
+++ b/tensorflow/contrib/factorization/python/kernel_tests/wals_solver_ops_test.py
@@ -50,7 +50,7 @@ class WalsSolverOpsTest(test.TestCase):
 
   def testWalsSolverLhs(self):
     sparse_block = SparseBlock3x3()
-    with self.test_session():
+    with self.cached_session():
       [lhs_tensor,
        rhs_matrix] = gen_factorization_ops.wals_compute_partial_lhs_and_rhs(
            self._column_factors, self._column_weights, self._unobserved_weights,
@@ -82,7 +82,7 @@ class WalsSolverOpsTest(test.TestCase):
 
   def testWalsSolverLhsEntryWeights(self):
     sparse_block = SparseBlock3x3()
-    with self.test_session():
+    with self.cached_session():
       [lhs_tensor,
        rhs_matrix] = gen_factorization_ops.wals_compute_partial_lhs_and_rhs(
            self._column_factors, [], self._unobserved_weights,
diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py
index 484ffee3e7..3a756da932 100644
--- a/tensorflow/contrib/ffmpeg/__init__.py
+++ b/tensorflow/contrib/ffmpeg/__init__.py
@@ -15,7 +15,7 @@
 # pylint: disable=g-short-docstring-punctuation
 """Working with audio using FFmpeg.
 
-See the @{$python/contrib.ffmpeg} guide.
+See the [FFMPEG](https://tensorflow.org/api_guides/python/contrib.ffmpeg) guide.
 
 @@decode_audio
 @@encode_audio
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
index 3dc663bb6f..784da1c432 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
@@ -56,7 +56,7 @@ class DecodeAudioOpTest(test.TestCase):
     """
     if samples_per_second_tensor is None:
       samples_per_second_tensor = samples_per_second
-    with self.test_session():
+    with self.cached_session():
       path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
                           filename)
       with open(path, 'rb') as f:
@@ -123,7 +123,7 @@ class DecodeAudioOpTest(test.TestCase):
     self._loadFileAndTest('mono_10khz.ogg', 'ogg', 0.57, 10000, 1)
 
   def testInvalidFile(self):
-    with self.test_session():
+    with self.cached_session():
       contents = 'invalid file'
       audio_op = ffmpeg.decode_audio(
           contents,
@@ -168,7 +168,7 @@ class DecodeAudioOpTest(test.TestCase):
       self._loadFileAndTest('mono_16khz.mp3', 'docx', 0.57, 20000, 1)
 
   def testStaticShapeInference_ConstantChannelCount(self):
-    with self.test_session():
+    with self.cached_session():
       audio_op = ffmpeg.decode_audio(b'~~~ wave ~~~',
                                      file_format='wav',
                                      samples_per_second=44100,
@@ -176,7 +176,7 @@ class DecodeAudioOpTest(test.TestCase):
       self.assertEqual([None, 2], audio_op.shape.as_list())
 
   def testStaticShapeInference_NonConstantChannelCount(self):
-    with self.test_session():
+    with self.cached_session():
       channel_count = array_ops.placeholder(dtypes.int32)
       audio_op = ffmpeg.decode_audio(b'~~~ wave ~~~',
                                      file_format='wav',
@@ -185,7 +185,7 @@ class DecodeAudioOpTest(test.TestCase):
       self.assertEqual([None, None], audio_op.shape.as_list())
 
   def testStaticShapeInference_ZeroChannelCountInvalid(self):
-    with self.test_session():
+    with self.cached_session():
       with six.assertRaisesRegex(self, Exception,
                                  r'channel_count must be positive'):
         ffmpeg.decode_audio(b'~~~ wave ~~~',
@@ -194,7 +194,7 @@ class DecodeAudioOpTest(test.TestCase):
                             channel_count=0)
 
   def testStaticShapeInference_NegativeChannelCountInvalid(self):
-    with self.test_session():
+    with self.cached_session():
       with six.assertRaisesRegex(self, Exception,
                                  r'channel_count must be positive'):
         ffmpeg.decode_audio(b'~~~ wave ~~~',
diff --git a/tensorflow/contrib/ffmpeg/decode_video_op_test.py b/tensorflow/contrib/ffmpeg/decode_video_op_test.py
index b43b6b8919..b734690756 100644
--- a/tensorflow/contrib/ffmpeg/decode_video_op_test.py
+++ b/tensorflow/contrib/ffmpeg/decode_video_op_test.py
@@ -42,7 +42,7 @@ class DecodeVideoOpTest(test.TestCase):
       bmp_filename: The filename for the bmp file.
       index: Index location inside the video.
     """
-    with self.test_session():
+    with self.cached_session():
       path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
                           filename)
       with open(path, 'rb') as f:
diff --git a/tensorflow/contrib/ffmpeg/encode_audio_op_test.py b/tensorflow/contrib/ffmpeg/encode_audio_op_test.py
index 870290dc10..eb4325da82 100644
--- a/tensorflow/contrib/ffmpeg/encode_audio_op_test.py
+++ b/tensorflow/contrib/ffmpeg/encode_audio_op_test.py
@@ -61,7 +61,7 @@ class EncodeAudioOpTest(test.TestCase):
 
   def testRoundTrip(self):
     """Reads a wav file, writes it, and compares them."""
-    with self.test_session():
+    with self.cached_session():
       audio_op = ffmpeg.decode_audio(
           self._contents,
           file_format='wav',
@@ -73,7 +73,7 @@ class EncodeAudioOpTest(test.TestCase):
       self._compareWavFiles(self._contents, encoded_contents)
 
   def testRoundTripWithPlaceholderSampleRate(self):
-    with self.test_session():
+    with self.cached_session():
       placeholder = array_ops.placeholder(dtypes.int32)
       audio_op = ffmpeg.decode_audio(
           self._contents,
@@ -86,7 +86,7 @@ class EncodeAudioOpTest(test.TestCase):
       self._compareWavFiles(self._contents, encoded_contents)
 
   def testFloatingPointSampleRateInvalid(self):
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaises(TypeError):
         ffmpeg.encode_audio(
             [[0.0], [1.0]],
@@ -94,7 +94,7 @@ class EncodeAudioOpTest(test.TestCase):
             samples_per_second=12345.678)
 
   def testZeroSampleRateInvalid(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       encode_op = ffmpeg.encode_audio(
           [[0.0], [1.0]],
           file_format='wav',
@@ -103,7 +103,7 @@ class EncodeAudioOpTest(test.TestCase):
         sess.run(encode_op)
 
   def testNegativeSampleRateInvalid(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       encode_op = ffmpeg.encode_audio(
           [[0.0], [1.0]],
           file_format='wav',
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 20d099fe5d..95f5ba90ab 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -15,7 +15,9 @@
 
 """Framework utilities.
 
-See the @{$python/contrib.framework} guide.
+See the
+[Contrib Framework](https://tensorflow.org/api_guides/python/contrib.framework)
+guide.
 
 @@assert_same_float_dtype
 @@assert_scalar
diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
index 9396f027d3..4f591367fd 100644
--- a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
+++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
@@ -117,7 +117,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           my1 = variable_scope.get_variable("my1", [1, 10])
           with variable_scope.variable_scope("some_other_scope"):
@@ -158,7 +158,7 @@ class CheckpointsTest(test.TestCase):
 
       checkpoint_utils.init_from_checkpoint(checkpoint_dir,
                                             {"useful_scope/": "useful_scope/"})
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         session.run(variables.global_variables_initializer())
         self.assertAllEqual(my4.eval(session), v4)
         self.assertAllEqual(my5.eval(session), my5_init)
@@ -170,7 +170,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           my1 = variable_scope.get_variable("var1", [1, 10])
           my2 = variable_scope.get_variable("var2", [10, 10])
@@ -194,7 +194,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         my1 = variable_scope.get_variable("var1", [1, 10])
         my2 = variable_scope.get_variable("var2", [10, 10])
         my3 = variable_scope.get_variable("var3", [100, 100])
@@ -217,7 +217,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           my1 = variable_scope.get_variable(
               name="my1",
@@ -247,7 +247,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           my1 = variable_scope.get_variable(
               name="my1",
@@ -271,7 +271,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           _ = variable_scope.get_variable("my1", [10, 10])
           _ = variable_scope.get_variable(
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index af1b404cb5..9db2670304 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -366,7 +366,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
       squeezed_predictions, squeezed_labels = (
           tensor_util.remove_squeezable_dimensions(predictions, labels))
-      with self.test_session(g):
+      with self.session(g):
         variables_lib.local_variables_initializer().run()
         self.assertAllClose(
             predictions_value, squeezed_predictions.eval(feed_dict=feed_dict))
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope_test.py b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
index bcafc1a328..0e6c6f0e2f 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope_test.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
@@ -52,7 +52,7 @@ def _key_op(op):
 class ArgScopeTest(test.TestCase):
 
   def testEmptyArgScope(self):
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([]) as sc:
         self.assertEqual(sc, {})
 
@@ -60,7 +60,7 @@ class ArgScopeTest(test.TestCase):
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
     key_op = _key_op(func1)
     func1_scope = {key_op: func1_kwargs.copy()}
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([func1], a=1, b=None, c=[1]) as sc1:
         self.assertEqual(sc1, func1_scope)
         with arg_scope({}) as sc2:
@@ -86,7 +86,7 @@ class ArgScopeTest(test.TestCase):
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
     key_op = _key_op(func1)
     current_scope = {key_op: func1_kwargs.copy()}
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([func1], a=1, b=None, c=[1]) as scope:
         self.assertDictEqual(scope, current_scope)
 
@@ -102,7 +102,7 @@ class ArgScopeTest(test.TestCase):
         key(func1): func1_kwargs.copy(),
         key(func2): func2_kwargs.copy()
     }
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([func1], a=1, b=None, c=[1]):
         with arg_scope([func2], b=2, d=[2]) as scope:
           self.assertDictEqual(scope, current_scope)
@@ -111,7 +111,7 @@ class ArgScopeTest(test.TestCase):
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
     key_op = _key_op(func1)
     current_scope = {key_op: func1_kwargs.copy()}
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([func1], a=1, b=None, c=[1]) as scope1:
         pass
       with arg_scope(scope1) as scope:
@@ -126,7 +126,7 @@ class ArgScopeTest(test.TestCase):
         key(func1): func1_kwargs.copy(),
         key(func2): func2_kwargs.copy()
     }
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([func1], a=1, b=None, c=[1]) as scope1:
         with arg_scope([func2], b=2, d=[2]) as scope2:
           pass
@@ -140,7 +140,7 @@ class ArgScopeTest(test.TestCase):
   def testSimpleArgScope(self):
     func1_args = (0,)
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([func1], a=1, b=None, c=[1]):
         args, kwargs = func1(0)
         self.assertTupleEqual(args, func1_args)
@@ -149,7 +149,7 @@ class ArgScopeTest(test.TestCase):
   def testSimpleArgScopeWithTuple(self):
     func1_args = (0,)
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
-    with self.test_session():
+    with self.cached_session():
       with arg_scope((func1,), a=1, b=None, c=[1]):
         args, kwargs = func1(0)
         self.assertTupleEqual(args, func1_args)
@@ -240,7 +240,7 @@ class ArgScopeTest(test.TestCase):
   def testAddArgScopeRaceCondition(self):
     func4_kwargs = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h')
     for i in range(4):
-        # redefine the function with different args
+      # redefine the function with different args
       @add_arg_scope
       def func4(a=1, b=2, c=3, d=4, e=5, f=6, g=7, h=8):
         pass
diff --git a/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py b/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py
index b7b9f5c59e..4036c87b6d 100644
--- a/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py
+++ b/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py
@@ -50,7 +50,7 @@ class LoadMulticlassBiasTest(test.TestCase):
       bias = variables.Variable(
           array_ops.reshape(flat_data, (num, dim)), name='bias')
     save = saver.Saver([bias])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       variables.global_variables_initializer().run()
       self.bundle_file = os.path.join(test.get_temp_dir(), 'bias_checkpoint')
       save.save(sess, self.bundle_file)
@@ -90,7 +90,7 @@ class LoadMulticlassBiasTest(test.TestCase):
         initializer=bias_loading_initializer,
         partitioner=partitioned_variables.fixed_size_partitioner(3))
 
-    with self.test_session():
+    with self.cached_session():
       variables.global_variables_initializer().run()
       self.assertAllClose(expected_remapped_bias_vector,
                           remapped_bias_vector.as_tensor().eval())
@@ -109,7 +109,7 @@ class LoadVariableSlotTest(test.TestCase):
       accum = variables.Variable(
           array_ops.reshape(flat_data, (num, dim)), name='accum')
     save = saver.Saver([accum])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       variables.global_variables_initializer().run()
       self.bundle_file = os.path.join(test.get_temp_dir(), 'accum_checkpoint')
       save.save(sess, self.bundle_file)
@@ -179,7 +179,7 @@ class LoadVariableSlotTest(test.TestCase):
         shape=[2, 1],
         initializer=variable_slot_initializer_part_1)
 
-    with self.test_session():
+    with self.cached_session():
       variables.global_variables_initializer().run()
       self.assertAllClose(expected_remapped_accum_vector_part_0,
                           remapped_accum_vector_part_0.eval())
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
index 72835c3ad8..71ab755aa2 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py
+++ b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
@@ -325,6 +325,8 @@ class CriticalSection(object):
 
   def _is_self_handle(self, x):
     """Check if the tensor `x` is the same Mutex as `self._handle`."""
+    if isinstance(x, ops.EagerTensor):
+      return x is self._handle
     return (x.op.type == "MutexV2"
             # blank shared_name means the op will create a unique one.
             and x.op.get_attr("shared_name")
@@ -365,8 +367,7 @@ class CriticalSection(object):
             "(CriticalSection: %s) requested exclusive resource access "
             "of this resource.  Did you mean to call execute with keyword "
             "argument exclusive_resource_access=False?" %
-            (list(resource_intersection), self._handle.name,
-             sg.op.name, sg.handle.name))
+            (list(resource_intersection), self._handle, sg, sg.handle))
 
   # TODO(ebrevdo): Re-enable once CriticalSection is in core.
 
diff --git a/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py b/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py
index 50bcbe625d..c104c51fef 100644
--- a/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py
+++ b/tensorflow/contrib/framework/python/ops/prettyprint_ops_test.py
@@ -34,7 +34,7 @@ class PrettyPrintOpsTest(test.TestCase):
   def testPrintTensorPassthrough(self):
     a = constant_op.constant([1])
     a = prettyprint_ops.print_op(a)
-    with self.test_session():
+    with self.cached_session():
       self.assertEqual(a.eval(), constant_op.constant([1]).eval())
 
   def testPrintSparseTensorPassthrough(self):
@@ -43,7 +43,7 @@ class PrettyPrintOpsTest(test.TestCase):
     b = sparse_tensor.SparseTensor(
         indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
     a = prettyprint_ops.print_op(a)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           sparse_ops.sparse_tensor_to_dense(a).eval(),
           sparse_ops.sparse_tensor_to_dense(b).eval())
@@ -54,13 +54,13 @@ class PrettyPrintOpsTest(test.TestCase):
     a = a.write(1, 1)
     a = a.write(0, 0)
     a = prettyprint_ops.print_op(a)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(a.stack().eval(), constant_op.constant([0, 1]).eval())
 
   def testPrintVariable(self):
     a = variables.Variable(1.0)
     a = prettyprint_ops.print_op(a)
-    with self.test_session():
+    with self.cached_session():
       variables.global_variables_initializer().run()
       a.eval()
 
diff --git a/tensorflow/contrib/framework/python/ops/script_ops.py b/tensorflow/contrib/framework/python/ops/script_ops.py
index 5d269fefdc..d5cb679e2c 100644
--- a/tensorflow/contrib/framework/python/ops/script_ops.py
+++ b/tensorflow/contrib/framework/python/ops/script_ops.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Script Language Operators. See the @{$python/script_ops} guide.
+"""Script Language Operators.
 
 @@py_func
 """
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops_test.py b/tensorflow/contrib/framework/python/ops/sort_ops_test.py
index a8fb94b245..791b32cd1e 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops_test.py
+++ b/tensorflow/contrib/framework/python/ops/sort_ops_test.py
@@ -48,7 +48,7 @@ class SortTest(test.TestCase):
       sort_axis = np.random.choice(rank)
       if negative_axis:
         sort_axis = -1 - sort_axis
-      with self.test_session():
+      with self.cached_session():
         self.assertAllEqual(
             np.sort(arr, axis=sort_axis),
             sort_ops.sort(constant_op.constant(arr), axis=sort_axis).eval())
@@ -60,7 +60,7 @@ class SortTest(test.TestCase):
       shape = [np.random.randint(1, 4) for _ in range(rank)]
       arr = np.random.random(shape)
       sort_axis = np.random.choice(rank)
-      with self.test_session():
+      with self.cached_session():
         self.assertAllEqual(
             np.sort(arr, axis=sort_axis),
             sort_ops.sort(constant_op.constant(arr), axis=sort_axis).eval())
@@ -73,7 +73,7 @@ class SortTest(test.TestCase):
     scalar = array_ops.zeros(zeros_length_1)
 
     sort = sort_ops.sort(scalar)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaises(errors.InvalidArgumentError):
         sort.eval()
 
@@ -84,7 +84,7 @@ class SortTest(test.TestCase):
 
   def testDescending(self):
     arr = np.random.random((10, 5, 5))
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           np.sort(arr, axis=0)[::-1],
           sort_ops.sort(
@@ -111,7 +111,7 @@ class SortTest(test.TestCase):
 
   def testArgsort_1d(self):
     arr = np.random.random(42)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           np.sort(arr),
           array_ops.gather(arr, sort_ops.argsort(arr)).eval())
@@ -119,7 +119,7 @@ class SortTest(test.TestCase):
   def testArgsort(self):
     arr = np.random.random((5, 6, 7, 8))
     for axis in range(4):
-      with self.test_session():
+      with self.cached_session():
         self.assertAllEqual(
             np.argsort(arr, axis=axis),
             sort_ops.argsort(arr, axis=axis).eval())
diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py
index 3c44630a51..f9b0efd1da 100644
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@@ -45,7 +45,7 @@ from tensorflow.python.training import saver as saver_lib
 class LocalVariableTest(test.TestCase):
 
   def test_local_variable(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEquals([], variables_lib.local_variables())
       value0 = 42
       variables_lib2.local_variable(value0)
@@ -58,7 +58,7 @@ class LocalVariableTest(test.TestCase):
       self.assertAllEqual(set([value0, value1]), set(sess.run(variables)))
 
   def testLocalVariableNameAndShape(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.local_variable([1, 1, 1, 1, 1], name='a')
         self.assertEquals(a.op.name, 'A/a')
@@ -66,21 +66,21 @@ class LocalVariableTest(test.TestCase):
         self.assertListEqual([a], variables_lib2.get_local_variables())
 
   def testLocalVariableNotInAllVariables(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.local_variable(0)
         self.assertFalse(a in variables_lib.global_variables())
         self.assertTrue(a in variables_lib.local_variables())
 
   def testLocalVariableNotInVariablesToRestore(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.local_variable(0)
         self.assertFalse(a in variables_lib2.get_variables_to_restore())
         self.assertTrue(a in variables_lib.local_variables())
 
   def testGetVariablesDontReturnsTransients(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         variables_lib2.local_variable(0)
       with variable_scope.variable_scope('B'):
@@ -89,7 +89,7 @@ class LocalVariableTest(test.TestCase):
       self.assertEquals([], variables_lib2.get_variables('B'))
 
   def testGetLocalVariablesReturnsTransients(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.local_variable(0)
       with variable_scope.variable_scope('B'):
@@ -98,7 +98,7 @@ class LocalVariableTest(test.TestCase):
       self.assertEquals([b], variables_lib2.get_local_variables('B'))
 
   def testInitializedVariableValue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       a = variables_lib2.local_variable([0, 0, 0, 0, 0], name='a')
       sess.run(variables_lib.local_variables_initializer())
       self.assertAllEqual(a.eval(), [0] * 5)
@@ -114,7 +114,7 @@ class LocalVariableTest(test.TestCase):
 class GlobalVariableTest(test.TestCase):
 
   def test_global_variable(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEquals([], variables_lib.global_variables())
       value0 = 42
       variables_lib2.global_variable(value0)
@@ -129,7 +129,7 @@ class GlobalVariableTest(test.TestCase):
       self.assertAllEqual(set([value0, value1]), set(sess.run(variables)))
 
   def testVariableNameAndShape(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.global_variable([1, 1, 1, 1, 1], name='a')
         self.assertEquals(a.op.name, 'A/a')
@@ -137,21 +137,21 @@ class GlobalVariableTest(test.TestCase):
         self.assertListEqual([a], variables_lib.global_variables())
 
   def testGlobalVariableNotInLocalVariables(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.global_variable(0)
         self.assertFalse(a in variables_lib.local_variables())
         self.assertTrue(a in variables_lib.global_variables())
 
   def testGlobalVariableInVariablesToRestore(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.global_variable(0)
         self.assertFalse(a in variables_lib.local_variables())
         self.assertTrue(a in variables_lib2.get_variables_to_restore())
 
   def testGetVariablesReturnsThem(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.global_variable(0)
       with variable_scope.variable_scope('B'):
@@ -160,7 +160,7 @@ class GlobalVariableTest(test.TestCase):
       self.assertEquals([b], variables_lib2.get_variables('B'))
 
   def testGetLocalVariablesDontReturnsThem(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         variables_lib2.global_variable(0)
       with variable_scope.variable_scope('B'):
@@ -169,7 +169,7 @@ class GlobalVariableTest(test.TestCase):
       self.assertEquals([], variables_lib2.get_local_variables('B'))
 
   def testInitializedVariableValue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       a = variables_lib2.global_variable([0, 0, 0, 0, 0], name='a')
       sess.run(variables_lib.global_variables_initializer())
       self.assertAllEqual(a.eval(), [0] * 5)
@@ -249,7 +249,7 @@ class GlobalStepTest(test.TestCase):
 class VariablesTest(test.TestCase):
 
   def testCreateVariable(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
         self.assertEquals(a.op.name, 'A/a')
@@ -259,7 +259,7 @@ class VariablesTest(test.TestCase):
         self.assertFalse(a in variables_lib.local_variables())
 
   def testGetVariables(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
       with variable_scope.variable_scope('B'):
@@ -269,7 +269,7 @@ class VariablesTest(test.TestCase):
       self.assertEquals([b], variables_lib2.get_variables('B'))
 
   def testGetVariablesWithScope(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A') as var_scope:
         a = variables_lib2.variable('a', [5])
         b = variables_lib2.variable('b', [5])
@@ -277,7 +277,7 @@ class VariablesTest(test.TestCase):
           set([a, b]), set(variables_lib2.get_variables(var_scope)))
 
   def testGetVariablesSuffix(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
       with variable_scope.variable_scope('A'):
@@ -286,13 +286,13 @@ class VariablesTest(test.TestCase):
       self.assertEquals([b], variables_lib2.get_variables(suffix='b'))
 
   def testGetVariableWithSingleVar(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('parent'):
         a = variables_lib2.variable('child', [5])
       self.assertEquals(a, variables_lib2.get_unique_variable('parent/child'))
 
   def testGetVariableWithDistractors(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('parent'):
         a = variables_lib2.variable('child', [5])
         with variable_scope.variable_scope('child'):
@@ -302,13 +302,13 @@ class VariablesTest(test.TestCase):
 
   def testGetVariableThrowsExceptionWithNoMatch(self):
     var_name = 'cant_find_me'
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaises(ValueError):
         variables_lib2.get_unique_variable(var_name)
 
   def testGetThrowsExceptionWithChildrenButNoMatch(self):
     var_name = 'parent/child'
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope(var_name):
         variables_lib2.variable('grandchild1', [7])
         variables_lib2.variable('grandchild2', [9])
@@ -316,7 +316,7 @@ class VariablesTest(test.TestCase):
         variables_lib2.get_unique_variable(var_name)
 
   def testGetVariablesToRestore(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
       with variable_scope.variable_scope('B'):
@@ -324,7 +324,7 @@ class VariablesTest(test.TestCase):
       self.assertEquals([a, b], variables_lib2.get_variables_to_restore())
 
   def testIncludeGetVariablesToRestore(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
       with variable_scope.variable_scope('B'):
@@ -333,7 +333,7 @@ class VariablesTest(test.TestCase):
       self.assertEquals([a], variables_lib2.get_variables_to_restore(['A']))
 
   def testExcludeGetVariablesToRestore(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
       with variable_scope.variable_scope('B'):
@@ -343,7 +343,7 @@ class VariablesTest(test.TestCase):
           [a], variables_lib2.get_variables_to_restore(exclude=['B']))
 
   def testWrongIncludeGetVariablesToRestore(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
       with variable_scope.variable_scope('B'):
@@ -352,7 +352,7 @@ class VariablesTest(test.TestCase):
       self.assertEquals([], variables_lib2.get_variables_to_restore(['a']))
 
   def testGetMixedVariablesToRestore(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
         b = variables_lib2.variable('b', [5])
@@ -365,7 +365,7 @@ class VariablesTest(test.TestCase):
           variables_lib2.get_variables_to_restore(include=['A/a', 'B/c']))
 
   def testExcludeGetMixedVariablesToRestore(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
         b = variables_lib2.variable('b', [5])
@@ -378,7 +378,7 @@ class VariablesTest(test.TestCase):
           variables_lib2.get_variables_to_restore(exclude=['A/a', 'B/c']))
 
   def testReuseVariable(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [])
       with variable_scope.variable_scope('A', reuse=True):
@@ -387,14 +387,14 @@ class VariablesTest(test.TestCase):
       self.assertListEqual([a], variables_lib2.get_variables())
 
   def testVariableWithRegularizer(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [], regularizer=nn_ops.l2_loss)
       loss = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)[0]
       self.assertDeviceEqual(loss.device, a.device)
 
   def testVariableWithRegularizerColocate(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable(
             'a', [], device='gpu:0', regularizer=nn_ops.l2_loss)
@@ -402,7 +402,7 @@ class VariablesTest(test.TestCase):
       self.assertDeviceEqual(loss.device, a.device)
 
   def testVariableWithDevice(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [], device='cpu:0')
         b = variables_lib2.variable('b', [], device='cpu:1')
@@ -410,7 +410,7 @@ class VariablesTest(test.TestCase):
       self.assertDeviceEqual(b.device, 'cpu:1')
 
   def testVariableWithDeviceFromScope(self):
-    with self.test_session():
+    with self.cached_session():
       with ops.device('/cpu:0'):
         a = variables_lib2.variable('a', [])
         b = variables_lib2.variable('b', [], device='cpu:1')
@@ -428,7 +428,7 @@ class VariablesTest(test.TestCase):
         self.counter += 1
         return 'cpu:%d' % self.counter
 
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([variables_lib2.variable], device=DevFn()):
         a = variables_lib2.variable('a', [])
         b = variables_lib2.variable('b', [])
@@ -453,7 +453,7 @@ class VariablesTest(test.TestCase):
       self.assertDeviceEqual(e.initial_value.device, 'cpu:99')
 
   def testVariableWithReplicaDeviceSetter(self):
-    with self.test_session():
+    with self.cached_session():
       with ops.device(device_setter.replica_device_setter(ps_tasks=2)):
         a = variables_lib2.variable('a', [])
         b = variables_lib2.variable('b', [])
@@ -570,7 +570,7 @@ class VariablesTest(test.TestCase):
 class ModelVariablesTest(test.TestCase):
 
   def testNameAndShape(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.model_variable('a', [5])
         self.assertEquals(a.op.name, 'A/a')
@@ -578,7 +578,7 @@ class ModelVariablesTest(test.TestCase):
         self.assertListEqual([a], variables_lib2.get_model_variables('A'))
 
   def testNotInLocalVariables(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.model_variable('a', [5])
         self.assertTrue(a in variables_lib.global_variables())
@@ -586,7 +586,7 @@ class ModelVariablesTest(test.TestCase):
         self.assertFalse(a in variables_lib.local_variables())
 
   def testGetVariablesReturns(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.model_variable('a', [5])
       with variable_scope.variable_scope('B'):
@@ -595,7 +595,7 @@ class ModelVariablesTest(test.TestCase):
       self.assertEquals([b], variables_lib2.get_variables('B'))
 
   def testGetModelVariables(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.model_variable('a', [5])
       with variable_scope.variable_scope('B'):
@@ -604,7 +604,7 @@ class ModelVariablesTest(test.TestCase):
       self.assertEquals([b], variables_lib2.get_model_variables('B'))
 
   def testGetTrainableVariables(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         variables_lib2.local_variable([5])
         a = variables_lib.Variable([5])
@@ -615,7 +615,7 @@ class ModelVariablesTest(test.TestCase):
       self.assertEquals([b], variables_lib2.get_trainable_variables('B'))
 
   def testGetLocalVariables(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         _ = variables_lib2.model_variable('a', [5])
       with variable_scope.variable_scope('B'):
@@ -624,7 +624,7 @@ class ModelVariablesTest(test.TestCase):
       self.assertEquals([], variables_lib2.get_local_variables('B'))
 
   def testInitializedVariableValue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       a = variables_lib2.model_variable(
           'a', [5], initializer=init_ops.ones_initializer())
       sess.run(variables_lib.global_variables_initializer())
@@ -670,14 +670,14 @@ class ModelVariablesTest(test.TestCase):
 class GetVariablesCollections(test.TestCase):
 
   def testVariableCollection(self):
-    with self.test_session():
+    with self.cached_session():
       a = variables_lib2.variable('a', [], collections='A')
       b = variables_lib2.variable('b', [], collections='B')
       self.assertEquals(a, ops.get_collection('A')[0])
       self.assertEquals(b, ops.get_collection('B')[0])
 
   def testVariableCollections(self):
-    with self.test_session():
+    with self.cached_session():
       a = variables_lib2.variable('a', [], collections=['A', 'C'])
       b = variables_lib2.variable('b', [], collections=['B', 'C'])
       self.assertEquals(a, ops.get_collection('A')[0])
@@ -685,14 +685,14 @@ class GetVariablesCollections(test.TestCase):
       self.assertListEqual([a, b], ops.get_collection('C'))
 
   def testVariableCollectionsWithArgScope(self):
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([variables_lib2.variable], collections='A'):
         a = variables_lib2.variable('a', [])
         b = variables_lib2.variable('b', [])
       self.assertListEqual([a, b], ops.get_collection('A'))
 
   def testVariableCollectionsWithArgScopeNested(self):
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([variables_lib2.variable], collections='A'):
         a = variables_lib2.variable('a', [])
         with arg_scope([variables_lib2.variable], collections='B'):
@@ -701,7 +701,7 @@ class GetVariablesCollections(test.TestCase):
       self.assertEquals(b, ops.get_collection('B')[0])
 
   def testVariableCollectionsWithArgScopeNonNested(self):
-    with self.test_session():
+    with self.cached_session():
       with arg_scope([variables_lib2.variable], collections='A'):
         a = variables_lib2.variable('a', [])
       with arg_scope([variables_lib2.variable], collections='B'):
@@ -711,7 +711,7 @@ class GetVariablesCollections(test.TestCase):
       self.assertListEqual([b], ops.get_collection('B'))
 
   def testVariableRestoreWithArgScopeNested(self):
-    with self.test_session():
+    with self.cached_session():
       a = variables_lib2.variable('a', [])
       with arg_scope(
           [variables_lib2.variable], trainable=False, collections=['A', 'B']):
@@ -726,7 +726,7 @@ class GetVariablesCollections(test.TestCase):
 class GetVariablesBySuffixTest(test.TestCase):
 
   def testGetVariableGivenNameScoped(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
         b = variables_lib2.variable('b', [5])
@@ -734,7 +734,7 @@ class GetVariablesBySuffixTest(test.TestCase):
         self.assertEquals([b], variables_lib2.get_variables_by_suffix('b'))
 
   def testGetVariableWithScope(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
         fooa = variables_lib2.variable('fooa', [5])
@@ -748,7 +748,7 @@ class GetVariablesBySuffixTest(test.TestCase):
       self.assertEquals([a, fooa], matched_variables)
 
   def testGetVariableWithoutScope(self):
-    with self.test_session():
+    with self.cached_session():
       a = variables_lib2.variable('a', [5])
       fooa = variables_lib2.variable('fooa', [5])
       b_a = variables_lib2.variable('B/a', [5])
@@ -761,7 +761,7 @@ class GetVariablesBySuffixTest(test.TestCase):
 class GetVariablesByNameTest(test.TestCase):
 
   def testGetVariableGivenNameScoped(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
         b = variables_lib2.variable('b', [5])
@@ -769,7 +769,7 @@ class GetVariablesByNameTest(test.TestCase):
         self.assertEquals([b], variables_lib2.get_variables_by_name('b'))
 
   def testGetVariableWithScope(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('A'):
         a = variables_lib2.variable('a', [5])
         fooa = variables_lib2.variable('fooa', [5])
@@ -785,7 +785,7 @@ class GetVariablesByNameTest(test.TestCase):
       self.assertEquals([a], matched_variables)
 
   def testGetVariableWithoutScope(self):
-    with self.test_session():
+    with self.cached_session():
       a = variables_lib2.variable('a', [5])
       fooa = variables_lib2.variable('fooa', [5])
       b_a = variables_lib2.variable('B/a', [5])
@@ -818,7 +818,7 @@ class AssignFromValuesTest(test.TestCase):
     init_value0 = np.asarray([1.0, 3.0, 9.0]).reshape((1, 3, 1))
     init_value1 = np.asarray([2.0, 4.0, 6.0, 8.0]).reshape((2, 1, 2))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       initializer = init_ops.truncated_normal_initializer(stddev=.1)
       var0 = variables_lib2.variable(
           'my_var0', shape=[1, 3, 1], initializer=initializer)
@@ -844,7 +844,7 @@ class AssignFromValuesTest(test.TestCase):
     init_value0 = np.asarray([1.0, 3.0, 9.0]).reshape((1, 3, 1))
     init_value1 = np.asarray([2.0, 4.0, 6.0, 8.0]).reshape((2, 1, 2))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       initializer = init_ops.truncated_normal_initializer(stddev=.1)
 
       with variable_scope.variable_scope('my_model/my_layer0'):
@@ -879,7 +879,7 @@ class AssignFromValuesFnTest(test.TestCase):
     init_value0 = np.asarray([1.0, 3.0, 9.0]).reshape((1, 3, 1))
     init_value1 = np.asarray([2.0, 4.0, 6.0, 8.0]).reshape((2, 1, 2))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       initializer = init_ops.truncated_normal_initializer(stddev=.1)
       var0 = variables_lib2.variable(
           'my_var0', shape=[1, 3, 1], initializer=initializer)
@@ -904,7 +904,7 @@ class AssignFromValuesFnTest(test.TestCase):
     init_value0 = np.asarray([1.0, 3.0, 9.0]).reshape((1, 3, 1))
     init_value1 = np.asarray([2.0, 4.0, 6.0, 8.0]).reshape((2, 1, 2))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       initializer = init_ops.truncated_normal_initializer(stddev=.1)
 
       with variable_scope.variable_scope('my_model/my_layer0'):
@@ -968,7 +968,7 @@ class AssignFromCheckpointTest(test.TestCase):
     init_value1 = 20.0
     var_names_to_values = {'v0': init_value0, 'v1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       var0 = variables_lib2.variable('my_var0', shape=[])
@@ -998,7 +998,7 @@ class AssignFromCheckpointTest(test.TestCase):
     init_value1 = np.array([20.0])  # Partitioned into 1 part, edge case.
     var_names_to_values = {'var0': init_value0, 'var1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       # var0 and var1 are PartitionedVariables.
@@ -1039,7 +1039,7 @@ class AssignFromCheckpointTest(test.TestCase):
     init_value1 = 20.0
     var_names_to_values = {'v0': init_value0, 'v1': init_value1}
 
-    with self.test_session():
+    with self.cached_session():
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       var0 = variables_lib2.variable('my_var0', shape=[])
@@ -1062,7 +1062,7 @@ class AssignFromCheckpointTest(test.TestCase):
 
     var_names_to_values = {'layer0/v0': init_value0, 'layer1/v1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       with variable_scope.variable_scope('my_model/my_layer0'):
@@ -1123,7 +1123,7 @@ class AssignFromCheckpointFnTest(test.TestCase):
     init_value1 = 20.0
     var_names_to_values = {'v0': init_value0, 'v1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       var0 = variables_lib2.variable('my_var0', shape=[])
@@ -1154,7 +1154,7 @@ class AssignFromCheckpointFnTest(test.TestCase):
     init_value1 = 20.0
     var_names_to_values = {'v0': init_value0, 'v1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       var0 = variables_lib2.variable('my_var0', shape=[2, 1])
@@ -1183,7 +1183,7 @@ class AssignFromCheckpointFnTest(test.TestCase):
     init_value1 = 20.0
     var_names_to_values = {'v0': init_value0, 'v1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       var0 = variables_lib2.variable('my_var0', shape=[2, 1])
@@ -1213,7 +1213,7 @@ class AssignFromCheckpointFnTest(test.TestCase):
     init_value1 = 20.0
     var_names_to_values = {'v0': init_value0, 'v1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       var0 = variables_lib2.variable('my_var0', shape=[])
@@ -1241,7 +1241,7 @@ class AssignFromCheckpointFnTest(test.TestCase):
     init_value1 = 20.0
     var_names_to_values = {'v0': init_value0, 'v1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       var0 = variables_lib2.variable('v0', shape=[])
@@ -1272,7 +1272,7 @@ class AssignFromCheckpointFnTest(test.TestCase):
     init_value1 = 20.0
     var_names_to_values = {'v0': init_value0, 'v1': init_value1}
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       model_path = self.create_checkpoint_from_values(var_names_to_values,
                                                       model_dir)
       var0 = variables_lib2.variable('my_var0', shape=[])
@@ -1299,7 +1299,7 @@ class ZeroInitializerOpTest(test.TestCase):
   def _testZeroInitializer(self, shape, initializer, use_init):
     var = variables_lib.Variable(initializer)
     var_zero = variables_lib2.zero_initializer(var)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.assertRaisesOpError('Attempting to use uninitialized value'):
         var.eval()
       if use_init:
@@ -1324,7 +1324,7 @@ class ZeroVarInitializerOpTest(test.TestCase):
     var = resource_variable_ops.ResourceVariable(initializer)
     var_zero = variables_lib2.zero_initializer(var)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.assertRaisesOpError('Error while reading resource variable'):
         var.eval()
       if use_init:
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
index 7534f5797c..869e899ac8 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
-#define THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+#ifndef TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+#define TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
 
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -62,4 +62,4 @@ class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T, BiasType,
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 4fb8d58bc9..d64dfd1576 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -335,7 +335,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     mofid_op = classifier_metrics.mean_only_frechet_classifier_distance_from_activations(  # pylint: disable=line-too-long
         tf_pool_real_a, tf_pool_gen_a)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_mofid = sess.run(mofid_op)
 
     expected_mofid = _expected_mean_only_fid(pool_real_a, pool_gen_a)
@@ -355,7 +355,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     dofid_op = classifier_metrics.diagonal_only_frechet_classifier_distance_from_activations(  # pylint: disable=line-too-long
         tf_pool_real_a, tf_pool_gen_a)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_dofid = sess.run(dofid_op)
 
     expected_dofid = _expected_diagonal_only_fid(pool_real_a, pool_gen_a)
@@ -377,7 +377,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         test_pool_gen_a,
         classifier_fn=lambda x: x)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_fid = sess.run(fid_op)
 
     expected_fid = _expected_fid(test_pool_real_a, test_pool_gen_a)
@@ -404,7 +404,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
           classifier_fn=lambda x: x))
 
     fids = []
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for fid_op in fid_ops:
         fids.append(sess.run(fid_op))
 
@@ -426,7 +426,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     trace_sqrt_prod_op = _run_with_mock(classifier_metrics.trace_sqrt_product,
                                         cov_real, cov_gen)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # trace_sqrt_product: tsp
       actual_tsp = sess.run(trace_sqrt_prod_op)
 
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
index 871f1ad54e..ab909feae3 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
@@ -65,7 +65,7 @@ class ClassifierMetricsTest(test.TestCase):
     pyramid = np_laplacian_pyramid(data, 3)
     data_tf = array_ops.placeholder(dtypes.float32, [256, 32, 32, 3])
     pyramid_tf = swd._laplacian_pyramid(data_tf, 3)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       pyramid_tf = sess.run(
           pyramid_tf, feed_dict={
               data_tf: data.transpose(0, 2, 3, 1)
@@ -79,7 +79,7 @@ class ClassifierMetricsTest(test.TestCase):
     d1 = random_ops.random_uniform([256, 32, 32, 3])
     d2 = random_ops.random_normal([256, 32, 32, 3])
     wfunc = swd.sliced_wasserstein_distance(d1, d2)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       wscores = [sess.run(x) for x in wfunc]
     self.assertAllClose(
         np.array([0.014, 0.014], 'f'),
@@ -95,7 +95,7 @@ class ClassifierMetricsTest(test.TestCase):
     d1 = random_ops.random_uniform([256, 32, 32, 3])
     d2 = random_ops.random_normal([256, 32, 32, 3])
     wfunc = swd.sliced_wasserstein_distance(d1, d2, use_svd=True)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       wscores = [sess.run(x) for x in wfunc]
     self.assertAllClose(
         np.array([0.013, 0.013], 'f'),
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.h b/tensorflow/contrib/gdr/gdr_memory_manager.h
index 9ac1aa96c4..c85886863e 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.h
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef GDR_MEMORY_MANAGER_H_
-#define GDR_MEMORY_MANAGER_H_
+#ifndef TENSORFLOW_CONTRIB_GDR_GDR_MEMORY_MANAGER_H_
+#define TENSORFLOW_CONTRIB_GDR_GDR_MEMORY_MANAGER_H_
 
 #include "google/protobuf/any.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -57,4 +57,4 @@ RemoteMemoryManager* CreateRemoteMemoryManager(const string& host,
 
 }  // namespace tensorflow
 
-#endif  // GDR_MEMORY_MANAGER_H_
+#endif  // TENSORFLOW_CONTRIB_GDR_GDR_MEMORY_MANAGER_H_
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.h b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.h
index 7fedd04f54..47a36efdb7 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.h
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef GDR_RENDEZVOUS_MGR_H_
-#define GDR_RENDEZVOUS_MGR_H_
+#ifndef TENSORFLOW_CONTRIB_GDR_GDR_RENDEZVOUS_MGR_H_
+#define TENSORFLOW_CONTRIB_GDR_GDR_RENDEZVOUS_MGR_H_
 
 #include "tensorflow/contrib/gdr/gdr_memory_manager.h"
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
@@ -39,4 +39,4 @@ class GdrRendezvousMgr : public BaseRendezvousMgr {
 
 }  // end namespace tensorflow
 
-#endif  // GDR_RENDEZVOUS_MGR_H_
+#endif  // TENSORFLOW_CONTRIB_GDR_GDR_RENDEZVOUS_MGR_H_
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.h b/tensorflow/contrib/gdr/gdr_server_lib.h
index d6c40d429e..efa2390d33 100644
--- a/tensorflow/contrib/gdr/gdr_server_lib.h
+++ b/tensorflow/contrib/gdr/gdr_server_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef GDR_SERVER_LIB_H_
-#define GDR_SERVER_LIB_H_
+#ifndef TENSORFLOW_CONTRIB_GDR_GDR_SERVER_LIB_H_
+#define TENSORFLOW_CONTRIB_GDR_GDR_SERVER_LIB_H_
 
 #include "tensorflow/contrib/gdr/gdr_memory_manager.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
@@ -49,4 +49,4 @@ class GdrServer : public GrpcServer {
 
 }  // namespace tensorflow
 
-#endif  // GDR_SERVER_LIB_H_
+#endif  // TENSORFLOW_CONTRIB_GDR_GDR_SERVER_LIB_H_
diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h
index 54081f655e..65105ed997 100644
--- a/tensorflow/contrib/gdr/gdr_worker.h
+++ b/tensorflow/contrib/gdr/gdr_worker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef GDR_WORKER_H_
-#define GDR_WORKER_H_
+#ifndef TENSORFLOW_CONTRIB_GDR_GDR_WORKER_H_
+#define TENSORFLOW_CONTRIB_GDR_GDR_WORKER_H_
 
 #include "tensorflow/contrib/gdr/gdr_memory_manager.h"
 
@@ -44,4 +44,4 @@ class GdrWorker : public GrpcWorker {
 
 }  // namespace tensorflow
 
-#endif  // GDR_WORKER_H_
+#endif  // TENSORFLOW_CONTRIB_GDR_GDR_WORKER_H_
diff --git a/tensorflow/contrib/graph_editor/__init__.py b/tensorflow/contrib/graph_editor/__init__.py
index 51b7f45274..b2de2b9a69 100644
--- a/tensorflow/contrib/graph_editor/__init__.py
+++ b/tensorflow/contrib/graph_editor/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """TensorFlow Graph Editor.
 
-See the @{$python/contrib.graph_editor} guide.
+See the
+[Graph Editor](https://tensorflow.org/api_guides/python/contrib.graph_editor)
+guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
index a58b6a247e..24b790977d 100644
--- a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
@@ -50,7 +50,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
 
     interp = dense_image_warp._interpolate_bilinear(grid, query_points)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       predicted = sess.run(interp)
       self.assertAllClose(expected_results, predicted)
 
@@ -64,7 +64,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
     interp = dense_image_warp._interpolate_bilinear(
         grid, query_points, indexing='xy')
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       predicted = sess.run(interp)
       self.assertAllClose(expected_results, predicted)
 
@@ -78,7 +78,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
 
     interp = dense_image_warp._interpolate_bilinear(grid, query_points)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       predicted = sess.run(interp)
       self.assertAllClose(expected_results, predicted)
 
@@ -160,7 +160,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
                                                         flow_type)
     interp = dense_image_warp.dense_image_warp(image, flows)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       rand_image, rand_flows = self.get_random_image_and_flows(
           shape, image_type, flow_type)
       rand_flows *= 0
@@ -191,7 +191,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
                                                         flow_type)
     interp = dense_image_warp.dense_image_warp(image, flows)
     low_precision = image_type == 'float16' or flow_type == 'float16'
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       rand_image, rand_flows = self.get_random_image_and_flows(
           shape, image_type, flow_type)
 
@@ -249,7 +249,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
     opt_func = optimizer.apply_gradients(zip(grad, [flows]))
     init_op = variables.global_variables_initializer()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(init_op)
       for _ in range(10):
         sess.run(opt_func)
diff --git a/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
index a495b58b7f..ac8573445c 100644
--- a/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
@@ -217,7 +217,7 @@ class AdjustSaturationInYiqTest(test_util.TensorFlowTestCase):
         'gb_same',
         'rgb_same',
     ]
-    with self.test_session():
+    with self.cached_session():
       for x_shape in x_shapes:
         for test_style in test_styles:
           x_np = np.random.rand(*x_shape) * 255.
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index f588eae923..70339d7612 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -39,7 +39,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
 
   def test_zeros(self):
     for dtype in _DTYPES:
-      with self.test_session():
+      with self.cached_session():
         for shape in [(5, 5), (24, 24), (2, 24, 24, 3)]:
           for angle in [0, 1, np.pi / 2.0]:
             image = array_ops.zeros(shape, dtype)
@@ -49,7 +49,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
 
   def test_rotate_even(self):
     for dtype in _DTYPES:
-      with self.test_session():
+      with self.cached_session():
         image = array_ops.reshape(
             math_ops.cast(math_ops.range(36), dtype), (6, 6))
         image_rep = array_ops.tile(image[None, :, :, None], [3, 1, 1, 1])
@@ -71,7 +71,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
 
   def test_rotate_odd(self):
     for dtype in _DTYPES:
-      with self.test_session():
+      with self.cached_session():
         image = array_ops.reshape(
             math_ops.cast(math_ops.range(25), dtype), (5, 5))
         image_rep = array_ops.tile(image[None, :, :, None], [3, 1, 1, 1])
@@ -91,7 +91,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
 
   def test_translate(self):
     for dtype in _DTYPES:
-      with self.test_session():
+      with self.cached_session():
         image = constant_op.constant(
             [[1, 0, 1, 0],
              [0, 1, 0, 1],
@@ -107,7 +107,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
 
   def test_compose(self):
     for dtype in _DTYPES:
-      with self.test_session():
+      with self.cached_session():
         image = constant_op.constant(
             [[1, 1, 1, 0],
              [1, 0, 0, 0],
@@ -131,7 +131,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
 
   def test_extreme_projective_transform(self):
     for dtype in _DTYPES:
-      with self.test_session():
+      with self.cached_session():
         image = constant_op.constant(
             [[1, 0, 1, 0],
              [0, 1, 0, 1],
@@ -147,7 +147,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
                              [0, 0, 0, 0]])
 
   def test_bilinear(self):
-    with self.test_session():
+    with self.cached_session():
       image = constant_op.constant(
           [[0, 0, 0, 0, 0],
            [0, 1, 1, 1, 0],
@@ -176,7 +176,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
            [0, 0, 1, 0, 0]])
 
   def test_bilinear_uint8(self):
-    with self.test_session():
+    with self.cached_session():
       image = constant_op.constant(
           np.asarray(
               [[0.0, 0.0, 0.0, 0.0, 0.0],
@@ -209,7 +209,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual([3, 5], result.get_shape())
 
   def _test_grad(self, shape_to_test):
-    with self.test_session():
+    with self.cached_session():
       test_image_shape = shape_to_test
       test_image = np.random.randn(*test_image_shape)
       test_image_tensor = constant_op.constant(
@@ -228,7 +228,7 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
       self.assertLess(left_err, 1e-10)
 
   def _test_grad_different_shape(self, input_shape, output_shape):
-    with self.test_session():
+    with self.cached_session():
       test_image_shape = input_shape
       test_image = np.random.randn(*test_image_shape)
       test_image_tensor = constant_op.constant(
@@ -276,7 +276,7 @@ class BipartiteMatchTest(test_util.TensorFlowTestCase):
     expected_col_to_row_match_np = np.array(expected_col_to_row_match,
                                             dtype=np.int32)
 
-    with self.test_session():
+    with self.cached_session():
       distance_mat_tf = constant_op.constant(distance_mat_np,
                                              shape=distance_mat_shape)
       location_to_prior, prior_to_location = image_ops.bipartite_match(
diff --git a/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py b/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
index 1939caaa2d..d58a654292 100644
--- a/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
@@ -164,7 +165,7 @@ class InterpolateSplineTest(test_util.TensorFlowTestCase):
     with ops.name_scope('interpolator'):
       interpolator = interpolate_spline.interpolate_spline(
           train_points, train_values, query_points, interpolation_order)
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         fetches = [query_points, train_points, train_values, interpolator]
         query_points_, train_points_, train_values_, interp_ = sess.run(fetches)
 
@@ -204,7 +205,7 @@ class InterpolateSplineTest(test_util.TensorFlowTestCase):
 
         target_interpolation = tp.HARDCODED_QUERY_VALUES[(order, reg_weight)]
         target_interpolation = np.array(target_interpolation)
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           interp_val = sess.run(interpolator)
           self.assertAllClose(interp_val[0, :, 0], target_interpolation)
 
@@ -222,10 +223,85 @@ class InterpolateSplineTest(test_util.TensorFlowTestCase):
 
         target_interpolation = tp.HARDCODED_QUERY_VALUES[(order, reg_weight)]
         target_interpolation = np.array(target_interpolation)
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           interp_val = sess.run(interpolator)
           self.assertAllClose(interp_val[0, :, 0], target_interpolation)
 
+  def test_nd_linear_interpolation_unspecified_shape(self):
+    """Ensure that interpolation supports dynamic batch_size and num_points."""
+
+    tp = _QuadraticPlusSinProblemND()
+    (query_points, _, train_points,
+     train_values) = tp.get_problem(dtype='float64')
+
+    # Construct placeholders such that the batch size, number of train points,
+    # and number of query points are not known at graph construction time.
+    feature_dim = query_points.shape[-1]
+    value_dim = train_values.shape[-1]
+    train_points_ph = array_ops.placeholder(
+        dtype=train_points.dtype, shape=[None, None, feature_dim])
+    train_values_ph = array_ops.placeholder(
+        dtype=train_values.dtype, shape=[None, None, value_dim])
+    query_points_ph = array_ops.placeholder(
+        dtype=query_points.dtype, shape=[None, None, feature_dim])
+
+    order = 1
+    reg_weight = 0.01
+
+    interpolator = interpolate_spline.interpolate_spline(
+        train_points_ph, train_values_ph, query_points_ph, order, reg_weight)
+
+    target_interpolation = tp.HARDCODED_QUERY_VALUES[(order, reg_weight)]
+    target_interpolation = np.array(target_interpolation)
+    with self.cached_session() as sess:
+
+      (train_points_value, train_values_value, query_points_value) = sess.run(
+          [train_points, train_values, query_points])
+
+      interp_val = sess.run(
+          interpolator,
+          feed_dict={
+              train_points_ph: train_points_value,
+              train_values_ph: train_values_value,
+              query_points_ph: query_points_value
+          })
+      self.assertAllClose(interp_val[0, :, 0], target_interpolation)
+
+  def test_fully_unspecified_shape(self):
+    """Ensure that erreor is thrown when input/output dim unspecified."""
+
+    tp = _QuadraticPlusSinProblemND()
+    (query_points, _, train_points,
+     train_values) = tp.get_problem(dtype='float64')
+
+    # Construct placeholders such that the batch size, number of train points,
+    # and number of query points are not known at graph construction time.
+    feature_dim = query_points.shape[-1]
+    value_dim = train_values.shape[-1]
+    train_points_ph = array_ops.placeholder(
+        dtype=train_points.dtype, shape=[None, None, feature_dim])
+    train_points_ph_invalid = array_ops.placeholder(
+        dtype=train_points.dtype, shape=[None, None, None])
+    train_values_ph = array_ops.placeholder(
+        dtype=train_values.dtype, shape=[None, None, value_dim])
+    train_values_ph_invalid = array_ops.placeholder(
+        dtype=train_values.dtype, shape=[None, None, None])
+    query_points_ph = array_ops.placeholder(
+        dtype=query_points.dtype, shape=[None, None, feature_dim])
+
+    order = 1
+    reg_weight = 0.01
+
+    with self.assertRaises(ValueError):
+      _ = interpolate_spline.interpolate_spline(
+          train_points_ph_invalid, train_values_ph, query_points_ph, order,
+          reg_weight)
+
+    with self.assertRaises(ValueError):
+      _ = interpolate_spline.interpolate_spline(
+          train_points_ph, train_values_ph_invalid, query_points_ph, order,
+          reg_weight)
+
   def test_interpolation_gradient(self):
     """Make sure that backprop can run. Correctness of gradients is assumed.
 
@@ -254,7 +330,7 @@ class InterpolateSplineTest(test_util.TensorFlowTestCase):
       opt_func = optimizer.apply_gradients(zip(grad, [train_points]))
       init_op = variables.global_variables_initializer()
 
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         sess.run(init_op)
         for _ in range(100):
           sess.run([loss, opt_func])
diff --git a/tensorflow/contrib/image/python/kernel_tests/segmentation_test.py b/tensorflow/contrib/image/python/kernel_tests/segmentation_test.py
index 48066cbace..3d39165ede 100644
--- a/tensorflow/contrib/image/python/kernel_tests/segmentation_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/segmentation_test.py
@@ -59,19 +59,19 @@ class SegmentationTest(test_util.TensorFlowTestCase):
          [7, 0, 8, 0, 0, 0, 9, 0, 0],
          [0, 0, 0, 0, 10, 0, 0, 0, 0],
          [0, 0, 11, 0, 0, 0, 0, 0, 0]])  # pyformat: disable
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(image_ops.connected_components(arr).eval(), expected)
 
   def testSimple(self):
     arr = [[0, 1, 0], [1, 1, 1], [0, 1, 0]]
-    with self.test_session():
+    with self.cached_session():
       # Single component with id 1.
       self.assertAllEqual(
           image_ops.connected_components(math_ops.cast(
               arr, dtypes.bool)).eval(), arr)
 
   def testSnake(self):
-    with self.test_session():
+    with self.cached_session():
       # Single component with id 1.
       self.assertAllEqual(
           image_ops.connected_components(math_ops.cast(
@@ -80,7 +80,7 @@ class SegmentationTest(test_util.TensorFlowTestCase):
   def testSnake_disconnected(self):
     for i in range(SNAKE.shape[0]):
       for j in range(SNAKE.shape[1]):
-        with self.test_session():
+        with self.cached_session():
           # If we disconnect any part of the snake except for the endpoints,
           # there will be 2 components.
           if SNAKE[i, j] and (i, j) not in [(1, 1), (6, 3)]:
@@ -121,27 +121,27 @@ class SegmentationTest(test_util.TensorFlowTestCase):
                  [0, 6, 6, 0],
                  [8, 0, 6, 0],
                  [0, 0, 6, 6]]]  # pyformat: disable
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           image_ops.connected_components(math_ops.cast(
               images, dtypes.bool)).eval(), expected)
 
   def testZeros(self):
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           image_ops.connected_components(
               array_ops.zeros((100, 20, 50), dtypes.bool)).eval(),
           np.zeros((100, 20, 50)))
 
   def testOnes(self):
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           image_ops.connected_components(
               array_ops.ones((100, 20, 50), dtypes.bool)).eval(),
           np.tile(np.arange(100)[:, None, None] + 1, [1, 20, 50]))
 
   def testOnes_small(self):
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           image_ops.connected_components(array_ops.ones((3, 5),
                                                         dtypes.bool)).eval(),
@@ -153,7 +153,7 @@ class SegmentationTest(test_util.TensorFlowTestCase):
     expected = connected_components_reference_implementation(images)
     if expected is None:
       return
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           image_ops.connected_components(images).eval(), expected)
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py
index 3f4029e558..e5980c53b2 100644
--- a/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py
@@ -47,7 +47,7 @@ class SingleImageRandomDotStereogramsTest(test_util.TensorFlowTestCase):
         normalize=True)
     shape_1 = sirds_1.get_shape().as_list()
     self.assertEqual(shape_1, [768, 1024, 1])
-    with self.test_session():
+    with self.cached_session():
       r_tf_1 = sirds_1.eval()
       self.assertAllEqual(shape_1, r_tf_1.shape)
 
@@ -59,7 +59,7 @@ class SingleImageRandomDotStereogramsTest(test_util.TensorFlowTestCase):
         normalize=True)
     shape_2 = sirds_2.get_shape().as_list()
     self.assertEqual(shape_2, [768, 1024, 3])
-    with self.test_session():
+    with self.cached_session():
       r_tf_2 = sirds_2.eval()
       self.assertAllEqual(shape_2, r_tf_2.shape)
 
@@ -73,7 +73,7 @@ class SingleImageRandomDotStereogramsTest(test_util.TensorFlowTestCase):
         output_image_shape=[1200, 800, 1])
     shape_3 = sirds_3.get_shape().as_list()
     self.assertEqual(shape_3, [800, 1200, 1])
-    with self.test_session():
+    with self.cached_session():
       r_tf_3 = sirds_3.eval()
       self.assertAllEqual(shape_3, r_tf_3.shape)
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py
index 0135c66e29..ce9e34df73 100644
--- a/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py
@@ -107,7 +107,7 @@ class SparseImageWarpTest(test_util.TensorFlowTestCase):
         regularization_weight=regularization,
         num_boundary_points=num_boundary_points)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       warped_image, input_image, _ = sess.run(
           [warped_image_op, input_image_op, flow_field])
 
@@ -149,7 +149,7 @@ class SparseImageWarpTest(test_util.TensorFlowTestCase):
         interpolation_order=order,
         num_boundary_points=num_boundary_points)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       warped_image, input_image, flow = sess.run(
           [warped_image_op, input_image_op, flow_field])
       # Check that it moved the pixel correctly.
@@ -176,7 +176,7 @@ class SparseImageWarpTest(test_util.TensorFlowTestCase):
     test_data_dir = test.test_src_dir_path('contrib/image/python/'
                                            'kernel_tests/test_data/')
     input_file = test_data_dir + 'Yellow_Smiley_Face.png'
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       input_image = self.load_image(input_file, sess)
     control_points = np.asarray([[64, 59], [180 - 64, 59], [39, 111],
                                  [180 - 39, 111], [90, 143], [58, 134],
@@ -199,7 +199,7 @@ class SparseImageWarpTest(test_util.TensorFlowTestCase):
             control_points_op + control_point_displacements_op,
             interpolation_order=interpolation_order,
             num_boundary_points=num_boundary_points)
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
           warped_image = sess.run(warp_op)
           out_image = np.uint8(warped_image[0, :, :, :] * 255)
           target_file = (
@@ -244,7 +244,7 @@ class SparseImageWarpTest(test_util.TensorFlowTestCase):
     opt_func = optimizer.apply_gradients(zip(grad, [image]))
     init_op = variables.global_variables_initializer()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(init_op)
       for _ in range(5):
         sess.run([loss, opt_func])
diff --git a/tensorflow/contrib/image/python/ops/interpolate_spline.py b/tensorflow/contrib/image/python/ops/interpolate_spline.py
index daf8c56456..f0b408faa3 100644
--- a/tensorflow/contrib/image/python/ops/interpolate_spline.py
+++ b/tensorflow/contrib/image/python/ops/interpolate_spline.py
@@ -17,9 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
@@ -95,10 +92,22 @@ def _solve_interpolation(train_points, train_values, order,
   Returns:
     w: `[b, n, k]` weights on each interpolation center
     v: `[b, d, k]` weights on each input dimension
+  Raises:
+    ValueError: if d or k is not fully specified.
   """
 
-  b, n, d = train_points.get_shape().as_list()
-  _, _, k = train_values.get_shape().as_list()
+  # These dimensions are set dynamically at runtime.
+  b, n, _ = array_ops.unstack(array_ops.shape(train_points), num=3)
+
+  d = train_points.shape[-1]
+  if d.value is None:
+    raise ValueError('The dimensionality of the input points (d) must be '
+                     'statically-inferrable.')
+
+  k = train_values.shape[-1]
+  if k.value is None:
+    raise ValueError('The dimensionality of the output values (k) must be '
+                     'statically-inferrable.')
 
   # First, rename variables so that the notation (c, f, w, v, A, B, etc.)
   # follows https://en.wikipedia.org/wiki/Polyharmonic_spline.
@@ -113,14 +122,12 @@ def _solve_interpolation(train_points, train_values, order,
 
     matrix_a = _phi(_pairwise_squared_distance_matrix(c), order)  # [b, n, n]
     if regularization_weight > 0:
-      batch_identity_matrix = np.expand_dims(np.eye(n), 0)
-      batch_identity_matrix = constant_op.constant(
-          batch_identity_matrix, dtype=train_points.dtype)
-
+      batch_identity_matrix = array_ops.expand_dims(
+          linalg_ops.eye(n, dtype=c.dtype), 0)
       matrix_a += regularization_weight * batch_identity_matrix
 
     # Append ones to the feature values for the bias term in the linear model.
-    ones = array_ops.ones([b, n, 1], train_points.dtype)
+    ones = array_ops.ones_like(c[..., :1], dtype=c.dtype)
     matrix_b = array_ops.concat([c, ones], 2)  # [b, n, d + 1]
 
     # [b, n + d + 1, n]
@@ -164,9 +171,6 @@ def _apply_interpolation(query_points, train_points, w, v, order):
     Polyharmonic interpolation evaluated at points defined in query_points.
   """
 
-  batch_size = train_points.get_shape()[0].value
-  num_query_points = query_points.get_shape()[1].value
-
   # First, compute the contribution from the rbf term.
   pairwise_dists = _cross_squared_distance_matrix(query_points, train_points)
   phi_pairwise_dists = _phi(pairwise_dists, order)
@@ -177,7 +181,7 @@ def _apply_interpolation(query_points, train_points, w, v, order):
   # Pad query_points with ones, for the bias term in the linear model.
   query_points_pad = array_ops.concat([
       query_points,
-      array_ops.ones([batch_size, num_query_points, 1], train_points.dtype)
+      array_ops.ones_like(query_points[..., :1], train_points.dtype)
   ], 2)
   linear_term = math_ops.matmul(query_points_pad, v)
 
@@ -251,6 +255,9 @@ def interpolate_spline(train_points,
   Note the interpolation procedure is differentiable with respect to all inputs
   besides the order parameter.
 
+  We support dynamically-shaped inputs, where batch_size, n, and m are None
+  at graph construction time. However, d and k must be known.
+
   Args:
     train_points: `[batch_size, n, d]` float `Tensor` of n d-dimensional
       locations. These do not need to be regularly-spaced.
diff --git a/tensorflow/contrib/integrate/__init__.py b/tensorflow/contrib/integrate/__init__.py
index 694f0c14bd..3c37f152e5 100644
--- a/tensorflow/contrib/integrate/__init__.py
+++ b/tensorflow/contrib/integrate/__init__.py
@@ -15,7 +15,9 @@
 
 """Integration and ODE solvers.
 
-See the @{$python/contrib.integrate} guide.
+See the
+[Contrib Integrate](https://tensorflow.org/api_guides/python/contrib.integrate)
+guide.
 
 @@odeint
 @@odeint_fixed
diff --git a/tensorflow/contrib/kfac/BUILD b/tensorflow/contrib/kfac/BUILD
deleted file mode 100644
index b719046b37..0000000000
--- a/tensorflow/contrib/kfac/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-# Description:
-#   Contains KfacOptimizer, an implementation of the K-FAC optimization
-#   algorithm in TensorFlow.
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "kfac",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:curvature_matrix_vector_products_lib",
-        "//tensorflow/contrib/kfac/python/ops:fisher_blocks_lib",
-        "//tensorflow/contrib/kfac/python/ops:fisher_estimator_lib",
-        "//tensorflow/contrib/kfac/python/ops:fisher_factors_lib",
-        "//tensorflow/contrib/kfac/python/ops:kfac_optimizer_lib",
-        "//tensorflow/contrib/kfac/python/ops:layer_collection_lib",
-        "//tensorflow/contrib/kfac/python/ops:loss_functions_lib",
-        "//tensorflow/contrib/kfac/python/ops:op_queue_lib",
-        "//tensorflow/contrib/kfac/python/ops:utils_lib",
-        "//tensorflow/python:util",
-    ],
-)
diff --git a/tensorflow/contrib/kfac/README.md b/tensorflow/contrib/kfac/README.md
index 102626925d..42b91d0313 100644
--- a/tensorflow/contrib/kfac/README.md
+++ b/tensorflow/contrib/kfac/README.md
@@ -1,94 +1,3 @@
 # K-FAC: Kronecker-Factored Approximate Curvature
 
-# <font color="red", size=10><u>WARNING: </u></font>
-# ==third_party/tensorflow/contrib/kfac is deprecated. This will be==
-# ==removed on 15-07-2018. <!-- STY:begin_strip_and_replace -->Please import third_party/tensorflow_kfac.==
-# ==<!-- STY:end_strip_and_replace Please check https://github.com/tensorflow/kfac. -->==
-
-**K-FAC in TensorFlow** is an implementation of [K-FAC][kfac-paper], an
-approximate second-order optimization method, in TensorFlow. When applied to
-feedforward and convolutional neural networks, K-FAC can converge `>3.5x`
-faster in `>14x` fewer iterations than SGD with Momentum.
-
-[kfac-paper]: https://arxiv.org/abs/1503.05671
-
-## What is K-FAC?
-
-K-FAC, short for "Kronecker-factored Approximate Curvature", is an approximation
-to the [Natural Gradient][natural_gradient] algorithm designed specifically for
-neural networks. It maintains a block-diagonal approximation to the [Fisher
-Information matrix][fisher_information], whose inverse preconditions the
-gradient.
-
-K-FAC can be used in place of SGD, Adam, and other `Optimizer` implementations.
-Experimentally, K-FAC converges `>3.5x` faster than well-tuned SGD.
-
-Unlike most optimizers, K-FAC exploits structure in the model itself (e.g. "What
-are the weights for layer i?"). As such, you must add some additional code while
-constructing your model to use K-FAC.
-
-[natural_gradient]: http://www.mitpressjournals.org/doi/abs/10.1162/089976698300017746
-[fisher_information]: https://en.wikipedia.org/wiki/Fisher_information#Matrix_form
-
-## Why should I use K-FAC?
-
-K-FAC can take advantage of the curvature of the optimization problem, resulting
-in **faster training**. For an 8-layer Autoencoder, K-FAC converges to the same
-loss as SGD with Momentum in 3.8x fewer seconds and 14.7x fewer updates. See how
-training loss changes as a function of number of epochs, steps, and seconds:
-
-![autoencoder](g3doc/autoencoder.png)
-
-## Is K-FAC for me?
-
-If you have a feedforward or convolutional model for classification that is
-converging too slowly, K-FAC is for you. K-FAC can be used in your model if:
-
-*   Your model defines a posterior distribution.
-*   Your model uses only fully-connected or convolutional layers (residual
-    connections OK).
-*   You are training on CPU or GPU.
-*   You can modify model code to register layers with K-FAC.
-
-## How do I use K-FAC?
-
-Using K-FAC requires three steps:
-
-1.  Registering layer inputs, weights, and pre-activations with a
-    `LayerCollection`.
-1.  Minimizing the loss with a `KfacOptimizer`.
-1.  Keeping K-FAC's preconditioner updated.
-
-```python
-# Build model.
-w = tf.get_variable("w", ...)
-b = tf.get_variable("b", ...)
-logits = tf.matmul(x, w) + b
-loss = tf.reduce_mean(
-  tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
-
-# Register layers.
-layer_collection = LayerCollection()
-layer_collection.register_fully_connected((w, b), x, logits)
-layer_collection.register_categorical_predictive_distribution(logits)
-
-# Construct training ops.
-optimizer = KfacOptimizer(..., layer_collection=layer_collection)
-train_op = optimizer.minimize(loss)
-
-# Minimize loss.
-with tf.Session() as sess:
-  ...
-  sess.run([train_op, optimizer.cov_update_op, optimizer.inv_update_op])
-```
-
-See [`examples/`](https://www.tensorflow.org/code/tensorflow/contrib/kfac/examples/) for runnable, end-to-end illustrations.
-
-## Authors
-
-- Alok Aggarwal
-- Daniel Duckworth
-- James Martens
-- Matthew Johnson
-- Olga Wichrowska
-- Roger Grosse
+## KFAC moved to third_party/tensorflow_kfac.
diff --git a/tensorflow/contrib/kfac/__init__.py b/tensorflow/contrib/kfac/__init__.py
deleted file mode 100644
index 1ea354e6cd..0000000000
--- a/tensorflow/contrib/kfac/__init__.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Kronecker-factored Approximate Curvature Optimizer."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.kfac.python.ops import curvature_matrix_vector_products_lib as curvature_matrix_vector_products
-from tensorflow.contrib.kfac.python.ops import estimator_lib as estimator
-from tensorflow.contrib.kfac.python.ops import fisher_blocks_lib as fisher_blocks
-from tensorflow.contrib.kfac.python.ops import fisher_factors_lib as fisher_factors
-from tensorflow.contrib.kfac.python.ops import layer_collection_lib as layer_collection
-from tensorflow.contrib.kfac.python.ops import loss_functions_lib as loss_functions
-from tensorflow.contrib.kfac.python.ops import op_queue_lib as op_queue
-from tensorflow.contrib.kfac.python.ops import optimizer_lib as optimizer
-from tensorflow.contrib.kfac.python.ops import utils_lib as utils
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long
-
-_allowed_symbols = [
-    "curvature_matrix_vector_products",
-    "estimator",
-    "fisher_blocks",
-    "fisher_factors",
-    "layer_collection",
-    "loss_functions",
-    "op_queue",
-    "optimizer",
-    "utils",
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/examples/BUILD b/tensorflow/contrib/kfac/examples/BUILD
deleted file mode 100644
index 8186fa1c62..0000000000
--- a/tensorflow/contrib/kfac/examples/BUILD
+++ /dev/null
@@ -1,80 +0,0 @@
-package(default_visibility = [
-    "//learning/brain/contrib/kfac/examples:__subpackages__",
-    "//tensorflow/contrib/kfac/examples:__subpackages__",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "mlp_mnist_main",
-    srcs = ["mlp_mnist_main.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":mlp",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "mlp",
-    srcs = ["mlp.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":mnist",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_binary(
-    name = "convnet_mnist_single_main",
-    srcs = ["convnet_mnist_single_main.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":convnet",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_binary(
-    name = "convnet_mnist_multi_tower_main",
-    srcs = ["convnet_mnist_multi_tower_main.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":convnet",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_binary(
-    name = "convnet_mnist_distributed_main",
-    srcs = ["convnet_mnist_distributed_main.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":convnet",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-py_library(
-    name = "convnet",
-    srcs = ["convnet.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":mlp",
-        ":mnist",
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "mnist",
-    srcs = ["mnist.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py
deleted file mode 100644
index d6b1a61b71..0000000000
--- a/tensorflow/contrib/kfac/examples/convnet.py
+++ /dev/null
@@ -1,667 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Train a ConvNet on MNIST using K-FAC.
-
-This library fits a 5-layer ConvNet on MNIST using K-FAC. The model has the
-following structure,
-
-- Conv Layer: 5x5 kernel, 16 output channels.
-- Max Pool: 3x3 kernel, stride 2.
-- Conv Layer: 5x5 kernel, 16 output channels.
-- Max Pool: 3x3 kernel, stride 2.
-- Linear: 10 output dims.
-
-After 3k~6k steps, this should reach perfect accuracy on the training set.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.contrib.kfac.examples import mlp
-from tensorflow.contrib.kfac.examples import mnist
-from tensorflow.contrib.kfac.python.ops import optimizer as opt
-
-
-lc = tf.contrib.kfac.layer_collection
-oq = tf.contrib.kfac.op_queue
-opt = tf.contrib.kfac.optimizer
-
-__all__ = [
-    "conv_layer",
-    "max_pool_layer",
-    "linear_layer",
-    "build_model",
-    "minimize_loss_single_machine",
-    "distributed_grads_only_and_ops_chief_worker",
-    "distributed_grads_and_ops_dedicated_workers",
-    "train_mnist_single_machine",
-    "train_mnist_distributed_sync_replicas",
-    "train_mnist_multitower"
-]
-
-
-# Inverse update ops will be run every _INVERT_EVRY iterations.
-_INVERT_EVERY = 10
-
-
-def conv_layer(layer_id, inputs, kernel_size, out_channels):
-  """Builds a convolutional layer with ReLU non-linearity.
-
-  Args:
-    layer_id: int. Integer ID for this layer's variables.
-    inputs: Tensor of shape [num_examples, width, height, in_channels]. Each row
-      corresponds to a single example.
-    kernel_size: int. Width and height of the convolution kernel. The kernel is
-      assumed to be square.
-    out_channels: int. Number of output features per pixel.
-
-  Returns:
-    preactivations: Tensor of shape [num_examples, width, height, out_channels].
-      Values of the layer immediately before the activation function.
-    activations: Tensor of shape [num_examples, width, height, out_channels].
-      Values of the layer immediately after the activation function.
-    params: Tuple of (kernel, bias), parameters for this layer.
-  """
-  # TODO(b/67004004): Delete this function and rely on tf.layers exclusively.
-  layer = tf.layers.Conv2D(
-      out_channels,
-      kernel_size=[kernel_size, kernel_size],
-      kernel_initializer=tf.random_normal_initializer(stddev=0.01),
-      padding="SAME",
-      name="conv_%d" % layer_id)
-  preactivations = layer(inputs)
-  activations = tf.nn.relu(preactivations)
-
-  # layer.weights is a list. This converts it a (hashable) tuple.
-  return preactivations, activations, (layer.kernel, layer.bias)
-
-
-def max_pool_layer(layer_id, inputs, kernel_size, stride):
-  """Build a max-pooling layer.
-
-  Args:
-    layer_id: int. Integer ID for this layer's variables.
-    inputs: Tensor of shape [num_examples, width, height, in_channels]. Each row
-      corresponds to a single example.
-    kernel_size: int. Width and height to pool over per input channel. The
-      kernel is assumed to be square.
-    stride: int. Step size between pooling operations.
-
-  Returns:
-    Tensor of shape [num_examples, width/stride, height/stride, out_channels].
-    Result of applying max pooling to 'inputs'.
-  """
-  # TODO(b/67004004): Delete this function and rely on tf.layers exclusively.
-  with tf.variable_scope("pool_%d" % layer_id):
-    return tf.nn.max_pool(
-        inputs, [1, kernel_size, kernel_size, 1], [1, stride, stride, 1],
-        padding="SAME",
-        name="pool")
-
-
-def linear_layer(layer_id, inputs, output_size):
-  """Builds the final linear layer for an MNIST classification problem.
-
-  Args:
-    layer_id: int. Integer ID for this layer's variables.
-    inputs: Tensor of shape [num_examples, width, height, in_channels]. Each row
-      corresponds to a single example.
-    output_size: int. Number of output dims per example.
-
-  Returns:
-    activations: Tensor of shape [num_examples, output_size]. Values of the
-      layer immediately after the activation function.
-    params: Tuple of (weights, bias), parameters for this layer.
-  """
-  # TODO(b/67004004): Delete this function and rely on tf.layers exclusively.
-  pre, _, params = mlp.fc_layer(layer_id, inputs, output_size)
-  return pre, params
-
-
-def build_model(examples, labels, num_labels, layer_collection):
-  """Builds a ConvNet classification model.
-
-  Args:
-    examples: Tensor of shape [num_examples, num_features]. Represents inputs of
-      model.
-    labels: Tensor of shape [num_examples]. Contains integer IDs to be predicted
-      by softmax for each example.
-    num_labels: int. Number of distinct values 'labels' can take on.
-    layer_collection: LayerCollection instance. Layers will be registered here.
-
-  Returns:
-    loss: 0-D Tensor representing loss to be minimized.
-    accuracy: 0-D Tensor representing model's accuracy.
-  """
-  # Build a ConvNet. For each layer with parameters, we'll keep track of the
-  # preactivations, activations, weights, and bias.
-  tf.logging.info("Building model.")
-  pre0, act0, params0 = conv_layer(
-      layer_id=0, inputs=examples, kernel_size=5, out_channels=16)
-  act1 = max_pool_layer(layer_id=1, inputs=act0, kernel_size=3, stride=2)
-  pre2, act2, params2 = conv_layer(
-      layer_id=2, inputs=act1, kernel_size=5, out_channels=16)
-  act3 = max_pool_layer(layer_id=3, inputs=act2, kernel_size=3, stride=2)
-  flat_act3 = tf.reshape(act3, shape=[-1, int(np.prod(act3.shape[1:4]))])
-  logits, params4 = linear_layer(
-      layer_id=4, inputs=flat_act3, output_size=num_labels)
-  loss = tf.reduce_mean(
-      tf.nn.sparse_softmax_cross_entropy_with_logits(
-          labels=labels, logits=logits))
-  accuracy = tf.reduce_mean(
-      tf.cast(tf.equal(labels, tf.argmax(logits, axis=1)), dtype=tf.float32))
-
-  with tf.device("/cpu:0"):
-    tf.summary.scalar("loss", loss)
-    tf.summary.scalar("accuracy", accuracy)
-
-  # Register parameters. K-FAC needs to know about the inputs, outputs, and
-  # parameters of each conv/fully connected layer and the logits powering the
-  # posterior probability over classes.
-  tf.logging.info("Building LayerCollection.")
-  layer_collection.register_conv2d(params0, (1, 1, 1, 1), "SAME", examples,
-                                   pre0)
-  layer_collection.register_conv2d(params2, (1, 1, 1, 1), "SAME", act1, pre2)
-  layer_collection.register_fully_connected(params4, flat_act3, logits)
-  layer_collection.register_categorical_predictive_distribution(
-      logits, name="logits")
-
-  return loss, accuracy
-
-
-def minimize_loss_single_machine(loss,
-                                 accuracy,
-                                 layer_collection,
-                                 device="/gpu:0",
-                                 session_config=None):
-  """Minimize loss with K-FAC on a single machine.
-
-  A single Session is responsible for running all of K-FAC's ops. The covariance
-  and inverse update ops are placed on `device`. All model variables are on CPU.
-
-  Args:
-    loss: 0-D Tensor. Loss to be minimized.
-    accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
-    layer_collection: LayerCollection instance describing model architecture.
-      Used by K-FAC to construct preconditioner.
-    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and invserse
-      update ops are run on this device.
-    session_config: None or tf.ConfigProto. Configuration for tf.Session().
-
-  Returns:
-    final value for 'accuracy'.
-  """
-  # Train with K-FAC.
-  g_step = tf.train.get_or_create_global_step()
-  optimizer = opt.KfacOptimizer(
-      learning_rate=0.0001,
-      cov_ema_decay=0.95,
-      damping=0.001,
-      layer_collection=layer_collection,
-      placement_strategy="round_robin",
-      cov_devices=[device],
-      inv_devices=[device],
-      momentum=0.9)
-  (cov_update_thunks,
-   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
-
-  def make_update_op(update_thunks):
-    update_ops = [thunk() for thunk in update_thunks]
-    return tf.group(*update_ops)
-
-  cov_update_op = make_update_op(cov_update_thunks)
-  with tf.control_dependencies([cov_update_op]):
-    inverse_op = tf.cond(
-        tf.equal(tf.mod(g_step, _INVERT_EVERY), 0),
-        lambda: make_update_op(inv_update_thunks), tf.no_op)
-    with tf.control_dependencies([inverse_op]):
-      with tf.device(device):
-        train_op = optimizer.minimize(loss, global_step=g_step)
-
-  tf.logging.info("Starting training.")
-  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
-    while not sess.should_stop():
-      global_step_, loss_, accuracy_, _ = sess.run(
-          [g_step, loss, accuracy, train_op])
-
-      if global_step_ % _INVERT_EVERY == 0:
-        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
-                        global_step_, loss_, accuracy_)
-
-  return accuracy_
-
-
-def _is_gradient_task(task_id, num_tasks):
-  """Returns True if this task should update the weights."""
-  if num_tasks < 3:
-    return True
-  return 0 <= task_id < 0.6 * num_tasks
-
-
-def _is_cov_update_task(task_id, num_tasks):
-  """Returns True if this task should update K-FAC's covariance matrices."""
-  if num_tasks < 3:
-    return False
-  return 0.6 * num_tasks <= task_id < num_tasks - 1
-
-
-def _is_inv_update_task(task_id, num_tasks):
-  """Returns True if this task should update K-FAC's preconditioner."""
-  if num_tasks < 3:
-    return False
-  return task_id == num_tasks - 1
-
-
-def _num_gradient_tasks(num_tasks):
-  """Number of tasks that will update weights."""
-  if num_tasks < 3:
-    return num_tasks
-  return int(np.ceil(0.6 * num_tasks))
-
-
-def _make_distributed_train_op(
-    task_id,
-    num_worker_tasks,
-    num_ps_tasks,
-    layer_collection
-):
-  """Creates optimizer and distributed training op.
-
-  Constructs KFAC optimizer and wraps it in `sync_replicas` optimizer. Makes
-  the train op.
-
-  Args:
-   task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
-    num_worker_tasks: int. Number of workers in this distributed training setup.
-    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
-      parameter servers are not used.
-    layer_collection: LayerCollection instance describing model architecture.
-      Used by K-FAC to construct preconditioner.
-
-  Returns:
-    sync_optimizer: `tf.train.SyncReplicasOptimizer` instance which wraps KFAC
-      optimizer.
-    optimizer: Instance of `opt.KfacOptimizer`.
-    global_step: `tensor`, Global step.
-  """
-  tf.logging.info("Task id : %d", task_id)
-  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
-    global_step = tf.train.get_or_create_global_step()
-    optimizer = opt.KfacOptimizer(
-        learning_rate=0.0001,
-        cov_ema_decay=0.95,
-        damping=0.001,
-        layer_collection=layer_collection,
-        momentum=0.9)
-    sync_optimizer = tf.train.SyncReplicasOptimizer(
-        opt=optimizer,
-        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks),
-        total_num_replicas=num_worker_tasks)
-    return sync_optimizer, optimizer, global_step
-
-
-def distributed_grads_only_and_ops_chief_worker(
-    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
-    loss, accuracy, layer_collection, invert_every=10):
-  """Minimize loss with a synchronous implementation of K-FAC.
-
-  All workers perform gradient computation. Chief worker applies gradient after
-  averaging the gradients obtained from all the workers. All workers block
-  execution until the update is applied. Chief worker runs covariance and
-  inverse update ops. Covariance and inverse matrices are placed on parameter
-  servers in a round robin manner. For further details on synchronous
-  distributed optimization check `tf.train.SyncReplicasOptimizer`.
-
-  Args:
-    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
-    is_chief: `boolean`, `True` if the worker is chief worker.
-    num_worker_tasks: int. Number of workers in this distributed training setup.
-    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
-      parameter servers are not used.
-    master: string. IP and port of TensorFlow runtime process. Set to empty
-      string to run locally.
-    checkpoint_dir: string or None. Path to store checkpoints under.
-    loss: 0-D Tensor. Loss to be minimized.
-    accuracy: dict mapping strings to 0-D Tensors. Additional accuracy to
-      run with each step.
-    layer_collection: LayerCollection instance describing model architecture.
-      Used by K-FAC to construct preconditioner.
-    invert_every: `int`, Number of steps between update the inverse.
-
-  Returns:
-    final value for 'accuracy'.
-
-  Raises:
-    ValueError: if task_id >= num_worker_tasks.
-  """
-
-  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
-      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
-  (cov_update_thunks,
-   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
-
-  tf.logging.info("Starting training.")
-  hooks = [sync_optimizer.make_session_run_hook(is_chief)]
-
-  def make_update_op(update_thunks):
-    update_ops = [thunk() for thunk in update_thunks]
-    return tf.group(*update_ops)
-
-  if is_chief:
-    cov_update_op = make_update_op(cov_update_thunks)
-    with tf.control_dependencies([cov_update_op]):
-      inverse_op = tf.cond(
-          tf.equal(tf.mod(global_step, invert_every), 0),
-          lambda: make_update_op(inv_update_thunks),
-          tf.no_op)
-      with tf.control_dependencies([inverse_op]):
-        train_op = sync_optimizer.minimize(loss, global_step=global_step)
-  else:
-    train_op = sync_optimizer.minimize(loss, global_step=global_step)
-
-  with tf.train.MonitoredTrainingSession(
-      master=master,
-      is_chief=is_chief,
-      checkpoint_dir=checkpoint_dir,
-      hooks=hooks,
-      stop_grace_period_secs=0) as sess:
-    while not sess.should_stop():
-      global_step_, loss_, accuracy_, _ = sess.run(
-          [global_step, loss, accuracy, train_op])
-      tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_,
-                      loss_, accuracy_)
-  return accuracy_
-
-
-def distributed_grads_and_ops_dedicated_workers(
-    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
-    loss, accuracy, layer_collection):
-  """Minimize loss with a synchronous implementation of K-FAC.
-
-  Different workers are responsible for different parts of K-FAC's Ops. The
-  first 60% of tasks compute gradients; the next 20% accumulate covariance
-  statistics; the last 20% invert the matrices used to precondition gradients.
-  The chief worker applies the gradient .
-
-  Args:
-    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
-    is_chief: `boolean`, `True` if the worker is chief worker.
-    num_worker_tasks: int. Number of workers in this distributed training setup.
-    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
-      parameter servers are not used.
-    master: string. IP and port of TensorFlow runtime process. Set to empty
-      string to run locally.
-    checkpoint_dir: string or None. Path to store checkpoints under.
-    loss: 0-D Tensor. Loss to be minimized.
-    accuracy: dict mapping strings to 0-D Tensors. Additional accuracy to
-      run with each step.
-    layer_collection: LayerCollection instance describing model architecture.
-      Used by K-FAC to construct preconditioner.
-
-  Returns:
-    final value for 'accuracy'.
-
-  Raises:
-    ValueError: if task_id >= num_worker_tasks.
-  """
-  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
-      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
-  _, cov_update_op, inv_update_ops, _, _, _ = optimizer.make_ops_and_vars()
-  train_op = sync_optimizer.minimize(loss, global_step=global_step)
-  inv_update_queue = oq.OpQueue(inv_update_ops)
-
-  tf.logging.info("Starting training.")
-  is_chief = (task_id == 0)
-  hooks = [sync_optimizer.make_session_run_hook(is_chief)]
-  with tf.train.MonitoredTrainingSession(
-      master=master,
-      is_chief=is_chief,
-      checkpoint_dir=checkpoint_dir,
-      hooks=hooks,
-      stop_grace_period_secs=0) as sess:
-    while not sess.should_stop():
-      # Choose which op this task is responsible for running.
-      if _is_gradient_task(task_id, num_worker_tasks):
-        learning_op = train_op
-      elif _is_cov_update_task(task_id, num_worker_tasks):
-        learning_op = cov_update_op
-      elif _is_inv_update_task(task_id, num_worker_tasks):
-        # TODO(duckworthd): Running this op before cov_update_op has been run a
-        # few times can result in "InvalidArgumentError: Cholesky decomposition
-        # was not successful." Delay running this op until cov_update_op has
-        # been run a few times.
-        learning_op = inv_update_queue.next_op(sess)
-      else:
-        raise ValueError("Which op should task %d do?" % task_id)
-
-      global_step_, loss_, accuracy_, _ = sess.run(
-          [global_step, loss, accuracy, learning_op])
-      tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_,
-                      loss_, accuracy_)
-
-  return accuracy_
-
-
-def train_mnist_single_machine(data_dir,
-                               num_epochs,
-                               use_fake_data=False,
-                               device="/gpu:0"):
-  """Train a ConvNet on MNIST.
-
-  Args:
-    data_dir: string. Directory to read MNIST examples from.
-    num_epochs: int. Number of passes to make over the training set.
-    use_fake_data: bool. If True, generate a synthetic dataset.
-    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and inverse
-      update ops are run on this device.
-
-  Returns:
-    accuracy of model on the final minibatch of training data.
-  """
-  # Load a dataset.
-  tf.logging.info("Loading MNIST into memory.")
-  examples, labels = mnist.load_mnist(
-      data_dir,
-      num_epochs=num_epochs,
-      batch_size=128,
-      use_fake_data=use_fake_data,
-      flatten_images=False)
-
-  # Build a ConvNet.
-  layer_collection = lc.LayerCollection()
-  loss, accuracy = build_model(
-      examples, labels, num_labels=10, layer_collection=layer_collection)
-
-  # Fit model.
-  return minimize_loss_single_machine(
-      loss, accuracy, layer_collection, device=device)
-
-
-def train_mnist_multitower(data_dir, num_epochs, num_towers,
-                           use_fake_data=True, devices=None):
-  """Train a ConvNet on MNIST.
-
-  Training data is split equally among the towers. Each tower computes loss on
-  its own batch of data and the loss is aggregated on the CPU. The model
-  variables are placed on first tower. The covariance and inverse update ops
-  and variables are placed on GPUs in a round robin manner.
-
-  Args:
-    data_dir: string. Directory to read MNIST examples from.
-    num_epochs: int. Number of passes to make over the training set.
-    num_towers: int. Number of CPUs to split inference across.
-    use_fake_data: bool. If True, generate a synthetic dataset.
-    devices: string, Either list of CPU or GPU. The covaraince and inverse
-      update ops are run on this device.
-
-  Returns:
-    accuracy of model on the final minibatch of training data.
-  """
-  if devices:
-    device_count = {"GPU": num_towers}
-  else:
-    device_count = {"CPU": num_towers}
-
-  devices = devices or [
-      "/cpu:{}".format(tower_id) for tower_id in range(num_towers)
-  ]
-  # Load a dataset.
-  tf.logging.info("Loading MNIST into memory.")
-  tower_batch_size = 128
-  batch_size = tower_batch_size * num_towers
-  tf.logging.info(
-      ("Loading MNIST into memory. Using batch_size = %d = %d towers * %d "
-       "tower batch size.") % (batch_size, num_towers, tower_batch_size))
-  examples, labels = mnist.load_mnist(
-      data_dir,
-      num_epochs=num_epochs,
-      batch_size=batch_size,
-      use_fake_data=use_fake_data,
-      flatten_images=False)
-
-  # Split minibatch across towers.
-  examples = tf.split(examples, num_towers)
-  labels = tf.split(labels, num_towers)
-
-  # Build an MLP. Each tower's layers will be added to the LayerCollection.
-  layer_collection = lc.LayerCollection()
-  tower_results = []
-  for tower_id in range(num_towers):
-    with tf.device(devices[tower_id]):
-      with tf.name_scope("tower%d" % tower_id):
-        with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
-          tf.logging.info("Building tower %d." % tower_id)
-          tower_results.append(
-              build_model(examples[tower_id], labels[tower_id], 10,
-                          layer_collection))
-  losses, accuracies = zip(*tower_results)
-
-  # Average across towers.
-  loss = tf.reduce_mean(losses)
-  accuracy = tf.reduce_mean(accuracies)
-
-  # Fit model.
-
-  session_config = tf.ConfigProto(
-      allow_soft_placement=False,
-      device_count=device_count,
-  )
-
-  g_step = tf.train.get_or_create_global_step()
-  optimizer = opt.KfacOptimizer(
-      learning_rate=0.0001,
-      cov_ema_decay=0.95,
-      damping=0.001,
-      layer_collection=layer_collection,
-      placement_strategy="round_robin",
-      cov_devices=devices,
-      inv_devices=devices,
-      momentum=0.9)
-  (cov_update_thunks,
-   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
-
-  def make_update_op(update_thunks):
-    update_ops = [thunk() for thunk in update_thunks]
-    return tf.group(*update_ops)
-
-  cov_update_op = make_update_op(cov_update_thunks)
-  with tf.control_dependencies([cov_update_op]):
-    inverse_op = tf.cond(
-        tf.equal(tf.mod(g_step, _INVERT_EVERY), 0),
-        lambda: make_update_op(inv_update_thunks), tf.no_op)
-    with tf.control_dependencies([inverse_op]):
-      train_op = optimizer.minimize(loss, global_step=g_step)
-
-  tf.logging.info("Starting training.")
-  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
-    while not sess.should_stop():
-      global_step_, loss_, accuracy_, _ = sess.run(
-          [g_step, loss, accuracy, train_op])
-
-      if global_step_ % _INVERT_EVERY == 0:
-        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
-                        global_step_, loss_, accuracy_)
-
-
-def train_mnist_distributed_sync_replicas(task_id,
-                                          is_chief,
-                                          num_worker_tasks,
-                                          num_ps_tasks,
-                                          master,
-                                          data_dir,
-                                          num_epochs,
-                                          op_strategy,
-                                          use_fake_data=False):
-  """Train a ConvNet on MNIST using Sync replicas optimizer.
-
-  Args:
-    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
-    is_chief: `boolean`, `True` if the worker is chief worker.
-    num_worker_tasks: int. Number of workers in this distributed training setup.
-    num_ps_tasks: int. Number of parameter servers holding variables.
-    master: string. IP and port of TensorFlow runtime process.
-    data_dir: string. Directory to read MNIST examples from.
-    num_epochs: int. Number of passes to make over the training set.
-    op_strategy: `string`, Strategy to run the covariance and inverse
-      ops. If op_strategy == `chief_worker` then covaraiance and inverse
-      update ops are run on chief worker otherwise they are run on dedicated
-      workers.
-
-    use_fake_data: bool. If True, generate a synthetic dataset.
-
-  Returns:
-    accuracy of model on the final minibatch of training data.
-
-  Raises:
-    ValueError: If `op_strategy` not in ["chief_worker", "dedicated_workers"].
-  """
-  # Load a dataset.
-  tf.logging.info("Loading MNIST into memory.")
-  examples, labels = mnist.load_mnist(
-      data_dir,
-      num_epochs=num_epochs,
-      batch_size=128,
-      use_fake_data=use_fake_data,
-      flatten_images=False)
-
-  # Build a ConvNet.
-  layer_collection = lc.LayerCollection()
-  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
-    loss, accuracy = build_model(
-        examples, labels, num_labels=10, layer_collection=layer_collection)
-
-  # Fit model.
-  checkpoint_dir = None if data_dir is None else os.path.join(data_dir, "kfac")
-  if op_strategy == "chief_worker":
-    return distributed_grads_only_and_ops_chief_worker(
-        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
-        checkpoint_dir, loss, accuracy, layer_collection)
-  elif op_strategy == "dedicated_workers":
-    return distributed_grads_and_ops_dedicated_workers(
-        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
-        checkpoint_dir, loss, accuracy, layer_collection)
-  else:
-    raise ValueError("Only supported op strategies are : {}, {}".format(
-        "chief_worker", "dedicated_workers"))
-
-
-if __name__ == "__main__":
-  tf.app.run()
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
deleted file mode 100644
index b4c2d4a9e9..0000000000
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Train a ConvNet on MNIST using K-FAC.
-
-Distributed training with sync replicas optimizer. See
-`convnet.train_mnist_distributed_sync_replicas` for details.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-from absl import flags
-import tensorflow as tf
-
-from tensorflow.contrib.kfac.examples import convnet
-
-FLAGS = flags.FLAGS
-flags.DEFINE_integer("task", -1, "Task identifier")
-flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
-flags.DEFINE_string(
-    "cov_inv_op_strategy", "chief_worker",
-    "In dist training mode run the cov, inv ops on chief or dedicated workers."
-)
-flags.DEFINE_string("master", "local", "Session master.")
-flags.DEFINE_integer("ps_tasks", 2,
-                     "Number of tasks in the parameter server job.")
-flags.DEFINE_integer("replicas_to_aggregate", 5,
-                     "Number of replicas to aggregate.")
-flags.DEFINE_integer("worker_replicas", 5, "Number of replicas in worker job.")
-flags.DEFINE_integer("num_epochs", None, "Number of epochs.")
-
-
-def _is_chief():
-  """Determines whether a job is the chief worker."""
-  if "chief_worker" in FLAGS.brain_jobs:
-    return FLAGS.brain_job_name == "chief_worker"
-  else:
-    return FLAGS.task == 0
-
-
-def main(unused_argv):
-  _ = unused_argv
-  convnet.train_mnist_distributed_sync_replicas(
-      FLAGS.task, _is_chief(), FLAGS.worker_replicas, FLAGS.ps_tasks,
-      FLAGS.master, FLAGS.data_dir, FLAGS.num_epochs, FLAGS.cov_inv_op_strategy)
-
-if __name__ == "__main__":
-  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
deleted file mode 100644
index 4249bf8a8d..0000000000
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Train a ConvNet on MNIST using K-FAC.
-
-Multi tower training mode. See `convnet.train_mnist_multitower` for details.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-from absl import flags
-import tensorflow as tf
-
-from tensorflow.contrib.kfac.examples import convnet
-
-FLAGS = flags.FLAGS
-flags.DEFINE_string("data_dir", "/tmp/multitower_1/mnist", "local mnist dir")
-flags.DEFINE_integer("num_towers", 2,
-                     "Number of towers for multi tower training.")
-
-
-def main(unused_argv):
-  _ = unused_argv
-  assert FLAGS.num_towers > 1
-  devices = ["/gpu:{}".format(tower_id) for tower_id in range(FLAGS.num_towers)]
-  convnet.train_mnist_multitower(
-      FLAGS.data_dir,
-      num_epochs=200,
-      num_towers=FLAGS.num_towers,
-      devices=devices)
-
-
-if __name__ == "__main__":
-  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/mlp.py b/tensorflow/contrib/kfac/examples/mlp.py
deleted file mode 100644
index ea2b252a05..0000000000
--- a/tensorflow/contrib/kfac/examples/mlp.py
+++ /dev/null
@@ -1,354 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Train an MLP on MNIST using K-FAC.
-
-This library fits a 3-layer, tanh-activated MLP on MNIST using K-FAC. After
-~25k steps, this should reach perfect accuracy on the training set.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from tensorflow.contrib.kfac.examples import mnist
-
-lc = tf.contrib.kfac.layer_collection
-opt = tf.contrib.kfac.optimizer
-
-__all__ = [
-    "fc_layer",
-    "train_mnist",
-    "train_mnist_multitower",
-]
-
-
-def fc_layer(layer_id, inputs, output_size):
-  """Builds a fully connected layer.
-
-  Args:
-    layer_id: int. Integer ID for this layer's variables.
-    inputs: Tensor of shape [num_examples, input_size]. Each row corresponds
-      to a single example.
-    output_size: int. Number of output dimensions after fully connected layer.
-
-  Returns:
-    preactivations: Tensor of shape [num_examples, output_size]. Values of the
-      layer immediately before the activation function.
-    activations: Tensor of shape [num_examples, output_size]. Values of the
-      layer immediately after the activation function.
-    params: Tuple of (weights, bias), parameters for this layer.
-  """
-  # TODO(b/67004004): Delete this function and rely on tf.layers exclusively.
-  layer = tf.layers.Dense(
-      output_size,
-      kernel_initializer=tf.random_normal_initializer(),
-      name="fc_%d" % layer_id)
-  preactivations = layer(inputs)
-  activations = tf.nn.tanh(preactivations)
-
-  # layer.weights is a list. This converts it a (hashable) tuple.
-  return preactivations, activations, (layer.kernel, layer.bias)
-
-
-def build_model(examples, labels, num_labels, layer_collection):
-  """Builds an MLP classification model.
-
-  Args:
-    examples: Tensor of shape [num_examples, num_features]. Represents inputs of
-      model.
-    labels: Tensor of shape [num_examples]. Contains integer IDs to be predicted
-      by softmax for each example.
-    num_labels: int. Number of distinct values 'labels' can take on.
-    layer_collection: LayerCollection instance describing model architecture.
-
-  Returns:
-    loss: 0-D Tensor representing loss to be minimized.
-    accuracy: 0-D Tensor representing model's accuracy.
-  """
-  # Build an MLP. For each layer, we'll keep track of the preactivations,
-  # activations, weights, and bias.
-  pre0, act0, params0 = fc_layer(layer_id=0, inputs=examples, output_size=128)
-  pre1, act1, params1 = fc_layer(layer_id=1, inputs=act0, output_size=64)
-  pre2, act2, params2 = fc_layer(layer_id=2, inputs=act1, output_size=32)
-  logits, _, params3 = fc_layer(layer_id=3, inputs=act2, output_size=num_labels)
-  loss = tf.reduce_mean(
-      tf.nn.sparse_softmax_cross_entropy_with_logits(
-          labels=labels, logits=logits))
-  accuracy = tf.reduce_mean(
-      tf.cast(tf.equal(labels, tf.argmax(logits, axis=1)), dtype=tf.float32))
-
-  # Register parameters. K-FAC needs to know about the inputs, outputs, and
-  # parameters of each layer and the logits powering the posterior probability
-  # over classes.
-  tf.logging.info("Building LayerCollection.")
-  layer_collection.register_fully_connected(params0, examples, pre0)
-  layer_collection.register_fully_connected(params1, act0, pre1)
-  layer_collection.register_fully_connected(params2, act1, pre2)
-  layer_collection.register_fully_connected(params3, act2, logits)
-  layer_collection.register_categorical_predictive_distribution(
-      logits, name="logits")
-
-  return loss, accuracy
-
-
-def minimize(loss, accuracy, layer_collection, num_towers, session_config=None):
-  """Minimize 'loss' with KfacOptimizer.
-
-  Args:
-    loss: 0-D Tensor. Loss to be minimized.
-    accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
-    layer_collection: LayerCollection instance. Describes layers in model.
-    num_towers: int. Number of CPUs to split minibatch across.
-    session_config: tf.ConfigProto. Configuration for tf.Session().
-
-  Returns:
-    accuracy of classifier on final minibatch.
-  """
-  devices = tuple("/cpu:%d" % tower_id for tower_id in range(num_towers))
-
-  # Train with K-FAC. We'll use a decreasing learning rate that's cut in 1/2
-  # every 10k iterations.
-  tf.logging.info("Building KFAC Optimizer.")
-  global_step = tf.train.get_or_create_global_step()
-  optimizer = opt.KfacOptimizer(
-      learning_rate=tf.train.exponential_decay(
-          0.00002, global_step, 10000, 0.5, staircase=True),
-      cov_ema_decay=0.95,
-      damping=0.0005,
-      layer_collection=layer_collection,
-      momentum=0.99,
-      placement_strategy="round_robin",
-      cov_devices=devices,
-      inv_devices=devices)
-
-  (cov_update_thunks,
-   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
-
-  def make_update_op(update_thunks):
-    update_ops = [thunk() for thunk in update_thunks]
-    return tf.group(*update_ops)
-
-  # TODO(b/78537047): change (some) examples to use PeriodicInvCovUpdateKfacOpt
-  # once that gets moved over?  Could still leave more advanced examples as they
-  # are (e.g. train_mnist_estimator in this file)
-
-  cov_update_op = make_update_op(cov_update_thunks)
-  with tf.control_dependencies([cov_update_op]):
-    # We update the inverses only every 20 iterations.
-    inverse_op = tf.cond(
-        tf.equal(tf.mod(global_step, 100), 0),
-        lambda: make_update_op(inv_update_thunks), tf.no_op)
-    with tf.control_dependencies([inverse_op]):
-      train_op = optimizer.minimize(loss, global_step=global_step)
-
-  tf.logging.info("Starting training.")
-  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
-    while not sess.should_stop():
-      global_step_, loss_, accuracy_, _ = sess.run(
-          [global_step, loss, accuracy, train_op])
-
-      if global_step_ % 100 == 0:
-        tf.logging.info("global_step: %d | loss: %f | accuracy: %f",
-                        global_step_, loss_, accuracy_)
-
-  return accuracy_
-
-
-def train_mnist(data_dir, num_epochs, use_fake_data=False):
-  """Train an MLP on MNIST.
-
-  Args:
-    data_dir: string. Directory to read MNIST examples from.
-    num_epochs: int. Number of passes to make over the training set.
-    use_fake_data: bool. If True, generate a synthetic dataset.
-
-  Returns:
-    accuracy of model on the final minibatch of training data.
-  """
-  # Load a dataset.
-  tf.logging.info("Loading MNIST into memory.")
-  examples, labels = mnist.load_mnist(
-      data_dir,
-      num_epochs=num_epochs,
-      batch_size=64,
-      flatten_images=True,
-      use_fake_data=use_fake_data)
-
-  # Build an MLP. The model's layers will be added to the LayerCollection.
-  tf.logging.info("Building model.")
-  layer_collection = lc.LayerCollection()
-  loss, accuracy = build_model(examples, labels, 10, layer_collection)
-
-  # Fit model.
-  minimize(loss, accuracy, layer_collection, 1)
-
-
-def train_mnist_multitower(data_dir,
-                           num_epochs,
-                           num_towers,
-                           use_fake_data=False):
-  """Train an MLP on MNIST, splitting the minibatch across multiple towers.
-
-  Args:
-    data_dir: string. Directory to read MNIST examples from.
-    num_epochs: int. Number of passes to make over the training set.
-    num_towers: int. Number of CPUs to split minibatch across.
-    use_fake_data: bool. If True, generate a synthetic dataset.
-
-  Returns:
-    accuracy of model on the final minibatch of training data.
-  """
-  # Load a dataset.
-  tower_batch_size = 64
-  batch_size = tower_batch_size * num_towers
-  tf.logging.info(
-      ("Loading MNIST into memory. Using batch_size = %d = %d towers * %d "
-       "tower batch size.") % (batch_size, num_towers, tower_batch_size))
-  examples, labels = mnist.load_mnist(
-      data_dir,
-      num_epochs=num_epochs,
-      batch_size=batch_size,
-      flatten_images=True,
-      use_fake_data=use_fake_data)
-
-  # Split minibatch across towers.
-  examples = tf.split(examples, num_towers)
-  labels = tf.split(labels, num_towers)
-
-  # Build an MLP. Each tower's layers will be added to the LayerCollection.
-  layer_collection = lc.LayerCollection()
-  tower_results = []
-  for tower_id in range(num_towers):
-    with tf.device("/cpu:%d" % tower_id):
-      with tf.name_scope("tower%d" % tower_id):
-        with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
-          tf.logging.info("Building tower %d." % tower_id)
-          tower_results.append(
-              build_model(examples[tower_id], labels[tower_id], 10,
-                          layer_collection))
-  losses, accuracies = zip(*tower_results)
-
-  # Average across towers.
-  loss = tf.reduce_mean(losses)
-  accuracy = tf.reduce_mean(accuracies)
-
-  # Fit model.
-  session_config = tf.ConfigProto(
-      allow_soft_placement=False, device_count={
-          "CPU": num_towers
-      })
-  return minimize(
-      loss, accuracy, layer_collection, num_towers,
-      session_config=session_config)
-
-
-def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False):
-  """Train an MLP on MNIST using tf.estimator.
-
-  Args:
-    data_dir: string. Directory to read MNIST examples from.
-    num_epochs: int. Number of passes to make over the training set.
-    use_fake_data: bool. If True, generate a synthetic dataset.
-
-  Returns:
-    accuracy of model on the final minibatch of training data.
-  """
-
-  # Load a dataset.
-  def input_fn():
-    tf.logging.info("Loading MNIST into memory.")
-    return mnist.load_mnist(
-        data_dir,
-        num_epochs=num_epochs,
-        batch_size=64,
-        flatten_images=True,
-        use_fake_data=use_fake_data)
-
-  def model_fn(features, labels, mode, params):
-    """Model function for MLP trained with K-FAC.
-
-    Args:
-      features: Tensor of shape [batch_size, input_size]. Input features.
-      labels: Tensor of shape [batch_size]. Target labels for training.
-      mode: tf.estimator.ModeKey. Must be TRAIN.
-      params: ignored.
-
-    Returns:
-      EstimatorSpec for training.
-
-    Raises:
-      ValueError: If 'mode' is anything other than TRAIN.
-    """
-    del params
-
-    if mode != tf.estimator.ModeKeys.TRAIN:
-      raise ValueError("Only training is supposed with this API.")
-
-    # Build a ConvNet.
-    layer_collection = lc.LayerCollection()
-    loss, accuracy = build_model(
-        features, labels, num_labels=10, layer_collection=layer_collection)
-
-    # Train with K-FAC.
-    global_step = tf.train.get_or_create_global_step()
-    optimizer = opt.KfacOptimizer(
-        learning_rate=tf.train.exponential_decay(
-            0.00002, global_step, 10000, 0.5, staircase=True),
-        cov_ema_decay=0.95,
-        damping=0.0001,
-        layer_collection=layer_collection,
-        momentum=0.99)
-
-    (cov_update_thunks,
-     inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
-
-    def make_update_op(update_thunks):
-      update_ops = [thunk() for thunk in update_thunks]
-      return tf.group(*update_ops)
-
-    def make_batch_executed_op(update_thunks, batch_size=1):
-      return tf.group(*tf.contrib.kfac.utils.batch_execute(
-          global_step, update_thunks, batch_size=batch_size))
-
-    # Run cov_update_op every step. Run 1 inv_update_ops per step.
-    cov_update_op = make_update_op(cov_update_thunks)
-    with tf.control_dependencies([cov_update_op]):
-      # But make sure to execute all the inverse ops on the first step
-      inverse_op = tf.cond(tf.equal(global_step, 0),
-                           lambda: make_update_op(inv_update_thunks),
-                           lambda: make_batch_executed_op(inv_update_thunks))
-      with tf.control_dependencies([inverse_op]):
-        train_op = optimizer.minimize(loss, global_step=global_step)
-
-    # Print metrics every 5 sec.
-    hooks = [
-        tf.train.LoggingTensorHook(
-            {
-                "loss": loss,
-                "accuracy": accuracy
-            }, every_n_secs=5),
-    ]
-    return tf.estimator.EstimatorSpec(
-        mode=mode, loss=loss, train_op=train_op, training_hooks=hooks)
-
-  run_config = tf.estimator.RunConfig(
-      model_dir="/tmp/mnist", save_checkpoints_steps=1, keep_checkpoint_max=100)
-
-  # Train until input_fn() is empty with Estimator. This is a prerequisite for
-  # TPU compatibility.
-  estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
-  estimator.train(input_fn=input_fn)
diff --git a/tensorflow/contrib/kfac/examples/mlp_mnist_main.py b/tensorflow/contrib/kfac/examples/mlp_mnist_main.py
deleted file mode 100644
index 9c34ade1d2..0000000000
--- a/tensorflow/contrib/kfac/examples/mlp_mnist_main.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Train an MLP on MNIST using K-FAC.
-
-See mlp.py for details.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import tensorflow as tf
-
-from tensorflow.contrib.kfac.examples import mlp
-
-FLAGS = None
-
-
-def main(argv):
-  _ = argv
-  if FLAGS.use_estimator:
-    if FLAGS.num_towers != 1:
-      raise ValueError("Only 1 device supported in tf.estimator example.")
-    mlp.train_mnist_estimator(FLAGS.data_dir, num_epochs=200)
-  elif FLAGS.num_towers > 1:
-    mlp.train_mnist_multitower(
-        FLAGS.data_dir, num_epochs=200, num_towers=FLAGS.num_towers)
-  else:
-    mlp.train_mnist(FLAGS.data_dir, num_epochs=200)
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      "--data_dir",
-      type=str,
-      default="/tmp/mnist",
-      help="Directory to store dataset in.")
-  parser.add_argument(
-      "--num_towers",
-      type=int,
-      default=1,
-      help="Number of CPUs to split minibatch across.")
-  parser.add_argument(
-      "--use_estimator",
-      action="store_true",
-      help="Use tf.estimator API to train.")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/kfac/examples/mnist.py b/tensorflow/contrib/kfac/examples/mnist.py
deleted file mode 100644
index 547c4ab25d..0000000000
--- a/tensorflow/contrib/kfac/examples/mnist.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for loading MNIST into TensorFlow."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-__all__ = [
-    'load_mnist',
-]
-
-
-def load_mnist(data_dir,
-               num_epochs,
-               batch_size,
-               flatten_images=True,
-               use_fake_data=False):
-  """Loads MNIST dataset into memory.
-
-  Args:
-    data_dir: string. Directory to read MNIST examples from.
-    num_epochs: int. Number of passes to make over the dataset.
-    batch_size: int. Number of examples per minibatch.
-    flatten_images: bool. If True, [28, 28, 1]-shaped images are flattened into
-      [784]-shaped vectors.
-    use_fake_data: bool. If True, generate a synthetic dataset rather than
-      reading MNIST in.
-
-  Returns:
-    examples: Tensor of shape [batch_size, 784] if 'flatten_images' is
-      True, else [batch_size, 28, 28, 1]. Each row is one example.
-      Values in [0, 1].
-    labels: Tensor of shape [batch_size]. Indices of integer corresponding to
-      each example. Values in {0...9}.
-  """
-  if use_fake_data:
-    rng = np.random.RandomState(42)
-    num_examples = batch_size * 4
-    images = rng.rand(num_examples, 28 * 28)
-    if not flatten_images:
-      images = np.reshape(images, [num_examples, 28, 28, 1])
-    labels = rng.randint(10, size=num_examples)
-  else:
-    mnist_data = tf.contrib.learn.datasets.mnist.read_data_sets(
-        data_dir, reshape=flatten_images)
-    num_examples = len(mnist_data.train.labels)
-    images = mnist_data.train.images
-    labels = mnist_data.train.labels
-
-  dataset = tf.data.Dataset.from_tensor_slices((np.asarray(
-      images, dtype=np.float32), np.asarray(labels, dtype=np.int64)))
-  return (dataset.repeat(num_epochs).shuffle(num_examples).batch(batch_size)
-          .make_one_shot_iterator().get_next())
diff --git a/tensorflow/contrib/kfac/examples/tests/BUILD b/tensorflow/contrib/kfac/examples/tests/BUILD
deleted file mode 100644
index ede7f183fe..0000000000
--- a/tensorflow/contrib/kfac/examples/tests/BUILD
+++ /dev/null
@@ -1,52 +0,0 @@
-package(default_visibility = ["//visibility:private"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-py_test(
-    name = "mlp_test",
-    size = "large",
-    srcs = ["mlp_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/kfac/examples:mlp",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "convnet_test",
-    size = "large",
-    srcs = ["convnet_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/kfac",
-        "//tensorflow/contrib/kfac/examples:convnet",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "mnist_test",
-    srcs = ["mnist_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/kfac/examples:mnist",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/contrib/kfac/examples/tests/convnet_test.py b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
deleted file mode 100644
index adecda7166..0000000000
--- a/tensorflow/contrib/kfac/examples/tests/convnet_test.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for convnet.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.contrib.kfac import layer_collection as lc
-from tensorflow.contrib.kfac.examples import convnet
-
-
-class ConvNetTest(tf.test.TestCase):
-
-  def testConvLayer(self):
-    with tf.Graph().as_default():
-      pre, act, (w, b) = convnet.conv_layer(
-          layer_id=1,
-          inputs=tf.zeros([5, 3, 3, 2]),
-          kernel_size=3,
-          out_channels=5)
-      self.assertShapeEqual(np.zeros([5, 3, 3, 5]), pre)
-      self.assertShapeEqual(np.zeros([5, 3, 3, 5]), act)
-      self.assertShapeEqual(np.zeros([3, 3, 2, 5]), tf.convert_to_tensor(w))
-      self.assertShapeEqual(np.zeros([5]), tf.convert_to_tensor(b))
-      self.assertIsInstance(w, tf.Variable)
-      self.assertIsInstance(b, tf.Variable)
-      self.assertIn("conv_1", w.op.name)
-      self.assertIn("conv_1", b.op.name)
-
-  def testMaxPoolLayer(self):
-    with tf.Graph().as_default():
-      act = convnet.max_pool_layer(
-          layer_id=1, inputs=tf.zeros([5, 6, 6, 2]), kernel_size=5, stride=3)
-      self.assertShapeEqual(np.zeros([5, 2, 2, 2]), act)
-      self.assertEqual(act.op.name, "pool_1/pool")
-
-  def testLinearLayer(self):
-    with tf.Graph().as_default():
-      act, (w, b) = convnet.linear_layer(
-          layer_id=1, inputs=tf.zeros([5, 20]), output_size=5)
-      self.assertShapeEqual(np.zeros([5, 5]), act)
-      self.assertShapeEqual(np.zeros([20, 5]), tf.convert_to_tensor(w))
-      self.assertShapeEqual(np.zeros([5]), tf.convert_to_tensor(b))
-      self.assertIsInstance(w, tf.Variable)
-      self.assertIsInstance(b, tf.Variable)
-      self.assertIn("fc_1", w.op.name)
-      self.assertIn("fc_1", b.op.name)
-
-  def testBuildModel(self):
-    with tf.Graph().as_default():
-      x = tf.placeholder(tf.float32, [None, 6, 6, 3])
-      y = tf.placeholder(tf.int64, [None])
-      layer_collection = lc.LayerCollection()
-      loss, accuracy = convnet.build_model(
-          x, y, num_labels=5, layer_collection=layer_collection)
-
-      # Ensure layers and logits were registered.
-      self.assertEqual(len(layer_collection.fisher_blocks), 3)
-      self.assertEqual(len(layer_collection.losses), 1)
-
-      # Ensure inference doesn't crash.
-      with self.test_session() as sess:
-        sess.run(tf.global_variables_initializer())
-        feed_dict = {
-            x: np.random.randn(10, 6, 6, 3).astype(np.float32),
-            y: np.random.randint(5, size=10).astype(np.int64),
-        }
-        sess.run([loss, accuracy], feed_dict=feed_dict)
-
-  def _build_toy_problem(self):
-    """Construct a toy linear regression problem.
-
-    Initial loss should be,
-      2.5 = 0.5 * (1^2 + 2^2)
-
-    Returns:
-      loss: 0-D Tensor representing loss to be minimized.
-      accuracy: 0-D Tensors representing model accuracy.
-      layer_collection: LayerCollection instance describing model architecture.
-    """
-    x = np.asarray([[1.], [2.]]).astype(np.float32)
-    y = np.asarray([1., 2.]).astype(np.float32)
-    x, y = (tf.data.Dataset.from_tensor_slices((x, y))
-            .repeat(100).batch(2).make_one_shot_iterator().get_next())
-    w = tf.get_variable("w", shape=[1, 1], initializer=tf.zeros_initializer())
-    y_hat = tf.matmul(x, w)
-    loss = tf.reduce_mean(0.5 * tf.square(y_hat - y))
-    accuracy = loss
-
-    layer_collection = lc.LayerCollection()
-    layer_collection.register_fully_connected(params=w, inputs=x, outputs=y_hat)
-    layer_collection.register_normal_predictive_distribution(y_hat)
-
-    return loss, accuracy, layer_collection
-
-  def testMinimizeLossSingleMachine(self):
-    with tf.Graph().as_default():
-      loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_single_machine(
-          loss, accuracy, layer_collection, device="/cpu:0")
-      self.assertLess(accuracy_, 2.0)
-
-  def testMinimizeLossDistributed(self):
-    with tf.Graph().as_default():
-      loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.distributed_grads_only_and_ops_chief_worker(
-          task_id=0,
-          is_chief=True,
-          num_worker_tasks=1,
-          num_ps_tasks=0,
-          master="",
-          checkpoint_dir=None,
-          loss=loss,
-          accuracy=accuracy,
-          layer_collection=layer_collection)
-      self.assertLess(accuracy_, 2.0)
-
-  def testTrainMnistSingleMachine(self):
-    with tf.Graph().as_default():
-      # Ensure model training doesn't crash.
-      #
-      # Ideally, we should check that accuracy increases as the model converges,
-      # but there are too few parameters for the model to effectively memorize
-      # the training set the way an MLP can.
-      convnet.train_mnist_single_machine(
-          data_dir=None, num_epochs=1, use_fake_data=True, device="/cpu:0")
-
-  def testTrainMnistMultitower(self):
-    with tf.Graph().as_default():
-      # Ensure model training doesn't crash.
-      convnet.train_mnist_multitower(
-          data_dir=None, num_epochs=1, num_towers=2, use_fake_data=True)
-
-  def testTrainMnistDistributed(self):
-    with tf.Graph().as_default():
-      # Ensure model training doesn't crash.
-      convnet.train_mnist_distributed_sync_replicas(
-          task_id=0,
-          is_chief=True,
-          num_worker_tasks=1,
-          num_ps_tasks=0,
-          master="",
-          data_dir=None,
-          num_epochs=2,
-          op_strategy="chief_worker",
-          use_fake_data=True)
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tensorflow/contrib/kfac/examples/tests/mlp_test.py b/tensorflow/contrib/kfac/examples/tests/mlp_test.py
deleted file mode 100644
index 22da6c29f1..0000000000
--- a/tensorflow/contrib/kfac/examples/tests/mlp_test.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for mlp.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.contrib.kfac.examples import mlp
-
-
-class MlpTest(tf.test.TestCase):
-
-  def testFcLayer(self):
-    with tf.Graph().as_default():
-      pre, act, (w, b) = mlp.fc_layer(
-          layer_id=1, inputs=tf.zeros([5, 3]), output_size=10)
-      self.assertShapeEqual(np.zeros([5, 10]), pre)
-      self.assertShapeEqual(np.zeros([5, 10]), act)
-      self.assertShapeEqual(np.zeros([3, 10]), tf.convert_to_tensor(w))
-      self.assertShapeEqual(np.zeros([10]), tf.convert_to_tensor(b))
-      self.assertIsInstance(w, tf.Variable)
-      self.assertIsInstance(b, tf.Variable)
-      self.assertIn("fc_1/", w.op.name)
-      self.assertIn("fc_1/", b.op.name)
-
-  def testTrainMnist(self):
-    with tf.Graph().as_default():
-      # Ensure model training doesn't crash.
-      #
-      # Ideally, we should check that accuracy increases as the model converges,
-      # but that takes a non-trivial amount of compute.
-      mlp.train_mnist(data_dir=None, num_epochs=1, use_fake_data=True)
-
-  def testTrainMnistMultitower(self):
-    with tf.Graph().as_default():
-      # Ensure model training doesn't crash.
-      mlp.train_mnist_multitower(
-          data_dir=None, num_epochs=1, num_towers=2, use_fake_data=True)
-
-  def testTrainMnistEstimator(self):
-    with tf.Graph().as_default():
-      # Ensure model training doesn't crash.
-      mlp.train_mnist_estimator(data_dir=None, num_epochs=1, use_fake_data=True)
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tensorflow/contrib/kfac/examples/tests/mnist_test.py b/tensorflow/contrib/kfac/examples/tests/mnist_test.py
deleted file mode 100644
index 92f8462357..0000000000
--- a/tensorflow/contrib/kfac/examples/tests/mnist_test.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for mnist.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from tensorflow.contrib.kfac.examples import mnist
-
-
-class MnistTest(tf.test.TestCase):
-
-  def testValues(self):
-    """Ensure values are in their expected range."""
-    with tf.Graph().as_default():
-      examples, labels = mnist.load_mnist(
-          data_dir=None, num_epochs=1, batch_size=64, use_fake_data=True)
-
-      with self.test_session() as sess:
-        examples_, labels_ = sess.run([examples, labels])
-        self.assertTrue(np.all((0 <= examples_) & (examples_ < 1)))
-        self.assertTrue(np.all((0 <= labels_) & (labels_ < 10)))
-
-  def testFlattenedShapes(self):
-    """Ensure images are flattened into their appropriate shape."""
-    with tf.Graph().as_default():
-      examples, labels = mnist.load_mnist(
-          data_dir=None,
-          num_epochs=1,
-          batch_size=64,
-          flatten_images=True,
-          use_fake_data=True)
-
-      with self.test_session() as sess:
-        examples_, labels_ = sess.run([examples, labels])
-        self.assertEqual(examples_.shape, (64, 784))
-        self.assertEqual(labels_.shape, (64,))
-
-  def testNotFlattenedShapes(self):
-    """Ensure non-flattened images are their appropriate shape."""
-    with tf.Graph().as_default():
-      examples, labels = mnist.load_mnist(
-          data_dir=None,
-          num_epochs=1,
-          batch_size=64,
-          flatten_images=False,
-          use_fake_data=True)
-
-      with self.test_session() as sess:
-        examples_, labels_ = sess.run([examples, labels])
-        self.assertEqual(examples_.shape, (64, 28, 28, 1))
-        self.assertEqual(labels_.shape, (64,))
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/contrib/kfac/g3doc/autoencoder.png b/tensorflow/contrib/kfac/g3doc/autoencoder.png
deleted file mode 100644
index 20f93c7703..0000000000
--- a/tensorflow/contrib/kfac/g3doc/autoencoder.png
+++ /dev/null
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
deleted file mode 100644
index 6e4a8d71ba..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ /dev/null
@@ -1,160 +0,0 @@
-package(default_visibility = ["//visibility:private"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-py_test(
-    name = "estimator_test",
-    srcs = ["estimator_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:fisher_estimator",
-        "//tensorflow/contrib/kfac/python/ops:layer_collection",
-        "//tensorflow/contrib/kfac/python/ops:utils",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "fisher_factors_test",
-    srcs = ["fisher_factors_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:fisher_blocks",
-        "//tensorflow/contrib/kfac/python/ops:fisher_factors",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "fisher_blocks_test",
-    srcs = ["fisher_blocks_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:fisher_blocks",
-        "//tensorflow/contrib/kfac/python/ops:layer_collection",
-        "//tensorflow/contrib/kfac/python/ops:linear_operator",
-        "//tensorflow/contrib/kfac/python/ops:utils",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "layer_collection_test",
-    srcs = ["layer_collection_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:fisher_blocks",
-        "//tensorflow/contrib/kfac/python/ops:fisher_factors",
-        "//tensorflow/contrib/kfac/python/ops:layer_collection",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:variable_scope",
-    ],
-)
-
-py_test(
-    name = "optimizer_test",
-    srcs = ["optimizer_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:fisher_factors",
-        "//tensorflow/contrib/kfac/python/ops:kfac_optimizer",
-        "//tensorflow/contrib/kfac/python/ops:layer_collection",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "utils_test",
-    srcs = ["utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:utils",
-        "//tensorflow/contrib/tpu",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "op_queue_test",
-    srcs = ["op_queue_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:op_queue",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-    ],
-)
-
-py_test(
-    name = "loss_functions_test",
-    srcs = ["loss_functions_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/kfac/python/ops:loss_functions",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
deleted file mode 100644
index 0e65d419a3..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.kfac.estimator."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.kfac.python.ops import estimator
-from tensorflow.contrib.kfac.python.ops import layer_collection as lc
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-from tensorflow.python.training import training_util
-
-_ALL_ESTIMATION_MODES = ["gradients", "empirical", "curvature_prop", "exact"]
-
-
-class EstimatorTest(test.TestCase):
-
-  def setUp(self):
-    self._graph = ops.Graph()
-    with self._graph.as_default():
-      self.layer_collection = lc.LayerCollection()
-
-      self.inputs = random_ops.random_normal((2, 2), dtype=dtypes.float32)
-      self.weights = variable_scope.get_variable(
-          "w", shape=(2, 2), dtype=dtypes.float32)
-      self.bias = variable_scope.get_variable(
-          "b", initializer=init_ops.zeros_initializer(), shape=(2, 1))
-      self.output = math_ops.matmul(self.inputs, self.weights) + self.bias
-
-      # Only register the weights.
-      self.layer_collection.register_fully_connected(
-          params=(self.weights,), inputs=self.inputs, outputs=self.output)
-
-      self.outputs = math_ops.tanh(self.output)
-      self.targets = array_ops.zeros_like(self.outputs)
-      self.layer_collection.register_categorical_predictive_distribution(
-          logits=self.outputs, targets=self.targets)
-
-  def testEstimatorInitManualRegistration(self):
-    with self._graph.as_default():
-      # We should be able to build an estimator for only the registered vars.
-      estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          cov_ema_decay=0.1,
-          damping=0.2,
-          layer_collection=self.layer_collection
-      )
-
-      # Check that we throw an error if we try to build an estimator for vars
-      # that were not manually registered.
-      with self.assertRaises(ValueError):
-        est = estimator.FisherEstimatorRoundRobin(
-            variables=[self.weights, self.bias],
-            cov_ema_decay=0.1,
-            damping=0.2,
-            layer_collection=self.layer_collection
-        )
-        est.make_vars_and_create_op_thunks()
-
-      # Check that we throw an error if we don't include registered variables,
-      # i.e. self.weights
-      with self.assertRaises(ValueError):
-        est = estimator.FisherEstimatorRoundRobin(
-            variables=[],
-            cov_ema_decay=0.1,
-            damping=0.2,
-            layer_collection=self.layer_collection)
-        est.make_vars_and_create_op_thunks()
-
-  @test.mock.patch.object(utils.SubGraph, "variable_uses", return_value=42)
-  def testVariableWrongNumberOfUses(self, mock_uses):
-    with self.assertRaises(ValueError):
-      est = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          cov_ema_decay=0.1,
-          damping=0.2,
-          layer_collection=self.layer_collection)
-      est.make_vars_and_create_op_thunks()
-
-  def testInvalidEstimationMode(self):
-    with self.assertRaises(ValueError):
-      est = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          cov_ema_decay=0.1,
-          damping=0.2,
-          layer_collection=self.layer_collection,
-          estimation_mode="not_a_real_mode")
-      est.make_vars_and_create_op_thunks()
-
-  def testGradientsModeBuild(self):
-    with self._graph.as_default():
-      est = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          cov_ema_decay=0.1,
-          damping=0.2,
-          layer_collection=self.layer_collection,
-          estimation_mode="gradients")
-      est.make_vars_and_create_op_thunks()
-
-  def testEmpiricalModeBuild(self):
-    with self._graph.as_default():
-      est = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          cov_ema_decay=0.1,
-          damping=0.2,
-          layer_collection=self.layer_collection,
-          estimation_mode="empirical")
-      est.make_vars_and_create_op_thunks()
-
-  def testCurvaturePropModeBuild(self):
-    with self._graph.as_default():
-      est = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          cov_ema_decay=0.1,
-          damping=0.2,
-          layer_collection=self.layer_collection,
-          estimation_mode="curvature_prop")
-      est.make_vars_and_create_op_thunks()
-
-  def testExactModeBuild(self):
-    with self._graph.as_default():
-      est = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          cov_ema_decay=0.1,
-          damping=0.2,
-          layer_collection=self.layer_collection,
-          estimation_mode="exact")
-      est.make_vars_and_create_op_thunks()
-
-  def test_cov_update_thunks(self):
-    """Ensures covariance update ops run once per global_step."""
-    with self._graph.as_default(), self.test_session() as sess:
-      fisher_estimator = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          layer_collection=self.layer_collection,
-          damping=0.2,
-          cov_ema_decay=0.0)
-
-      # Construct an op that executes one covariance update per step.
-      global_step = training_util.get_or_create_global_step()
-      (cov_variable_thunks, cov_update_op_thunks, _,
-       _) = fisher_estimator.create_ops_and_vars_thunks()
-      for thunk in cov_variable_thunks:
-        thunk()
-      cov_matrices = [
-          fisher_factor.get_cov()
-          for fisher_factor in self.layer_collection.get_factors()
-      ]
-      cov_update_op = control_flow_ops.case(
-          [(math_ops.equal(global_step, i), thunk)
-           for i, thunk in enumerate(cov_update_op_thunks)])
-      increment_global_step = global_step.assign_add(1)
-
-      sess.run(variables.global_variables_initializer())
-      initial_cov_values = sess.run(cov_matrices)
-
-      # Ensure there's one update per covariance matrix.
-      self.assertEqual(len(cov_matrices), len(cov_update_op_thunks))
-
-      # Test is no-op if only 1 covariance matrix.
-      assert len(cov_matrices) > 1
-
-      for i in range(len(cov_matrices)):
-        # Compare new and old covariance values
-        new_cov_values = sess.run(cov_matrices)
-        is_cov_equal = [
-            np.allclose(initial_cov_value, new_cov_value)
-            for (initial_cov_value,
-                 new_cov_value) in zip(initial_cov_values, new_cov_values)
-        ]
-        num_cov_equal = sum(is_cov_equal)
-
-        # Ensure exactly one covariance matrix changes per step.
-        self.assertEqual(num_cov_equal, len(cov_matrices) - i)
-
-        # Run all covariance update ops.
-        sess.run(cov_update_op)
-        sess.run(increment_global_step)
-
-  def test_round_robin_placement(self):
-    """Check if the ops and variables are placed on devices correctly."""
-    with self._graph.as_default():
-      fisher_estimator = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          layer_collection=self.layer_collection,
-          damping=0.2,
-          cov_ema_decay=0.0,
-          cov_devices=["/cpu:{}".format(i) for i in range(2)],
-          inv_devices=["/cpu:{}".format(i) for i in range(2)])
-
-      # Construct an op that executes one covariance update per step.
-      (cov_update_thunks,
-       inv_update_thunks) = fisher_estimator.make_vars_and_create_op_thunks(
-           scope="test")
-      cov_update_ops = tuple(thunk() for thunk in cov_update_thunks)
-      inv_update_ops = tuple(thunk() for thunk in inv_update_thunks)
-      self.assertEqual(cov_update_ops[0].device, "/device:CPU:0")
-      self.assertEqual(cov_update_ops[1].device, "/device:CPU:1")
-      self.assertEqual(inv_update_ops[0].device, "/device:CPU:0")
-      self.assertEqual(inv_update_ops[1].device, "/device:CPU:1")
-      cov_matrices = [
-          fisher_factor.get_cov()
-          for fisher_factor in self.layer_collection.get_factors()
-      ]
-      inv_matrices = [
-          matrix
-          for fisher_factor in self.layer_collection.get_factors()
-          for matrix in fisher_factor._matpower_by_exp_and_damping.values()
-      ]
-      self.assertEqual(cov_matrices[0].device, "/device:CPU:0")
-      self.assertEqual(cov_matrices[1].device, "/device:CPU:1")
-      # Inverse matrices need to be explicitly placed.
-      self.assertEqual(inv_matrices[0].device, "")
-      self.assertEqual(inv_matrices[1].device, "")
-
-  def test_inv_update_thunks(self):
-    """Ensures inverse update ops run once per global_step."""
-    with self._graph.as_default(), self.test_session() as sess:
-      fisher_estimator = estimator.FisherEstimatorRoundRobin(
-          variables=[self.weights],
-          layer_collection=self.layer_collection,
-          damping=0.2,
-          cov_ema_decay=0.0)
-
-      # Construct op that updates one inverse per global step.
-      global_step = training_util.get_or_create_global_step()
-      (cov_variable_thunks, _, inv_variable_thunks,
-       inv_update_op_thunks) = fisher_estimator.create_ops_and_vars_thunks()
-      for thunk in cov_variable_thunks:
-        thunk()
-      for thunk in inv_variable_thunks:
-        thunk()
-      inv_matrices = [
-          matrix
-          for fisher_factor in self.layer_collection.get_factors()
-          for matrix in fisher_factor._matpower_by_exp_and_damping.values()
-      ]
-      inv_update_op = control_flow_ops.case(
-          [(math_ops.equal(global_step, i), thunk)
-           for i, thunk in enumerate(inv_update_op_thunks)])
-      increment_global_step = global_step.assign_add(1)
-
-      sess.run(variables.global_variables_initializer())
-      initial_inv_values = sess.run(inv_matrices)
-
-      # Ensure there's one update per inverse matrix. This is true as long as
-      # there's no fan-in/fan-out or parameter re-use.
-      self.assertEqual(len(inv_matrices), len(inv_update_op_thunks))
-
-      # Test is no-op if only 1 invariance matrix.
-      assert len(inv_matrices) > 1
-
-      # Assign each covariance matrix a value other than the identity. This
-      # ensures that the inverse matrices are updated to something different as
-      # well.
-      cov_matrices = [
-          fisher_factor.get_cov()
-          for fisher_factor in self.layer_collection.get_factors()
-      ]
-      sess.run([
-          cov_matrix.assign(2 * linalg_ops.eye(int(cov_matrix.shape[0])))
-          for cov_matrix in cov_matrices
-      ])
-
-      for i in range(len(inv_matrices)):
-        # Compare new and old inverse values
-        new_inv_values = sess.run(inv_matrices)
-        is_inv_equal = [
-            np.allclose(initial_inv_value, new_inv_value)
-            for (initial_inv_value,
-                 new_inv_value) in zip(initial_inv_values, new_inv_values)
-        ]
-        num_inv_equal = sum(is_inv_equal)
-
-        # Ensure exactly one inverse matrix changes per step.
-        self.assertEqual(num_inv_equal, len(inv_matrices) - i)
-
-        # Run all inverse update ops.
-        sess.run(inv_update_op)
-        sess.run(increment_global_step)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
deleted file mode 100644
index 86ec7a095a..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
+++ /dev/null
@@ -1,1018 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.kfac.fisher_blocks."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb
-from tensorflow.contrib.kfac.python.ops import fisher_factors as ff
-from tensorflow.contrib.kfac.python.ops import layer_collection as lc
-from tensorflow.contrib.kfac.python.ops import linear_operator as lo
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.platform import test
-
-
-# We need to set these constants since the numerical values used in the tests
-# were chosen when these used to be the defaults.
-ff.set_global_constants(init_covariances_at_zero=False,
-                        zero_debias=False,
-                        init_inverses_at_zero=False)
-
-# TODO(b/78538100): As far as I can tell, all the tests that say "Make sure our
-# inverse is something other than the identity" are actually broken. They never
-# run the covariance update ops and so the inverse actually is the identity
-# (possible plus the damping term, which would still make it a multiple of the
-# identity).
-
-
-def _make_psd(dim):
-  """Constructs a PSD matrix of the given dimension."""
-  mat = np.ones((dim, dim), dtype=np.float32)
-  mat[np.arange(dim), np.arange(dim)] = 2. + np.arange(dim)
-  return array_ops.constant(mat)
-
-
-class UtilsTest(test.TestCase):
-
-  def testComputePiTracenorm(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      diag = ops.convert_to_tensor([1., 2., 0., 1.])
-      left_factor = lo.LinearOperatorDiag(diag)
-      right_factor = lo.LinearOperatorFullMatrix(array_ops.ones([2, 2]))
-
-      # pi is the sqrt of the left trace norm divided by the right trace norm
-      pi = fb.compute_pi_tracenorm(left_factor, right_factor)
-
-      pi_val = sess.run(pi)
-      self.assertEqual(1., pi_val)
-
-
-class FullFBTest(test.TestCase):
-
-  def testFullFBInitSingleTensor(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-
-      self.assertAllEqual(params, block.tensors_to_compute_grads())
-
-  def testFullFBInitTensorTuple(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-
-      self.assertAllEqual(params, block.tensors_to_compute_grads())
-
-  def testInstantiateFactors(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-
-      grads = (params[0]**2, math_ops.sqrt(params[1]))
-      block.instantiate_factors(grads, 0.5)
-
-  def testMultiplyInverseTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-      grads = (params[0]**2, math_ops.sqrt(params[1]))
-      block.instantiate_factors((grads,), 0.5)
-      block._factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._factor.instantiate_inv_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._factor.make_inverse_update_ops())
-
-      vector = array_ops.ones(3,) * 2
-      output = block.multiply_inverse(vector)
-
-      self.assertAllClose(sess.run(vector * 2 / 3.), sess.run(output))
-
-  def testMultiplyInverseNotTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = array_ops.constant([[1.], [2.]])
-      block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-      grads = params**2
-      block.instantiate_factors((grads,), 0.5)
-      block._factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._factor.instantiate_inv_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._factor.make_inverse_update_ops())
-
-      vector = array_ops.ones(2,) * 2
-      output = block.multiply_inverse(vector)
-
-      self.assertAllClose(sess.run(vector * 2 / 3.), sess.run(output))
-
-  def testMultiplyInverseAgainstExplicit(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-      grads = (array_ops.constant([2., 3.]), array_ops.constant(4.))
-      damping = 0.5
-      block.instantiate_factors((grads,), damping)
-      block._factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._factor.instantiate_inv_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(state_ops.assign(block._factor._cov, _make_psd(3)))
-      sess.run(block._factor.make_inverse_update_ops())
-
-      v_flat = np.array([4., 5., 6.], dtype=np.float32)
-      vector = utils.column_to_tensors(params, array_ops.constant(v_flat))
-      output = block.multiply_inverse(vector)
-      output_flat = sess.run(utils.tensors_to_column(output)).ravel()
-
-      full = sess.run(block.full_fisher_block())
-      explicit = np.dot(np.linalg.inv(full + damping * np.eye(3)), v_flat)
-
-      self.assertAllClose(output_flat, explicit)
-
-
-class NaiveDiagonalFBTest(test.TestCase):
-
-  def testNaiveDiagonalFBInitSingleTensor(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-
-      self.assertAllEqual(params, block.tensors_to_compute_grads())
-
-  def testNaiveDiagonalFBInitTensorTuple(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-
-      self.assertAllEqual(params, block.tensors_to_compute_grads())
-
-  def testInstantiateFactors(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-
-      grads = (params[0]**2, math_ops.sqrt(params[1]))
-      block.instantiate_factors(grads, 0.5)
-
-  def testMultiplyInverseTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-      grads = (params[0]**2, math_ops.sqrt(params[1]))
-      block.instantiate_factors((grads,), 0.5)
-      block._factor.instantiate_cov_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._factor.make_inverse_update_ops())
-
-      vector = array_ops.ones(3,) * 2
-      output = block.multiply_inverse(vector)
-
-      self.assertAllClose(sess.run(vector * 2 / 3.), sess.run(output))
-
-  def testMultiplyInverseNotTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = array_ops.constant([[1.], [2.]])
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-      grads = params**2
-      block.instantiate_factors((grads,), 0.5)
-      block._factor.instantiate_cov_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._factor.make_inverse_update_ops())
-      vector = array_ops.ones(2,) * 2
-      output = block.multiply_inverse(vector)
-
-      self.assertAllClose(sess.run(vector * 2 / 3.), sess.run(output))
-
-  def testMultiplyInverseAgainstExplicit(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
-      block.register_additional_tower(32)
-      grads = (params[0]**2, math_ops.sqrt(params[1]))
-      damping = 0.5
-      block.instantiate_factors((grads,), damping)
-      block._factor.instantiate_cov_variables()
-
-      cov = array_ops.reshape(array_ops.constant([2., 3., 4.]), [-1, 1])
-      sess.run(state_ops.assign(block._factor._cov, cov))
-      sess.run(block._factor.make_inverse_update_ops())
-
-      v_flat = np.array([4., 5., 6.], dtype=np.float32)
-      vector = utils.column_to_tensors(params, array_ops.constant(v_flat))
-      output = block.multiply_inverse(vector)
-      output_flat = sess.run(utils.tensors_to_column(output)).ravel()
-
-      full = sess.run(block.full_fisher_block())
-      explicit = np.dot(np.linalg.inv(full + damping * np.eye(3)), v_flat)
-      self.assertAllClose(output_flat, explicit)
-
-
-class FullyConnectedDiagonalFBTest(test.TestCase):
-
-  def setUp(self):
-    super(FullyConnectedDiagonalFBTest, self).setUp()
-
-    self.batch_size = 4
-    self.input_size = 6
-    self.output_size = 3
-
-    self.inputs = np.random.randn(self.batch_size, self.input_size).astype(
-        np.float32)
-    self.outputs = np.zeros([self.batch_size, self.output_size]).astype(
-        np.float32)
-    self.output_grads = np.random.randn(self.batch_size,
-                                        self.output_size).astype(np.float32)
-    self.w = np.random.randn(self.input_size, self.output_size).astype(
-        np.float32)
-    self.b = np.random.randn(self.output_size).astype(np.float32)
-
-  def fisherApprox(self, has_bias=False):
-    """Fisher approximation using default inputs."""
-    if has_bias:
-      inputs = np.concatenate(
-          [self.inputs, np.ones([self.batch_size, 1])], axis=1)
-    else:
-      inputs = self.inputs
-    return self.buildDiagonalFisherApproximation(inputs, self.output_grads)
-
-  def buildDiagonalFisherApproximation(self, inputs, output_grads):
-    """Builds explicit diagonal Fisher approximation.
-
-    Fisher's diagonal is (d loss / d w)'s elements squared for
-      d/dw = E[outer(input, output_grad)]
-
-    where the expectation is taken over examples.
-
-    Args:
-      inputs: np.array of shape [batch_size, input_size].
-      output_grads: np.array of shape [batch_size, output_size].
-
-    Returns:
-      Diagonal np.array of shape [num_params, num_params] for num_params =
-      input_size * output_size.
-    """
-    batch_size = inputs.shape[0]
-    assert output_grads.shape[0] == batch_size
-    input_size = inputs.shape[1]
-    output_size = output_grads.shape[1]
-    fisher_diag = np.zeros((input_size, output_size))
-    for i in range(batch_size):
-      fisher_diag += np.square(np.outer(inputs[i], output_grads[i]))
-    return np.diag(fisher_diag.flatten()) / batch_size
-
-  def testMultiply(self):
-    result, _ = self.runFisherBlockOps(self.w, [self.inputs], [self.outputs],
-                                       [self.output_grads])
-
-    # Construct Fisher-vector product.
-    expected_result = self.fisherApprox().dot(self.w.flatten())
-    expected_result = expected_result.reshape(
-        [self.input_size, self.output_size])
-
-    self.assertAllClose(expected_result, result)
-
-  def testMultiplyInverse(self):
-    _, result = self.runFisherBlockOps(self.w, [self.inputs], [self.outputs],
-                                       [self.output_grads])
-
-    # Construct inverse Fisher-vector product.
-    expected_result = np.linalg.inv(self.fisherApprox()).dot(self.w.flatten())
-    expected_result = expected_result.reshape(
-        [self.input_size, self.output_size])
-
-    self.assertAllClose(expected_result, result)
-
-  def testRegisterAdditionalTower(self):
-    """Ensure 1 big tower and 2 small towers are equivalent."""
-    multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
-        self.w, [self.inputs], [self.outputs], [self.output_grads])
-    multiply_result_small, multiply_inverse_result_small = (
-        self.runFisherBlockOps(self.w, np.split(self.inputs, 2),
-                               np.split(self.outputs, 2),
-                               np.split(self.output_grads, 2)))
-
-    self.assertAllClose(multiply_result_big, multiply_result_small)
-    self.assertAllClose(multiply_inverse_result_big,
-                        multiply_inverse_result_small)
-
-  def testMultiplyHasBias(self):
-    result, _ = self.runFisherBlockOps((self.w, self.b), [self.inputs],
-                                       [self.outputs], [self.output_grads])
-    expected_result = self.fisherApprox(True).dot(
-        np.concatenate([self.w.flatten(), self.b.flatten()]))
-    expected_result = expected_result.reshape(
-        [self.input_size + 1, self.output_size])
-    expected_result = (expected_result[:-1], expected_result[-1])
-
-    self.assertEqual(len(result), 2)
-    self.assertAllClose(expected_result[0], result[0])
-    self.assertAllClose(expected_result[1], result[1])
-
-  def runFisherBlockOps(self, params, inputs, outputs, output_grads):
-    """Run Ops guaranteed by FisherBlock interface.
-
-    Args:
-      params: Tensor or 2-tuple of Tensors. Represents weights or weights and
-        bias of this layer.
-      inputs: list of Tensors of shape [batch_size, input_size]. Inputs to
-        layer.
-      outputs: list of Tensors of shape [batch_size, output_size].
-        Preactivations produced by layer.
-      output_grads: list of Tensors of shape [batch_size, output_size].
-        Gradient of loss with respect to 'outputs'.
-
-    Returns:
-      multiply_result: Result of FisherBlock.multiply(params)
-      multiply_inverse_result: Result of FisherBlock.multiply_inverse(params)
-    """
-    with ops.Graph().as_default(), self.test_session() as sess:
-      inputs = as_tensors(inputs)
-      outputs = as_tensors(outputs)
-      output_grads = as_tensors(output_grads)
-      params = as_tensors(params)
-
-      block = fb.FullyConnectedDiagonalFB(
-          lc.LayerCollection(), has_bias=isinstance(params, (tuple, list)))
-      for (i, o) in zip(inputs, outputs):
-        block.register_additional_tower(i, o)
-
-      block.instantiate_factors((output_grads,), damping=0.0)
-      block._factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._factor.make_covariance_update_op(0.0))
-      multiply_result = sess.run(block.multiply(params))
-      multiply_inverse_result = sess.run(block.multiply_inverse(params))
-
-    return multiply_result, multiply_inverse_result
-
-
-class EmbeddingKFACFBTest(test.TestCase):
-
-  def testInstantiateFactors(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-
-      # Create a Fisher Block.
-      vocab_size = 5
-      block = fb.EmbeddingKFACFB(lc.LayerCollection(), vocab_size)
-
-      # Add some examples.
-      inputs = array_ops.constant([[0, 1], [1, 2], [2, 3]])
-      outputs = array_ops.constant([[0.], [1.], [2.]])
-      block.register_additional_tower(inputs, outputs)
-
-      # Instantiate factor's variables. Ensure it doesn't fail.
-      grads = outputs**2.
-      damping = array_ops.constant(0.)
-      block.instantiate_factors(((grads,),), damping)
-
-  def testMultiplyInverse(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-
-      # Create a Fisher Block.
-      vocab_size = 5
-      block = fb.EmbeddingKFACFB(lc.LayerCollection(), vocab_size)
-
-      # Add some examples.
-      inputs = array_ops.constant([[0, 1], [1, 2], [2, 3]])
-      outputs = array_ops.constant([[0.], [1.], [2.]])
-      block.register_additional_tower(inputs, outputs)
-
-      # Instantiate factor's variables. Ensure it doesn't fail.
-      grads = outputs**2.
-      damping = array_ops.constant(0.)
-      block.instantiate_factors(((grads,),), damping)
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      # Create a sparse update.
-      indices = array_ops.constant([1, 3, 4])
-      values = array_ops.constant([[1.], [1.], [1.]])
-      sparse_vector = ops.IndexedSlices(
-          values, indices, dense_shape=[vocab_size, 1])
-      dense_vector = array_ops.reshape([0., 1., 0., 1., 1.], [vocab_size, 1])
-
-      # Compare Fisher-vector product against explicit result.
-      result = block.multiply_inverse(sparse_vector)
-      expected_result = linalg_ops.matrix_solve(block.full_fisher_block(),
-                                                dense_vector)
-
-      sess.run(tf_variables.global_variables_initializer())
-      self.assertAlmostEqual(
-          sess.run(expected_result[1]), sess.run(result.values[0]))
-      self.assertAlmostEqual(
-          sess.run(expected_result[3]), sess.run(result.values[1]))
-      self.assertAlmostEqual(
-          sess.run(expected_result[4]), sess.run(result.values[2]))
-
-
-class FullyConnectedKFACBasicFBTest(test.TestCase):
-
-  def testFullyConnectedKFACBasicFBInit(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      inputs = array_ops.constant([1., 2.])
-      outputs = array_ops.constant([3., 4.])
-      block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection())
-      block.register_additional_tower(inputs, outputs)
-
-      self.assertAllEqual([outputs], block.tensors_to_compute_grads())
-
-  def testInstantiateFactorsHasBias(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      inputs = array_ops.constant([[1., 2.], [3., 4.]])
-      outputs = array_ops.constant([[3., 4.], [5., 6.]])
-      block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=True)
-      block.register_additional_tower(inputs, outputs)
-
-      grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
-
-  def testInstantiateFactorsNoBias(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      inputs = array_ops.constant([[1., 2.], [3., 4.]])
-      outputs = array_ops.constant([[3., 4.], [5., 6.]])
-      block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_tower(inputs, outputs)
-
-      grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
-
-  def testMultiplyInverseTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      inputs = array_ops.constant([[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]])
-      outputs = array_ops.constant([[3., 4.], [5., 6.]])
-      block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_tower(inputs, outputs)
-      grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
-
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._input_factor.make_inverse_update_ops())
-      sess.run(block._output_factor.make_inverse_update_ops())
-
-      vector = (
-          np.arange(2, 6).reshape(2, 2).astype(np.float32),  #
-          np.arange(1, 3).reshape(2, 1).astype(np.float32))
-      output = block.multiply_inverse((array_ops.constant(vector[0]),
-                                       array_ops.constant(vector[1])))
-
-      output = sess.run(output)
-      self.assertAllClose([[0.686291, 1.029437], [1.372583, 1.715729]],
-                          output[0])
-      self.assertAllClose([0.343146, 0.686291], output[1])
-
-  def testMultiplyInverseNotTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      inputs = array_ops.constant([[1., 2.], [3., 4.]])
-      outputs = array_ops.constant([[3., 4.], [5., 6.]])
-      block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_tower(inputs, outputs)
-      grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._input_factor.make_inverse_update_ops())
-      sess.run(block._output_factor.make_inverse_update_ops())
-
-      vector = np.arange(2, 6).reshape(2, 2).astype(np.float32)
-      output = block.multiply_inverse(array_ops.constant(vector))
-
-      self.assertAllClose([[0.686291, 1.029437], [1.372583, 1.715729]],
-                          sess.run(output))
-
-  def testMultiplyInverseAgainstExplicit(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      input_dim, output_dim = 3, 2
-      inputs = array_ops.zeros([32, input_dim])
-      outputs = array_ops.zeros([32, output_dim])
-      params = array_ops.zeros([input_dim, output_dim])
-      block = fb.FullyConnectedKFACBasicFB(lc.LayerCollection(), has_bias=False)
-      block.register_additional_tower(inputs, outputs)
-      grads = outputs**2
-      damping = 0.  # This test is only valid without damping.
-      block.instantiate_factors(((grads,),), damping)
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-
-      sess.run(state_ops.assign(block._input_factor._cov, _make_psd(3)))
-      sess.run(state_ops.assign(block._output_factor._cov, _make_psd(2)))
-
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      sess.run(block._input_factor.make_inverse_update_ops())
-      sess.run(block._output_factor.make_inverse_update_ops())
-
-      v_flat = np.arange(6, dtype=np.float32)
-      vector = utils.column_to_tensors(params, array_ops.constant(v_flat))
-      output = block.multiply_inverse(vector)
-      output_flat = sess.run(utils.tensors_to_column(output)).ravel()
-
-      full = sess.run(block.full_fisher_block())
-      explicit = np.dot(np.linalg.inv(full + damping * np.eye(6)), v_flat)
-
-      self.assertAllClose(output_flat, explicit)
-
-
-class ConvDiagonalFBTest(test.TestCase):
-
-  def setUp(self):
-    super(ConvDiagonalFBTest, self).setUp()
-
-    self.batch_size = 2
-    self.height = 8
-    self.width = 4
-    self.input_channels = 6
-    self.output_channels = 3
-    self.kernel_size = 1
-
-    self.inputs = np.random.randn(self.batch_size, self.height, self.width,
-                                  self.input_channels).astype(np.float32)
-    self.outputs = np.zeros(
-        [self.batch_size, self.height, self.width,
-         self.output_channels]).astype(np.float32)
-    self.output_grads = np.random.randn(
-        self.batch_size, self.height, self.width, self.output_channels).astype(
-            np.float32)
-    self.w = np.random.randn(self.kernel_size, self.kernel_size,
-                             self.input_channels, self.output_channels).astype(
-                                 np.float32)
-    self.b = np.random.randn(self.output_channels).astype(np.float32)
-
-  def fisherApprox(self, has_bias=False):
-    """Fisher approximation using default inputs."""
-    if has_bias:
-      inputs = np.concatenate(
-          [self.inputs,
-           np.ones([self.batch_size, self.height, self.width, 1])],
-          axis=-1)
-    else:
-      inputs = self.inputs
-    return self.buildDiagonalFisherApproximation(inputs, self.output_grads,
-                                                 self.kernel_size)
-
-  def buildDiagonalFisherApproximation(self, inputs, output_grads, kernel_size):
-    r"""Builds explicit diagonal Fisher approximation.
-
-    Fisher's diagonal is (d loss / d w)'s elements squared for
-      d/dw = E[\sum_{loc} outer(input_{loc}, output_grad_{loc})]
-
-    where the expectation is taken over examples and the sum over (x, y)
-    locations upon which the convolution is applied.
-
-    Args:
-      inputs: np.array of shape [batch_size, height, width, input_channels].
-      output_grads: np.array of shape [batch_size, height, width,
-        output_channels].
-      kernel_size: int. height and width of kernel.
-
-    Returns:
-      Diagonal np.array of shape [num_params, num_params] for num_params =
-      kernel_size^2 * input_channels * output_channels.
-    """
-    batch_size, height, width, input_channels = inputs.shape
-    assert output_grads.shape[0] == batch_size
-    assert output_grads.shape[1] == height
-    assert output_grads.shape[2] == width
-    output_channels = output_grads.shape[3]
-
-    # If kernel_size == 1, then we don't need to worry about capturing context
-    # around the pixel upon which a convolution is applied. This makes testing
-    # easier.
-    assert kernel_size == 1, "kernel_size != 1 isn't supported."
-    num_locations = height * width
-    inputs = np.reshape(inputs, [batch_size, num_locations, input_channels])
-    output_grads = np.reshape(output_grads,
-                              [batch_size, num_locations, output_channels])
-
-    fisher_diag = np.zeros((input_channels, output_channels))
-    for i in range(batch_size):
-      # Each example's approximation is a square(sum-of-outer-products).
-      example_fisher_diag = np.zeros((input_channels, output_channels))
-      for j in range(num_locations):
-        example_fisher_diag += np.outer(inputs[i, j], output_grads[i, j])
-      fisher_diag += np.square(example_fisher_diag)
-
-    # Normalize by batch_size (not num_locations).
-    return np.diag(fisher_diag.flatten()) / batch_size
-
-  def testMultiply(self):
-    result, _ = self.runFisherBlockOps(self.w, [self.inputs], [self.outputs],
-                                       [self.output_grads])
-
-    # Construct Fisher-vector product.
-    expected_result = self.fisherApprox().dot(self.w.flatten())
-    expected_result = expected_result.reshape([
-        self.kernel_size, self.kernel_size, self.input_channels,
-        self.output_channels
-    ])
-
-    self.assertAllClose(expected_result, result)
-
-  def testMultiplyInverse(self):
-    _, result = self.runFisherBlockOps(self.w, [self.inputs], [self.outputs],
-                                       [self.output_grads])
-
-    # Construct inverse Fisher-vector product.
-    expected_result = np.linalg.inv(self.fisherApprox()).dot(self.w.flatten())
-    expected_result = expected_result.reshape([
-        self.kernel_size, self.kernel_size, self.input_channels,
-        self.output_channels
-    ])
-
-    self.assertAllClose(expected_result, result, atol=1e-3)
-
-  def testRegisterAdditionalTower(self):
-    """Ensure 1 big tower and 2 small towers are equivalent."""
-    multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
-        self.w, [self.inputs], [self.outputs], [self.output_grads])
-    multiply_result_small, multiply_inverse_result_small = (
-        self.runFisherBlockOps(self.w, np.split(self.inputs, 2),
-                               np.split(self.outputs, 2),
-                               np.split(self.output_grads, 2)))
-
-    self.assertAllClose(multiply_result_big, multiply_result_small)
-    self.assertAllClose(multiply_inverse_result_big,
-                        multiply_inverse_result_small)
-
-  def testMultiplyHasBias(self):
-    result, _ = self.runFisherBlockOps((self.w, self.b), [self.inputs],
-                                       [self.outputs], [self.output_grads])
-    # Clone 'b' along 'input_channels' dimension.
-    b_filter = np.tile(
-        np.reshape(self.b, [1, 1, 1, self.output_channels]),
-        [self.kernel_size, self.kernel_size, 1, 1])
-    params = np.concatenate([self.w, b_filter], axis=2)
-    expected_result = self.fisherApprox(True).dot(params.flatten())
-
-    # Extract 'b' from concatenated parameters.
-    expected_result = expected_result.reshape([
-        self.kernel_size, self.kernel_size, self.input_channels + 1,
-        self.output_channels
-    ])
-    expected_result = (expected_result[:, :, 0:-1, :],
-                       np.reshape(expected_result[:, :, -1, :],
-                                  [self.output_channels]))
-
-    self.assertEqual(len(result), 2)
-    self.assertAllClose(expected_result[0], result[0])
-    self.assertAllClose(expected_result[1], result[1])
-
-  def runFisherBlockOps(self, params, inputs, outputs, output_grads):
-    """Run Ops guaranteed by FisherBlock interface.
-
-    Args:
-      params: Tensor or 2-tuple of Tensors. Represents weights or weights and
-        bias of this layer.
-      inputs: list of Tensors of shape [batch_size, input_size]. Inputs to
-        layer.
-      outputs: list of Tensors of shape [batch_size, output_size].
-        Preactivations produced by layer.
-      output_grads: list of Tensors of shape [batch_size, output_size].
-        Gradient of loss with respect to 'outputs'.
-
-    Returns:
-      multiply_result: Result of FisherBlock.multiply(params)
-      multiply_inverse_result: Result of FisherBlock.multiply_inverse(params)
-    """
-    with ops.Graph().as_default(), self.test_session() as sess:
-      inputs = as_tensors(inputs)
-      outputs = as_tensors(outputs)
-      output_grads = as_tensors(output_grads)
-      params = as_tensors(params)
-
-      block = fb.ConvDiagonalFB(
-          lc.LayerCollection(), params, strides=[1, 1, 1, 1], padding='SAME')
-      for (i, o) in zip(inputs, outputs):
-        block.register_additional_tower(i, o)
-
-      block.instantiate_factors((output_grads,), damping=0.0)
-      block._factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._factor.make_covariance_update_op(0.0))
-      multiply_result = sess.run(block.multiply(params))
-      multiply_inverse_result = sess.run(block.multiply_inverse(params))
-
-    return multiply_result, multiply_inverse_result
-
-
-class DepthwiseConvKFCBasicFBTest(test.TestCase):
-
-  def testInstantiateFactors(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      params = random_ops.random_normal((3, 3, 8, 2))
-      inputs = random_ops.random_normal((32, 5, 5, 8))
-      outputs = random_ops.random_normal((32, 5, 5, 16))
-      layer_collection = lc.LayerCollection()
-      block = fb.DepthwiseConvKFCBasicFB(
-          layer_collection, params=params, strides=[1, 1, 1, 1], padding='SAME')
-      block.register_additional_tower(inputs, outputs)
-      grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
-
-  def testMultiplyInverse(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = random_ops.random_normal((3, 3, 8, 2))
-      inputs = random_ops.random_normal((32, 5, 5, 8))
-      outputs = random_ops.random_normal((32, 5, 5, 16))
-      layer_collection = lc.LayerCollection()
-      block = fb.DepthwiseConvKFCBasicFB(
-          layer_collection, params=params, strides=[1, 1, 1, 1], padding='SAME')
-      block.register_additional_tower(inputs, outputs)
-      grads = outputs**2
-      block.instantiate_factors(([grads],), 0.5)
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      # Ensure inverse update op doesn't crash.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run([
-          factor.make_inverse_update_ops()
-          for factor in layer_collection.get_factors()
-      ])
-
-      # Ensure inverse-vector multiply doesn't crash.
-      output = block.multiply_inverse(params)
-      sess.run(output)
-
-      # Ensure same shape.
-      self.assertAllEqual(output.shape, params.shape)
-
-
-class ConvKFCBasicFBTest(test.TestCase):
-
-  def _testConvKFCBasicFBInitParams(self, params):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      if isinstance(params, (list, tuple)):
-        params = [array_ops.constant(param) for param in params]
-      else:
-        params = array_ops.constant(params)
-      inputs = random_ops.random_normal((2, 2, 2))
-      outputs = random_ops.random_normal((2, 2, 2))
-      block = fb.ConvKFCBasicFB(
-          lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_tower(inputs, outputs)
-
-      self.assertAllEqual([outputs], block.tensors_to_compute_grads())
-
-  def testConvKFCBasicFBInitParamsParamsTuple(self):
-    self._testConvKFCBasicFBInitParams([np.ones([1, 2, 2]), np.ones([2])])
-
-  def testConvKFCBasicFBInitParamsParamsSingle(self):
-    self._testConvKFCBasicFBInitParams([np.ones([1, 2, 2])])
-
-  def testMultiplyInverseTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = random_ops.random_normal((2, 2, 2, 2))
-      inputs = random_ops.random_normal((2, 2, 2, 2))
-      outputs = random_ops.random_normal((2, 2, 2, 2))
-      block = fb.ConvKFCBasicFB(
-          lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_tower(inputs, outputs)
-      grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._input_factor.make_inverse_update_ops())
-      sess.run(block._output_factor.make_inverse_update_ops())
-
-      vector = (np.arange(1, 15).reshape(7, 2).astype(np.float32),
-                np.arange(2, 4).reshape(2, 1).astype(np.float32))
-      output = block.multiply_inverse((array_ops.constant(vector[0]),
-                                       array_ops.constant(vector[1])))
-
-      output = sess.run(output)
-      self.assertAllClose([0.136455, 0.27291], output[0][0])
-      self.assertAllClose([0.27291, 0.409365], output[1])
-
-  def testMultiplyInverseNotTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = random_ops.random_normal((2, 2, 2, 2))
-      inputs = random_ops.random_normal((2, 2, 2, 2))
-      outputs = random_ops.random_normal((2, 2, 2, 2))
-      block = fb.ConvKFCBasicFB(
-          lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_tower(inputs, outputs)
-      self.assertFalse(block._has_bias)
-      grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._input_factor.make_inverse_update_ops())
-      sess.run(block._output_factor.make_inverse_update_ops())
-
-      vector = np.arange(1, 17).reshape(8, 2).astype(np.float32)
-      output = block.multiply_inverse(array_ops.constant(vector))
-
-      self.assertAllClose([0.136455, 0.27291], sess.run(output)[0])
-
-  def testMultiplyInverseNotTupleWithBias(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = [random_ops.random_normal((2, 2, 2, 2))]
-      inputs = random_ops.random_normal((2, 2, 2, 2))
-      outputs = random_ops.random_normal((2, 2, 2, 2))
-      block = fb.ConvKFCBasicFB(
-          lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_tower(inputs, outputs)
-      self.assertTrue(block._has_bias)
-      grads = outputs**2
-      block.instantiate_factors(((grads,),), 0.5)
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      # Make sure our inverse is something other than the identity.
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(block._input_factor.make_inverse_update_ops())
-      sess.run(block._output_factor.make_inverse_update_ops())
-
-      vector = np.arange(1, 19).reshape(9, 2).astype(np.float32)
-      output = block.multiply_inverse(array_ops.constant(vector))
-
-      self.assertAllClose([0.136455, 0.27291], sess.run(output)[0])
-
-  def testMultiplyInverseAgainstExplicit(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      params = array_ops.zeros((2, 2, 2, 2))
-      inputs = array_ops.zeros((2, 2, 2, 2))
-      outputs = array_ops.zeros((2, 2, 2, 2))
-      block = fb.ConvKFCBasicFB(
-          lc.LayerCollection(), params=params, padding='SAME')
-      block.register_additional_tower(inputs, outputs)
-      grads = outputs**2
-      damping = 0.  # This test is only valid without damping.
-      block.instantiate_factors(((grads,),), damping)
-      block._input_factor.instantiate_cov_variables()
-      block._output_factor.instantiate_cov_variables()
-      block.register_inverse()
-      block._input_factor.instantiate_inv_variables()
-      block._output_factor.instantiate_inv_variables()
-
-      sess.run(state_ops.assign(block._input_factor._cov, _make_psd(8)))
-      sess.run(state_ops.assign(block._output_factor._cov, _make_psd(2)))
-      sess.run(block._input_factor.make_inverse_update_ops())
-      sess.run(block._output_factor.make_inverse_update_ops())
-
-      v_flat = np.arange(16, dtype=np.float32)
-      vector = utils.column_to_tensors(params, array_ops.constant(v_flat))
-      output = block.multiply_inverse(vector)
-      output_flat = sess.run(utils.tensors_to_column(output)).ravel()
-
-      full = sess.run(block.full_fisher_block())
-      explicit = np.dot(np.linalg.inv(full + damping * np.eye(16)), v_flat)
-
-      self.assertAllClose(output_flat, explicit)
-
-
-class FullyConnectedSeriesFBTest(test.TestCase):
-
-  def testFullyConnectedSeriesFBInit(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      inputs = array_ops.constant([1., 2.])
-      outputs = array_ops.constant([3., 4.])
-      block = fb.FullyConnectedSeriesFB(lc.LayerCollection())
-      block.register_additional_tower([inputs], [outputs])
-      self.assertAllEqual([[outputs]], block.tensors_to_compute_grads())
-
-  def testInstantiateFactorsHasBias(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      inputs = array_ops.constant([[1., 2.], [3., 4.]])
-      outputs = array_ops.constant([[3., 4.], [5., 6.]])
-      block = fb.FullyConnectedSeriesFB(
-          lc.LayerCollection(),
-          has_bias=True)
-      block.register_additional_tower([inputs], [outputs])
-      grads = outputs**2
-      block.instantiate_factors((((grads,),),), 0.5)
-
-  def testInstantiateFactorsNoBias(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      inputs = array_ops.constant([[1., 2.], [3., 4.]])
-      outputs = array_ops.constant([[3., 4.], [5., 6.]])
-      block = fb.FullyConnectedSeriesFB(
-          lc.LayerCollection(),
-          has_bias=False)
-      block.register_additional_tower([inputs], [outputs])
-      grads = outputs**2
-      block.instantiate_factors((((grads,),),), 0.5)
-
-
-def as_tensors(tensor_or_tuple):
-  """Converts a potentially nested tuple of np.array to Tensors."""
-  if isinstance(tensor_or_tuple, (tuple, list)):
-    return tuple(as_tensors(t) for t in tensor_or_tuple)
-  return ops.convert_to_tensor(tensor_or_tuple)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
deleted file mode 100644
index fad47cd02f..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
+++ /dev/null
@@ -1,955 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.kfac.fisher_factors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import numpy.random as npr
-
-from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb
-from tensorflow.contrib.kfac.python.ops import fisher_factors as ff
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops as tf_ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.platform import test
-
-
-# We need to set these constants since the numerical values used in the tests
-# were chosen when these used to be the defaults.
-ff.set_global_constants(init_covariances_at_zero=False,
-                        zero_debias=False,
-                        init_inverses_at_zero=False)
-
-
-def make_damping_func(damping):
-  return fb._package_func(lambda: damping, damping)
-
-
-class FisherFactorTestingDummy(ff.FisherFactor):
-  """Dummy class to test the non-abstract methods on ff.FisherFactor."""
-
-  @property
-  def _var_scope(self):
-    return 'dummy/a_b_c'
-
-  @property
-  def _cov_shape(self):
-    raise NotImplementedError
-
-  @property
-  def _num_sources(self):
-    return 1
-
-  @property
-  def _dtype(self):
-    return dtypes.float32
-
-  def _compute_new_cov(self):
-    raise NotImplementedError
-
-  def instantiate_covariance(self):
-    pass
-
-  def make_inverse_update_ops(self):
-    return []
-
-  def get_cov(self):
-    return NotImplementedError
-
-  def instantiate_inv_variables(self):
-    return NotImplementedError
-
-  def _num_towers(self):
-    raise NotImplementedError
-
-  def _get_data_device(self):
-    raise NotImplementedError
-
-  def register_matpower(self, exp, damping_func):
-    raise NotImplementedError
-
-  def register_cholesky(self, damping_func):
-    raise NotImplementedError
-
-  def register_cholesky_inverse(self, damping_func):
-    raise NotImplementedError
-
-  def get_matpower(self, exp, damping_func):
-    raise NotImplementedError
-
-  def get_cholesky(self, damping_func):
-    raise NotImplementedError
-
-  def get_cholesky_inverse(self, damping_func):
-    raise NotImplementedError
-
-  def get_cov_as_linear_operator(self):
-    raise NotImplementedError
-
-
-class DenseSquareMatrixFactorTestingDummy(ff.DenseSquareMatrixFactor):
-  """Dummy class to test the non-abstract methods on ff.DenseSquareMatrixFactor.
-  """
-
-  def __init__(self, shape):
-    self._shape = shape
-    super(DenseSquareMatrixFactorTestingDummy, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return 'dummy/a_b_c'
-
-  @property
-  def _cov_shape(self):
-    return self._shape
-
-  @property
-  def _num_sources(self):
-    return 1
-
-  @property
-  def _dtype(self):
-    return dtypes.float32
-
-  def _compute_new_cov(self):
-    raise NotImplementedError
-
-  def instantiate_covariance(self):
-    pass
-
-  def _num_towers(self):
-    raise NotImplementedError
-
-  def _get_data_device(self):
-    raise NotImplementedError
-
-
-class NumericalUtilsTest(test.TestCase):
-
-  def testComputeCovAgainstNumpy(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      npr.seed(0)
-      random_seed.set_random_seed(200)
-
-      x = npr.randn(100, 3)
-      cov = ff.compute_cov(array_ops.constant(x))
-      np_cov = np.dot(x.T, x) / x.shape[0]
-
-      self.assertAllClose(sess.run(cov), np_cov)
-
-  def testComputeCovAgainstNumpyWithAlternativeNormalizer(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      npr.seed(0)
-      random_seed.set_random_seed(200)
-
-      normalizer = 10.
-      x = npr.randn(100, 3)
-      cov = ff.compute_cov(array_ops.constant(x), normalizer=normalizer)
-      np_cov = np.dot(x.T, x) / normalizer
-
-      self.assertAllClose(sess.run(cov), np_cov)
-
-  def testAppendHomog(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      npr.seed(0)
-
-      m, n = 3, 4
-      a = npr.randn(m, n)
-      a_homog = ff.append_homog(array_ops.constant(a))
-      np_result = np.hstack([a, np.ones((m, 1))])
-
-      self.assertAllClose(sess.run(a_homog), np_result)
-
-
-class NameStringUtilFunctionTest(test.TestCase):
-
-  def _make_tensor(self):
-    x = array_ops.placeholder(dtypes.float64, (3, 1))
-    w = array_ops.constant(npr.RandomState(0).randn(3, 3))
-    y = math_ops.matmul(w, x)
-    g = gradients_impl.gradients(y, x)[0]
-    return g
-
-  def testScopeStringFromParamsSingleTensor(self):
-    with tf_ops.Graph().as_default():
-      g = self._make_tensor()
-      scope_string = ff.scope_string_from_params(g)
-      self.assertEqual('gradients_MatMul_grad_MatMul_1', scope_string)
-
-  def testScopeStringFromParamsMultipleTensors(self):
-    with tf_ops.Graph().as_default():
-      x = array_ops.constant(1,)
-      y = array_ops.constant(2,)
-      scope_string = ff.scope_string_from_params((x, y))
-      self.assertEqual('Const_Const_1', scope_string)
-
-  def testScopeStringFromParamsMultipleTypes(self):
-    with tf_ops.Graph().as_default():
-      x = array_ops.constant(1,)
-      y = array_ops.constant(2,)
-      scope_string = ff.scope_string_from_params([[1, 2, 3], 'foo', True, 4,
-                                                  (x, y)])
-      self.assertEqual('1-2-3_foo_True_4_Const__Const_1', scope_string)
-
-  def testScopeStringFromParamsUnsupportedType(self):
-    with tf_ops.Graph().as_default():
-      x = array_ops.constant(1,)
-      y = array_ops.constant(2,)
-      unsupported = 1.2  # Floats are not supported.
-      with self.assertRaises(ValueError):
-        ff.scope_string_from_params([[1, 2, 3], 'foo', True, 4, (x, y),
-                                     unsupported])
-
-  def testScopeStringFromName(self):
-    with tf_ops.Graph().as_default():
-      g = self._make_tensor()
-      scope_string = ff.scope_string_from_name(g)
-      self.assertEqual('gradients_MatMul_grad_MatMul_1', scope_string)
-
-  def testScalarOrTensorToString(self):
-    with tf_ops.Graph().as_default():
-      self.assertEqual(ff.scalar_or_tensor_to_string(5.), repr(5.))
-
-      g = self._make_tensor()
-      scope_string = ff.scope_string_from_name(g)
-      self.assertEqual(ff.scalar_or_tensor_to_string(g), scope_string)
-
-
-class FisherFactorTest(test.TestCase):
-
-  def testMakeInverseUpdateOps(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      factor = FisherFactorTestingDummy()
-
-      self.assertEqual(0, len(factor.make_inverse_update_ops()))
-
-
-class DenseSquareMatrixFactorTest(test.TestCase):
-
-  def testRegisterDampedInverse(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      shape = [2, 2]
-      factor = DenseSquareMatrixFactorTestingDummy(shape)
-      factor_var_scope = 'dummy/a_b_c'
-
-      damping_funcs = [make_damping_func(0.1),
-                       make_damping_func(0.1),
-                       make_damping_func(1e-5),
-                       make_damping_func(1e-5)]
-      for damping_func in damping_funcs:
-        factor.register_inverse(damping_func)
-
-      factor.instantiate_inv_variables()
-
-      inv = factor.get_inverse(damping_funcs[0]).to_dense()
-      self.assertEqual(inv, factor.get_inverse(damping_funcs[1]).to_dense())
-      self.assertNotEqual(inv, factor.get_inverse(damping_funcs[2]).to_dense())
-      self.assertEqual(factor.get_inverse(damping_funcs[2]).to_dense(),
-                       factor.get_inverse(damping_funcs[3]).to_dense())
-      factor_vars = tf_ops.get_collection(tf_ops.GraphKeys.GLOBAL_VARIABLES,
-                                          factor_var_scope)
-      factor_tensors = (tf_ops.convert_to_tensor(var) for var in factor_vars)
-
-      self.assertEqual(set([inv,
-                            factor.get_inverse(damping_funcs[2]).to_dense()]),
-                       set(factor_tensors))
-      self.assertEqual(shape, inv.get_shape())
-
-  def testRegisterMatpower(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      shape = [3, 3]
-      factor = DenseSquareMatrixFactorTestingDummy(shape)
-      factor_var_scope = 'dummy/a_b_c'
-
-      # TODO(b/74201126): Change to using the same func for both once
-      # Topohash is in place.
-      damping_func_1 = make_damping_func(0.5)
-      damping_func_2 = make_damping_func(0.5)
-
-      factor.register_matpower(-0.5, damping_func_1)
-      factor.register_matpower(2, damping_func_2)
-
-      factor.instantiate_inv_variables()
-
-      factor_vars = tf_ops.get_collection(tf_ops.GraphKeys.GLOBAL_VARIABLES,
-                                          factor_var_scope)
-
-      factor_tensors = (tf_ops.convert_to_tensor(var) for var in factor_vars)
-
-      matpower1 = factor.get_matpower(-0.5, damping_func_1).to_dense()
-      matpower2 = factor.get_matpower(2, damping_func_2).to_dense()
-
-      self.assertEqual(set([matpower1, matpower2]), set(factor_tensors))
-
-      self.assertEqual(shape, matpower1.get_shape())
-      self.assertEqual(shape, matpower2.get_shape())
-
-  def testMakeInverseUpdateOps(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      factor = FisherFactorTestingDummy()
-
-      self.assertEqual(0, len(factor.make_inverse_update_ops()))
-
-  def testMakeInverseUpdateOpsManyInversesEigenDecomp(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      cov = np.array([[1., 2.], [3., 4.]])
-      factor = DenseSquareMatrixFactorTestingDummy(cov.shape)
-      factor._cov = array_ops.constant(cov, dtype=dtypes.float32)
-
-      damping_funcs = []
-      for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
-        damping_funcs.append(make_damping_func(1./i))
-
-      for i in range(ff.EIGENVALUE_DECOMPOSITION_THRESHOLD):
-        factor.register_inverse(damping_funcs[i])
-
-      factor.instantiate_inv_variables()
-      ops = factor.make_inverse_update_ops()
-      self.assertEqual(1, len(ops))
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_invs = []
-      sess.run(ops)
-      for i in range(ff.EIGENVALUE_DECOMPOSITION_THRESHOLD):
-        # The inverse op will assign the damped inverse of cov to the inv var.
-        new_invs.append(
-            sess.run(factor.get_inverse(damping_funcs[i]).to_dense()))
-
-      # We want to see that the new invs are all different from each other.
-      for i in range(len(new_invs)):
-        for j in range(i + 1, len(new_invs)):
-          # Just check the first element.
-          self.assertNotEqual(new_invs[i][0][0], new_invs[j][0][0])
-
-  def testMakeInverseUpdateOpsMatPowerEigenDecomp(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      cov = np.array([[6., 2.], [2., 4.]])
-      factor = DenseSquareMatrixFactorTestingDummy(cov.shape)
-      factor._cov = array_ops.constant(cov, dtype=dtypes.float32)
-      exp = 2  # NOTE(mattjj): must be int to test with np.linalg.matrix_power
-      damping = 0.5
-      damping_func = make_damping_func(damping)
-
-      factor.register_matpower(exp, damping_func)
-      factor.instantiate_inv_variables()
-      ops = factor.make_inverse_update_ops()
-      self.assertEqual(1, len(ops))
-
-      sess.run(tf_variables.global_variables_initializer())
-      sess.run(ops[0])
-      matpower = sess.run(factor.get_matpower(exp, damping_func).to_dense())
-      matpower_np = np.linalg.matrix_power(cov + np.eye(2) * damping, exp)
-      self.assertAllClose(matpower, matpower_np)
-
-  def testMakeInverseUpdateOpsNoEigenDecomp(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      cov = np.array([[5., 2.], [2., 4.]])  # NOTE(mattjj): must be symmetric
-      factor = DenseSquareMatrixFactorTestingDummy(cov.shape)
-      factor._cov = array_ops.constant(cov, dtype=dtypes.float32)
-
-      damping_func = make_damping_func(0)
-
-      factor.register_inverse(damping_func)
-      factor.instantiate_inv_variables()
-      ops = factor.make_inverse_update_ops()
-      self.assertEqual(1, len(ops))
-
-      sess.run(tf_variables.global_variables_initializer())
-      # The inverse op will assign the damped inverse of cov to the inv var.
-      old_inv = sess.run(factor.get_inverse(damping_func).to_dense())
-      self.assertAllClose(
-          sess.run(ff.inverse_initializer(cov.shape, dtypes.float32)), old_inv)
-
-      sess.run(ops)
-      new_inv = sess.run(factor.get_inverse(damping_func).to_dense())
-      self.assertAllClose(new_inv, np.linalg.inv(cov))
-
-
-class FullFactorTest(test.TestCase):
-
-  def testFullFactorInit(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
-      factor = ff.FullFactor((tensor,), 32)
-      factor.instantiate_cov_variables()
-      self.assertEqual([6, 6], factor.get_cov().get_shape().as_list())
-
-  def testFullFactorInitFloat64(self):
-    with tf_ops.Graph().as_default():
-      dtype = dtypes.float64_ref
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
-      factor = ff.FullFactor((tensor,), 32)
-      factor.instantiate_cov_variables()
-      cov = factor.get_cov()
-      self.assertEqual(cov.dtype, dtype)
-      self.assertEqual([6, 6], cov.get_shape().as_list())
-
-  def testMakeCovarianceUpdateOp(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      tensor = array_ops.constant([1., 2.], name='a/b/c')
-      factor = ff.FullFactor((tensor,), 2)
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[0.75, 0.5], [0.5, 1.5]], new_cov)
-
-
-class NaiveDiagonalFactorTest(test.TestCase):
-
-  def testNaiveDiagonalFactorInit(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
-      factor = ff.NaiveDiagonalFactor((tensor,), 32)
-      factor.instantiate_cov_variables()
-      self.assertEqual([6, 1], factor.get_cov().get_shape().as_list())
-
-  def testNaiveDiagonalFactorInitFloat64(self):
-    with tf_ops.Graph().as_default():
-      dtype = dtypes.float64_ref
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
-      factor = ff.NaiveDiagonalFactor((tensor,), 32)
-      factor.instantiate_cov_variables()
-      cov = factor.get_cov()
-      self.assertEqual(cov.dtype, dtype)
-      self.assertEqual([6, 1], cov.get_shape().as_list())
-
-  def testMakeCovarianceUpdateOp(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      tensor = array_ops.constant([1., 2.], name='a/b/c')
-      factor = ff.NaiveDiagonalFactor((tensor,), 2)
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[0.75], [1.5]], new_cov)
-
-
-class EmbeddingInputKroneckerFactorTest(test.TestCase):
-
-  def testInitialization(self):
-    with tf_ops.Graph().as_default():
-      input_ids = array_ops.constant([[0], [1], [4]])
-      vocab_size = 5
-      factor = ff.EmbeddingInputKroneckerFactor((input_ids,), vocab_size)
-      factor.instantiate_cov_variables()
-      cov = factor.get_cov()
-      self.assertEqual(cov.shape.as_list(), [vocab_size])
-
-  def testCovarianceUpdateOp(self):
-    with tf_ops.Graph().as_default():
-      input_ids = array_ops.constant([[0], [1], [4]])
-      vocab_size = 5
-      factor = ff.EmbeddingInputKroneckerFactor((input_ids,), vocab_size)
-      factor.instantiate_cov_variables()
-      cov_update_op = factor.make_covariance_update_op(0.0)
-
-      with self.test_session() as sess:
-        sess.run(tf_variables.global_variables_initializer())
-        new_cov = sess.run(cov_update_op)
-        self.assertAllClose(np.array([1., 1., 0., 0., 1.]) / 3., new_cov)
-
-
-class ConvDiagonalFactorTest(test.TestCase):
-
-  def setUp(self):
-    self.batch_size = 10
-    self.height = self.width = 32
-    self.in_channels = 3
-    self.out_channels = 1
-    self.kernel_height = self.kernel_width = 3
-    self.strides = [1, 2, 2, 1]
-    self.data_format = 'NHWC'
-    self.padding = 'SAME'
-    self.kernel_shape = [
-        self.kernel_height, self.kernel_width, self.in_channels,
-        self.out_channels
-    ]
-
-  def testInit(self):
-    with tf_ops.Graph().as_default():
-      inputs = random_ops.random_uniform(
-          [self.batch_size, self.height, self.width, self.in_channels])
-      outputs_grads = [
-          random_ops.random_uniform([
-              self.batch_size, self.height // self.strides[1],
-              self.width // self.strides[2], self.out_channels
-          ]) for _ in range(3)
-      ]
-
-      factor = ff.ConvDiagonalFactor(
-          (inputs,),
-          (outputs_grads,),
-          self.kernel_shape,
-          self.strides,
-          self.padding,
-          data_format=self.data_format)
-      factor.instantiate_cov_variables()
-
-      # Ensure covariance matrix's shape makes sense.
-      self.assertEqual([
-          self.kernel_height * self.kernel_width * self.in_channels,
-          self.out_channels
-      ],
-                       factor.get_cov().shape.as_list())
-
-  def testMakeCovarianceUpdateOp(self):
-    with tf_ops.Graph().as_default():
-      # Construct all arguments such that convolution kernel is applied in
-      # exactly one spatial location.
-      inputs = np.random.randn(
-          1,  # batch_size
-          self.kernel_height,
-          self.kernel_width,
-          self.in_channels)  # in_channels
-      outputs_grad = np.random.randn(
-          1,  # batch_size
-          1,  # output_height
-          1,  # output_width
-          self.out_channels)
-
-      factor = ff.ConvDiagonalFactor(
-          (constant_op.constant(inputs),),
-          ((constant_op.constant(outputs_grad),),),
-          self.kernel_shape,
-          strides=[1, 1, 1, 1],
-          padding='VALID')
-      factor.instantiate_cov_variables()
-
-      # Completely forget initial value on first update.
-      cov_update_op = factor.make_covariance_update_op(0.0)
-
-      # Ensure new covariance value is same as outer-product of inputs/outputs
-      # vectorized, squared.
-      with self.test_session() as sess:
-        sess.run(tf_variables.global_variables_initializer())
-        cov = sess.run(cov_update_op)
-        expected_cov = np.outer(inputs.flatten(), outputs_grad.flatten())**2
-        self.assertAllClose(expected_cov, cov)
-
-  def testHasBias(self):
-    with tf_ops.Graph().as_default():
-      inputs = random_ops.random_uniform(
-          [self.batch_size, self.height, self.width, self.in_channels])
-      outputs_grads = [
-          random_ops.random_uniform([
-              self.batch_size, self.height // self.strides[1],
-              self.width // self.strides[2], self.out_channels
-          ]) for _ in range(3)
-      ]
-
-      factor = ff.ConvDiagonalFactor(
-          (inputs,),
-          (outputs_grads,),
-          self.kernel_shape,
-          self.strides,
-          self.padding,
-          data_format=self.data_format,
-          has_bias=True)
-      factor.instantiate_cov_variables()
-
-      # Ensure shape accounts for bias.
-      self.assertEqual([
-          self.kernel_height * self.kernel_width * self.in_channels + 1,
-          self.out_channels
-      ],
-                       factor.get_cov().shape.as_list())
-
-      # Ensure update op doesn't crash.
-      cov_update_op = factor.make_covariance_update_op(0.0)
-      with self.test_session() as sess:
-        sess.run(tf_variables.global_variables_initializer())
-        sess.run(cov_update_op)
-
-
-class FullyConnectedKroneckerFactorTest(test.TestCase):
-
-  def _testFullyConnectedKroneckerFactorInit(self,
-                                             has_bias,
-                                             final_shape,
-                                             dtype=dtypes.float32_ref):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor(((tensor,),), has_bias=has_bias)
-      factor.instantiate_cov_variables()
-      cov = factor.get_cov()
-      self.assertEqual(cov.dtype, dtype)
-      self.assertEqual(final_shape, cov.get_shape().as_list())
-
-  def testFullyConnectedKroneckerFactorInitNoBias(self):
-    for dtype in (dtypes.float32_ref, dtypes.float64_ref):
-      self._testFullyConnectedKroneckerFactorInit(False, [3, 3], dtype=dtype)
-
-  def testFullyConnectedKroneckerFactorInitWithBias(self):
-    for dtype in (dtypes.float32_ref, dtypes.float64_ref):
-      self._testFullyConnectedKroneckerFactorInit(True, [4, 4], dtype=dtype)
-
-  def testMakeCovarianceUpdateOpWithBias(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor(((tensor,),), has_bias=True)
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[3, 3.5, 1], [3.5, 5.5, 1.5], [1, 1.5, 1]], new_cov)
-
-  def testMakeCovarianceUpdateOpNoBias(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      factor = ff.FullyConnectedKroneckerFactor(((tensor,),))
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[3, 3.5], [3.5, 5.5]], new_cov)
-
-
-class ConvFactorTestCase(test.TestCase):
-
-  def assertMatrixRank(self, rank, matrix, atol=1e-5):
-    assert rank <= matrix.shape[0], 'Rank cannot be larger than matrix size.'
-    eigvals = np.linalg.eigvals(matrix)
-    nnz_eigvals = np.sum(eigvals > atol)
-    self.assertEqual(
-        rank,
-        nnz_eigvals,
-        msg=('Found %d of %d expected non-zero eigenvalues: %s.' %
-             (nnz_eigvals, rank, eigvals)))
-
-
-class ConvInputKroneckerFactorTest(ConvFactorTestCase):
-
-  def test3DConvolution(self):
-    with tf_ops.Graph().as_default():
-      batch_size = 1
-      width = 3
-      in_channels = 3**3
-      out_channels = 4
-
-      factor = ff.ConvInputKroneckerFactor(
-          inputs=(random_ops.random_uniform(
-              (batch_size, width, width, width, in_channels), seed=0),),
-          filter_shape=(width, width, width, in_channels, out_channels),
-          padding='SAME',
-          strides=(2, 2, 2),
-          extract_patches_fn='extract_convolution_patches',
-          has_bias=False)
-      factor.instantiate_cov_variables()
-
-      # Ensure shape of covariance matches input size of filter.
-      input_size = in_channels * (width**3)
-      self.assertEqual([input_size, input_size],
-                       factor.get_cov().shape.as_list())
-
-      # Ensure cov_update_op doesn't crash.
-      with self.test_session() as sess:
-        sess.run(tf_variables.global_variables_initializer())
-        sess.run(factor.make_covariance_update_op(0.0))
-        cov = sess.run(factor.get_cov())
-
-      # Cov should be rank-8, as the filter will be applied at each corner of
-      # the 4-D cube.
-      self.assertMatrixRank(8, cov)
-
-  def testPointwiseConv2d(self):
-    with tf_ops.Graph().as_default():
-      batch_size = 1
-      width = 3
-      in_channels = 3**2
-      out_channels = 4
-
-      factor = ff.ConvInputKroneckerFactor(
-          inputs=(random_ops.random_uniform(
-              (batch_size, width, width, in_channels), seed=0),),
-          filter_shape=(1, 1, in_channels, out_channels),
-          padding='SAME',
-          strides=(1, 1, 1, 1),
-          extract_patches_fn='extract_pointwise_conv2d_patches',
-          has_bias=False)
-      factor.instantiate_cov_variables()
-
-      # Ensure shape of covariance matches input size of filter.
-      self.assertEqual([in_channels, in_channels],
-                       factor.get_cov().shape.as_list())
-
-      # Ensure cov_update_op doesn't crash.
-      with self.test_session() as sess:
-        sess.run(tf_variables.global_variables_initializer())
-        sess.run(factor.make_covariance_update_op(0.0))
-        cov = sess.run(factor.get_cov())
-
-      # Cov should be rank-9, as the filter will be applied at each location.
-      self.assertMatrixRank(9, cov)
-
-  def testStrides(self):
-    with tf_ops.Graph().as_default():
-      batch_size = 1
-      width = 3
-      in_channels = 3**2
-      out_channels = 4
-
-      factor = ff.ConvInputKroneckerFactor(
-          inputs=(random_ops.random_uniform(
-              (batch_size, width, width, in_channels), seed=0),),
-          filter_shape=(1, 1, in_channels, out_channels),
-          padding='SAME',
-          strides=(1, 2, 1, 1),
-          extract_patches_fn='extract_image_patches',
-          has_bias=False)
-      factor.instantiate_cov_variables()
-
-      with self.test_session() as sess:
-        sess.run(tf_variables.global_variables_initializer())
-        sess.run(factor.make_covariance_update_op(0.0))
-        cov = sess.run(factor.get_cov())
-
-      # Cov should be the sum of 3 * 2 = 6 outer products.
-      self.assertMatrixRank(6, cov)
-
-  def testDilationRate(self):
-    with tf_ops.Graph().as_default():
-      batch_size = 1
-      width = 3
-      in_channels = 2
-      out_channels = 4
-
-      factor = ff.ConvInputKroneckerFactor(
-          inputs=(random_ops.random_uniform(
-              (batch_size, width, width, in_channels), seed=0),),
-          filter_shape=(3, 3, in_channels, out_channels),
-          padding='SAME',
-          extract_patches_fn='extract_image_patches',
-          strides=(1, 1, 1, 1),
-          dilation_rate=(1, width, width, 1),
-          has_bias=False)
-      factor.instantiate_cov_variables()
-
-      with self.test_session() as sess:
-        sess.run(tf_variables.global_variables_initializer())
-        sess.run(factor.make_covariance_update_op(0.0))
-        cov = sess.run(factor.get_cov())
-
-      # Cov should be rank = in_channels, as only the center of the filter
-      # receives non-zero input for each input channel.
-      self.assertMatrixRank(in_channels, cov)
-
-  def testConvInputKroneckerFactorInitNoBias(self):
-    with tf_ops.Graph().as_default():
-      tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c')
-      factor = ff.ConvInputKroneckerFactor(
-          inputs=(tensor,),
-          filter_shape=(1, 2, 3, 4),
-          padding='SAME',
-          has_bias=False)
-      factor.instantiate_cov_variables()
-      self.assertEqual([1 * 2 * 3, 1 * 2 * 3],
-                       factor.get_cov().get_shape().as_list())
-
-  def testConvInputKroneckerFactorInit(self):
-    with tf_ops.Graph().as_default():
-      tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c')
-      factor = ff.ConvInputKroneckerFactor(
-          (tensor,), filter_shape=(1, 2, 3, 4), padding='SAME', has_bias=True)
-      factor.instantiate_cov_variables()
-      self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
-                       factor.get_cov().get_shape().as_list())
-
-  def testConvInputKroneckerFactorInitFloat64(self):
-    with tf_ops.Graph().as_default():
-      dtype = dtypes.float64_ref
-      tensor = array_ops.ones((64, 1, 2, 3), name='a/b/c', dtype=dtypes.float64)
-      factor = ff.ConvInputKroneckerFactor(
-          (tensor,), filter_shape=(1, 2, 3, 4), padding='SAME', has_bias=True)
-      factor.instantiate_cov_variables()
-      cov = factor.get_cov()
-      self.assertEqual(cov.dtype, dtype)
-      self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
-                       cov.get_shape().as_list())
-
-  def testMakeCovarianceUpdateOpWithBias(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      input_shape = (2, 1, 1, 1)
-      tensor = array_ops.constant(
-          np.arange(1, 1 + np.prod(input_shape)).reshape(input_shape).astype(
-              np.float32))
-      factor = ff.ConvInputKroneckerFactor(
-          (tensor,), filter_shape=(1, 1, 1, 1), padding='SAME', has_bias=True)
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(0.))
-      self.assertAllClose(
-          [
-              [(1. + 4.) / 2., (1. + 2.) / 2.],  #
-              [(1. + 2.) / 2., (1. + 1.) / 2.]
-          ],  #
-          new_cov)
-
-  def testMakeCovarianceUpdateOpNoBias(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      input_shape = (2, 1, 1, 1)
-      tensor = array_ops.constant(
-          np.arange(1, 1 + np.prod(input_shape)).reshape(input_shape).astype(
-              np.float32))
-      factor = ff.ConvInputKroneckerFactor(
-          (tensor,), filter_shape=(1, 1, 1, 1), padding='SAME')
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(0.))
-      self.assertAllClose([[(1. + 4.) / 2.]], new_cov)
-
-  def testSubSample(self):
-    with tf_ops.Graph().as_default():
-      patches_1 = array_ops.constant(1, shape=(10, 2))
-      patches_2 = array_ops.constant(1, shape=(10, 8))
-      patches_3 = array_ops.constant(1, shape=(3, 3))
-      patches_1_sub = ff._subsample_for_cov_computation(patches_1)
-      patches_2_sub = ff._subsample_for_cov_computation(patches_2)
-      patches_3_sub = ff._subsample_for_cov_computation(patches_3)
-      patches_1_sub_batch_size = patches_1_sub.shape.as_list()[0]
-      patches_2_sub_batch_size = patches_2_sub.shape.as_list()[0]
-      patches_3_sub_batch_size = patches_3_sub.shape.as_list()[0]
-      self.assertEqual(2, patches_1_sub_batch_size)
-      self.assertEqual(8, patches_2_sub_batch_size)
-      self.assertEqual(3, patches_3_sub_batch_size)
-
-
-class ConvOutputKroneckerFactorTest(ConvFactorTestCase):
-
-  def test3DConvolution(self):
-    with tf_ops.Graph().as_default():
-      batch_size = 1
-      width = 3
-      out_channels = width**3
-
-      factor = ff.ConvOutputKroneckerFactor(outputs_grads=([
-          random_ops.random_uniform(
-              (batch_size, width, width, width, out_channels), seed=0)
-      ],))
-      factor.instantiate_cov_variables()
-
-      with self.test_session() as sess:
-        sess.run(tf_variables.global_variables_initializer())
-        sess.run(factor.make_covariance_update_op(0.0))
-        cov = sess.run(factor.get_cov())
-
-      # Cov should be rank 3^3, as each spatial position donates a rank-1
-      # update.
-      self.assertMatrixRank(width**3, cov)
-
-  def testConvOutputKroneckerFactorInit(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3, 4, 5), name='a/b/c')
-      factor = ff.ConvOutputKroneckerFactor(((tensor,),))
-      factor.instantiate_cov_variables()
-      self.assertEqual([5, 5], factor.get_cov().get_shape().as_list())
-
-  def testConvOutputKroneckerFactorInitFloat64(self):
-    with tf_ops.Graph().as_default():
-      dtype = dtypes.float64_ref
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3, 4, 5), dtype=dtype, name='a/b/c')
-      factor = ff.ConvOutputKroneckerFactor(((tensor,),))
-      factor.instantiate_cov_variables()
-      cov = factor.get_cov()
-      self.assertEqual(cov.dtype, dtype)
-      self.assertEqual([5, 5], cov.get_shape().as_list())
-
-  def testMakeCovarianceUpdateOp(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      tensor = np.arange(1, 17).reshape(2, 2, 2, 2).astype(np.float32)
-      factor = ff.ConvOutputKroneckerFactor(((array_ops.constant(tensor),),))
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[43, 46.5], [46.5, 51.5]], new_cov)
-
-
-class FullyConnectedMultiKFTest(test.TestCase):
-
-  def testFullyConnectedMultiKFInit(self):
-    with tf_ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
-      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=False)
-      factor.instantiate_cov_variables()
-      self.assertEqual([3, 3], factor.get_cov().get_shape().as_list())
-
-  def testFullyConnectedMultiKFInitFloat64(self):
-    with tf_ops.Graph().as_default():
-      dtype = dtypes.float64_ref
-      random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
-      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=False)
-      factor.instantiate_cov_variables()
-      cov = factor.get_cov()
-      self.assertEqual(cov.dtype, dtype)
-      self.assertEqual([3, 3], cov.get_shape().as_list())
-
-  def testMakeCovarianceUpdateOpWithBias(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      factor = ff.FullyConnectedMultiKF(((tensor,),), has_bias=True)
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[3, 3.5, 1], [3.5, 5.5, 1.5], [1, 1.5, 1]], new_cov)
-
-  def testMakeCovarianceUpdateOpNoBias(self):
-    with tf_ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
-      factor = ff.FullyConnectedMultiKF(((tensor,),))
-      factor.instantiate_cov_variables()
-
-      sess.run(tf_variables.global_variables_initializer())
-      new_cov = sess.run(factor.make_covariance_update_op(.5))
-      self.assertAllClose([[3, 3.5], [3.5, 5.5]], new_cov)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
deleted file mode 100644
index cb80fca370..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
+++ /dev/null
@@ -1,597 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.kfac.layer_collection."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.kfac.python.ops import fisher_blocks
-from tensorflow.contrib.kfac.python.ops import fisher_factors
-from tensorflow.contrib.kfac.python.ops import layer_collection
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import test
-
-
-class MockFisherBlock(object):
-  """A fake FisherBlock."""
-
-  num_registered_towers = 2
-
-  def __init__(self, name='MockFisherBlock'):
-    self.name = name
-
-  def __eq__(self, other):
-    return isinstance(other, MockFisherBlock) and other.name == self.name
-
-  def __hash__(self):
-    return hash(self.name)
-
-
-class LayerParametersDictTest(test.TestCase):
-
-  def testSetItem(self):
-    """Ensure insertion, contains, retrieval works for supported key types."""
-    with ops.Graph().as_default():
-      lp_dict = layer_collection.LayerParametersDict()
-
-      x = array_ops.constant(0)
-      y0 = array_ops.constant(0)
-      y1 = array_ops.constant(0)
-      z0 = array_ops.constant(0)
-      z1 = array_ops.constant(0)
-      keys = [x, (y0, y1), [z0, z1]]
-      for key in keys:
-        lp_dict[key] = key
-
-      for key in keys:
-        self.assertTrue(key in lp_dict)
-        self.assertEqual(lp_dict[key], key)
-
-  def testSetItemOverlap(self):
-    """Ensure insertion fails if key overlaps with existing key."""
-    with ops.Graph().as_default():
-      lp_dict = layer_collection.LayerParametersDict()
-
-      x = array_ops.constant(0)
-      y = array_ops.constant(0)
-      lp_dict[x] = 'value'
-
-      with self.assertRaises(ValueError):
-        lp_dict[(x, y)] = 'value'
-
-      # Ensure 'y' wasn't inserted.
-      self.assertTrue(x in lp_dict)
-      self.assertFalse(y in lp_dict)
-
-
-class LayerCollectionTest(test.TestCase):
-
-  def testLayerCollectionInit(self):
-    lc = layer_collection.LayerCollection()
-    self.assertEqual(0, len(lc.get_blocks()))
-    self.assertEqual(0, len(lc.get_factors()))
-    self.assertFalse(lc.losses)
-
-  def testRegisterBlocks(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      lc = layer_collection.LayerCollection()
-      lc.register_fully_connected(
-          array_ops.constant(1), array_ops.constant(2), array_ops.constant(3))
-      lc.register_fully_connected(
-          array_ops.constant(1),
-          array_ops.constant(2),
-          array_ops.constant(3),
-          approx=layer_collection.APPROX_DIAGONAL_NAME)
-      lc.register_conv2d(
-          params=array_ops.ones((2, 3, 4, 5)),
-          strides=[1, 1, 1, 1],
-          padding='SAME',
-          inputs=array_ops.ones((1, 2, 3, 4)),
-          outputs=array_ops.ones((1, 1, 1, 5)))
-      lc.register_conv2d(
-          params=array_ops.ones((2, 3, 4, 5)),
-          strides=[1, 1, 1, 1],
-          padding='SAME',
-          inputs=array_ops.ones((1, 2, 3, 4)),
-          outputs=array_ops.ones((1, 1, 1, 5)),
-          approx=layer_collection.APPROX_DIAGONAL_NAME)
-      lc.register_separable_conv2d(
-          depthwise_params=array_ops.ones((3, 3, 1, 2)),
-          pointwise_params=array_ops.ones((1, 1, 2, 4)),
-          inputs=array_ops.ones((32, 5, 5, 1)),
-          depthwise_outputs=array_ops.ones((32, 5, 5, 2)),
-          pointwise_outputs=array_ops.ones((32, 5, 5, 4)),
-          strides=[1, 1, 1, 1],
-          padding='SAME')
-      lc.register_convolution(
-          params=array_ops.ones((3, 3, 1, 8)),
-          inputs=array_ops.ones((32, 5, 5, 1)),
-          outputs=array_ops.ones((32, 5, 5, 8)),
-          padding='SAME')
-      lc.register_generic(
-          array_ops.constant(5), 16, approx=layer_collection.APPROX_FULL_NAME)
-      lc.register_generic(
-          array_ops.constant(6),
-          16,
-          approx=layer_collection.APPROX_DIAGONAL_NAME)
-      lc.register_fully_connected_multi(
-          array_ops.constant(1),
-          (array_ops.constant(2), array_ops.constant(3)),
-          (array_ops.constant(4), array_ops.constant(5)))
-      lc.register_conv2d_multi(
-          params=array_ops.ones((2, 3, 4, 5)),
-          strides=[1, 1, 1, 1],
-          padding='SAME',
-          inputs=(array_ops.ones((1, 2, 3, 4)), array_ops.ones((5, 6, 7, 8))),
-          outputs=(array_ops.ones((1, 1, 1, 5)), array_ops.ones((2, 2, 2, 10))))
-      lc.register_embedding_multi(
-          array_ops.constant((1,)),
-          (array_ops.constant(2), array_ops.constant(3)),
-          (array_ops.constant(4), array_ops.constant(5)))
-
-      self.assertEqual(12, len(lc.get_blocks()))
-
-  def testRegisterBlocksMultipleRegistrations(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      lc = layer_collection.LayerCollection()
-      key = array_ops.constant(1)
-      lc.register_fully_connected(key, array_ops.constant(2),
-                                  array_ops.constant(3))
-      with self.assertRaises(ValueError) as cm:
-        lc.register_generic(key, 16)
-      self.assertIn('already in LayerCollection', str(cm.exception))
-
-  def testRegisterSingleParamNotRegistered(self):
-    x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {
-        variable_scope.get_variable('y', initializer=array_ops.constant(1,)):
-            '1'
-    }
-    lc.register_block(x, 'foo')
-
-  def testShouldRegisterSingleParamRegistered(self):
-    x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {x: '1'}
-    with self.assertRaises(ValueError) as cm:
-      lc.register_block(x, 'foo')
-    self.assertIn('already in LayerCollection', str(cm.exception))
-
-  def testRegisterSingleParamRegisteredInTuple(self):
-    x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
-    y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {(x, y): '1'}
-    with self.assertRaises(ValueError) as cm:
-      lc.register_block(x, 'foo')
-    self.assertIn('was already registered', str(cm.exception))
-
-  def testRegisterTupleParamNotRegistered(self):
-    x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
-    y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {
-        variable_scope.get_variable('z', initializer=array_ops.constant(1,)):
-            '1'
-    }
-
-    lc.register_block((x, y), 'foo')
-    self.assertEqual(set(['1', 'foo']), set(lc.get_blocks()))
-
-  def testRegisterTupleParamRegistered(self):
-    x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
-    y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {(x, y): '1'}
-
-    with self.assertRaises(ValueError) as cm:
-      lc.register_block((x, y), 'foo')
-    self.assertIn('already in LayerCollection', str(cm.exception))
-
-  def testRegisterTupleParamRegisteredInSuperset(self):
-    x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
-    y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
-    z = variable_scope.get_variable('z', initializer=array_ops.constant(1,))
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {(x, y, z): '1'}
-
-    with self.assertRaises(ValueError) as cm:
-      lc.register_block((x, y), 'foo')
-    self.assertIn('was already registered', str(cm.exception))
-
-  def testRegisterTupleParamSomeRegistered(self):
-    x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
-    y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
-    z = variable_scope.get_variable('z', initializer=array_ops.constant(1,))
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {x: MockFisherBlock('1'), z: MockFisherBlock('2')}
-
-    with self.assertRaises(ValueError) as cm:
-      lc.register_block((x, y), MockFisherBlock('foo'))
-    self.assertIn('was already registered', str(cm.exception))
-
-  def testRegisterTupleVarSomeRegisteredInOtherTuples(self):
-    x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
-    y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
-    z = variable_scope.get_variable('z', initializer=array_ops.constant(1,))
-    w = variable_scope.get_variable('w', initializer=array_ops.constant(1,))
-    lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {(x, z): '1', (z, w): '2'}
-
-    with self.assertRaises(ValueError) as cm:
-      lc.register_block((x, y), 'foo')
-    self.assertIn('was already registered', str(cm.exception))
-
-  def testRegisterCategoricalPredictiveDistribution(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      logits = linalg_ops.eye(2)
-
-      lc = layer_collection.LayerCollection()
-      lc.register_categorical_predictive_distribution(logits, seed=200)
-      single_loss = sess.run(lc.total_sampled_loss())
-
-      lc2 = layer_collection.LayerCollection()
-      lc2.register_categorical_predictive_distribution(logits, seed=200)
-      lc2.register_categorical_predictive_distribution(logits, seed=200)
-      double_loss = sess.run(lc2.total_sampled_loss())
-      self.assertAlmostEqual(2 * single_loss, double_loss)
-
-  def testLossFunctionByName(self):
-    """Ensure loss functions can be identified by name."""
-    with ops.Graph().as_default():
-      logits = linalg_ops.eye(2)
-      lc = layer_collection.LayerCollection()
-
-      # Create a new loss function by name.
-      lc.register_categorical_predictive_distribution(logits, name='loss1')
-      self.assertEqual(1, len(lc.towers_by_loss))
-
-      # Add logits to same loss function.
-      lc.register_categorical_predictive_distribution(
-          logits, name='loss1', reuse=True)
-      self.assertEqual(1, len(lc.towers_by_loss))
-
-      # Add another new loss function.
-      lc.register_categorical_predictive_distribution(logits, name='loss2')
-      self.assertEqual(2, len(lc.towers_by_loss))
-
-  def testLossFunctionWithoutName(self):
-    """Ensure loss functions get unique names if 'name' not specified."""
-    with ops.Graph().as_default():
-      logits = linalg_ops.eye(2)
-      lc = layer_collection.LayerCollection()
-
-      # Create a new loss function with default names.
-      lc.register_categorical_predictive_distribution(logits)
-      lc.register_categorical_predictive_distribution(logits)
-      self.assertEqual(2, len(lc.losses))
-
-  def testCategoricalPredictiveDistributionMultipleMinibatches(self):
-    """Ensure multiple minibatches are registered."""
-    with ops.Graph().as_default():
-      batch_size = 3
-      output_size = 2
-      logits = array_ops.zeros([batch_size, output_size])
-      targets = array_ops.ones([batch_size], dtype=dtypes.int32)
-      lc = layer_collection.LayerCollection()
-
-      # Create a new loss function.
-      lc.register_categorical_predictive_distribution(
-          logits, targets=targets, name='loss1')
-
-      # Can add when reuse=True
-      lc.register_categorical_predictive_distribution(
-          logits, targets=targets, name='loss1', reuse=True)
-
-      # Can add when reuse=VARIABLE_SCOPE and reuse=True there.
-      with variable_scope.variable_scope(
-          variable_scope.get_variable_scope(), reuse=True):
-        lc.register_categorical_predictive_distribution(
-            logits,
-            targets=targets,
-            name='loss1',
-            reuse=layer_collection.VARIABLE_SCOPE)
-
-      # Can't add when reuse=False
-      with self.assertRaises(KeyError):
-        lc.register_categorical_predictive_distribution(
-            logits, targets=targets, name='loss1', reuse=False)
-
-      # Can't add when reuse=VARIABLE_SCOPE and reuse=False there.
-      with self.assertRaises(KeyError):
-        lc.register_categorical_predictive_distribution(
-            logits,
-            targets=targets,
-            name='loss1',
-            reuse=layer_collection.VARIABLE_SCOPE)
-
-      self.assertEqual(len(lc.towers_by_loss), 1)
-      # Three successful registrations.
-      self.assertEqual(len(lc.towers_by_loss[0]), 3)
-
-  def testRegisterCategoricalPredictiveDistributionBatchSize1(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      logits = random_ops.random_normal((1, 2))
-      lc = layer_collection.LayerCollection()
-
-      lc.register_categorical_predictive_distribution(logits, seed=200)
-
-  def testRegisterCategoricalPredictiveDistributionSpecifiedTargets(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      logits = array_ops.constant([[1., 2.], [3., 4.]], dtype=dtypes.float32)
-      lc = layer_collection.LayerCollection()
-      targets = array_ops.constant([0, 1], dtype=dtypes.int32)
-
-      lc.register_categorical_predictive_distribution(logits, targets=targets)
-      single_loss = sess.run(lc.total_loss())
-      self.assertAlmostEqual(1.6265233, single_loss)
-
-  def testRegisterNormalPredictiveDistribution(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      predictions = array_ops.constant(
-          [[1., 2.], [3., 4]], dtype=dtypes.float32)
-
-      lc = layer_collection.LayerCollection()
-      lc.register_normal_predictive_distribution(predictions, 1., seed=200)
-      single_loss = sess.run(lc.total_sampled_loss())
-
-      lc2 = layer_collection.LayerCollection()
-      lc2.register_normal_predictive_distribution(predictions, 1., seed=200)
-      lc2.register_normal_predictive_distribution(predictions, 1., seed=200)
-      double_loss = sess.run(lc2.total_sampled_loss())
-
-      self.assertAlmostEqual(2 * single_loss, double_loss)
-
-  def testRegisterNormalPredictiveDistributionSpecifiedTargets(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      predictions = array_ops.constant(
-          [[1., 2.], [3., 4.]], dtype=dtypes.float32)
-      lc = layer_collection.LayerCollection()
-      targets = array_ops.constant([[3., 1.], [4., 2.]], dtype=dtypes.float32)
-
-      lc.register_normal_predictive_distribution(
-          predictions, 2.**2, targets=targets)
-      single_loss = sess.run(lc.total_loss())
-      self.assertAlmostEqual(7.6983433, single_loss)
-
-  def ensureLayerReuseWorks(self, register_fn):
-    """Ensure the 'reuse' keyword argument function as intended.
-
-    Args:
-      register_fn: function for registering a layer. Arguments are
-        layer_collection, reuse, and approx.
-    """
-    # Fails on second if reuse=False.
-    lc = layer_collection.LayerCollection()
-    register_fn(lc)
-    with self.assertRaises(ValueError):
-      register_fn(lc, reuse=False)
-
-    # Succeeds on second if reuse=True.
-    lc = layer_collection.LayerCollection()
-    register_fn(lc)
-    register_fn(lc, reuse=True)
-
-    # Fails on second if reuse=VARIABLE_SCOPE and no variable reuse.
-    lc = layer_collection.LayerCollection()
-    register_fn(lc)
-    with self.assertRaises(ValueError):
-      register_fn(lc, reuse=layer_collection.VARIABLE_SCOPE)
-
-    # Succeeds on second if reuse=VARIABLE_SCOPE and variable reuse.
-    lc = layer_collection.LayerCollection()
-    register_fn(lc)
-    with variable_scope.variable_scope(
-        variable_scope.get_variable_scope(), reuse=True):
-      register_fn(lc, reuse=layer_collection.VARIABLE_SCOPE)
-
-    # Fails if block type changes.
-    lc = layer_collection.LayerCollection()
-    register_fn(lc, approx=layer_collection.APPROX_KRONECKER_NAME)
-    with self.assertRaises(ValueError):
-      register_fn(lc, approx=layer_collection.APPROX_DIAGONAL_NAME, reuse=True)
-
-    # Fails if reuse requested but no FisherBlock exists.
-    lc = layer_collection.LayerCollection()
-    with self.assertRaises(KeyError):
-      register_fn(lc, reuse=True)
-
-  def testRegisterFullyConnectedReuse(self):
-    """Ensure the 'reuse' works with register_fully_connected."""
-    with ops.Graph().as_default():
-      inputs = array_ops.ones([2, 10])
-      outputs = array_ops.zeros([2, 5])
-      params = (
-          variable_scope.get_variable('w', [10, 5]),  #
-          variable_scope.get_variable('b', [5]))
-
-      def register_fn(lc, **kwargs):
-        lc.register_fully_connected(
-            params=params, inputs=inputs, outputs=outputs, **kwargs)
-
-      self.ensureLayerReuseWorks(register_fn)
-
-  def testRegisterConv2dReuse(self):
-    """Ensure the 'reuse' works with register_conv2d."""
-    with ops.Graph().as_default():
-      inputs = array_ops.ones([2, 5, 5, 10])
-      outputs = array_ops.zeros([2, 5, 5, 3])
-      params = (
-          variable_scope.get_variable('w', [1, 1, 10, 3]),  #
-          variable_scope.get_variable('b', [3]))
-
-      def register_fn(lc, **kwargs):
-        lc.register_conv2d(
-            params=params,
-            strides=[1, 1, 1, 1],
-            padding='SAME',
-            inputs=inputs,
-            outputs=outputs,
-            **kwargs)
-
-      self.ensureLayerReuseWorks(register_fn)
-
-  def testReuseWithInvalidRegistration(self):
-    """Invalid registrations shouldn't overwrite existing blocks."""
-    with ops.Graph().as_default():
-      inputs = array_ops.ones([2, 5, 5, 10])
-      outputs = array_ops.zeros([2, 5, 5, 3])
-      w = variable_scope.get_variable('w', [1, 1, 10, 3])
-      b = variable_scope.get_variable('b', [3])
-      lc = layer_collection.LayerCollection()
-      lc.register_fully_connected(w, inputs, outputs)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 1)
-      with self.assertRaises(KeyError):
-        lc.register_fully_connected((w, b), inputs, outputs, reuse=True)
-      self.assertNotIn((w, b), lc.fisher_blocks)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 1)
-      lc.register_fully_connected(w, inputs, outputs, reuse=True)
-      self.assertEqual(lc.fisher_blocks[w].num_registered_towers, 2)
-
-  def testMakeOrGetFactor(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      lc = layer_collection.LayerCollection()
-      key = array_ops.constant(1)
-      lc.make_or_get_factor(fisher_factors.FullFactor, ((key,), 16))
-      lc.make_or_get_factor(fisher_factors.FullFactor, ((key,), 16))
-      lc.make_or_get_factor(fisher_factors.FullFactor,
-                            ((array_ops.constant(2),), 16))
-
-      self.assertEqual(2, len(lc.get_factors()))
-      variables = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-      self.assertTrue(
-          all([var.name.startswith('LayerCollection') for var in variables]))
-
-  def testMakeOrGetFactorCustomScope(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      scope = 'Foo'
-      lc = layer_collection.LayerCollection(name=scope)
-      key = array_ops.constant(1)
-      lc.make_or_get_factor(fisher_factors.FullFactor, ((key,), 16))
-      lc.make_or_get_factor(fisher_factors.FullFactor, ((key,), 16))
-      lc.make_or_get_factor(fisher_factors.FullFactor,
-                            ((array_ops.constant(2),), 16))
-
-      self.assertEqual(2, len(lc.get_factors()))
-      variables = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-      self.assertTrue(all([var.name.startswith(scope) for var in variables]))
-
-  def testIdentifyLinkedParametersSomeRegisteredInOtherTuples(self):
-    x = variable_scope.get_variable('x', shape=())
-    y = variable_scope.get_variable('y', shape=())
-    z = variable_scope.get_variable('z', shape=())
-    lc = layer_collection.LayerCollection()
-    lc.define_linked_parameters((x, y))
-
-    with self.assertRaises(ValueError):
-      lc.define_linked_parameters((x, z))
-
-  def testIdentifySubsetPreviouslyRegisteredTensor(self):
-    x = variable_scope.get_variable('x', shape=())
-    y = variable_scope.get_variable('y', shape=())
-    lc = layer_collection.LayerCollection()
-    lc.define_linked_parameters((x, y))
-
-    with self.assertRaises(ValueError):
-      lc.define_linked_parameters(x)
-
-  def testSpecifyApproximation(self):
-    w_0 = variable_scope.get_variable('w_0', [10, 10])
-    w_1 = variable_scope.get_variable('w_1', [10, 10])
-
-    b_0 = variable_scope.get_variable('b_0', [10])
-    b_1 = variable_scope.get_variable('b_1', [10])
-
-    x_0 = array_ops.placeholder(dtypes.float32, shape=(32, 10))
-    x_1 = array_ops.placeholder(dtypes.float32, shape=(32, 10))
-
-    pre_bias_0 = math_ops.matmul(x_0, w_0)
-    pre_bias_1 = math_ops.matmul(x_1, w_1)
-
-    # Build the fully connected layers in the graph.
-    pre_bias_0 + b_0  # pylint: disable=pointless-statement
-    pre_bias_1 + b_1  # pylint: disable=pointless-statement
-
-    lc = layer_collection.LayerCollection()
-    lc.define_linked_parameters(
-        w_0, approximation=layer_collection.APPROX_DIAGONAL_NAME)
-    lc.define_linked_parameters(
-        w_1, approximation=layer_collection.APPROX_DIAGONAL_NAME)
-    lc.define_linked_parameters(
-        b_0, approximation=layer_collection.APPROX_FULL_NAME)
-    lc.define_linked_parameters(
-        b_1, approximation=layer_collection.APPROX_FULL_NAME)
-
-    lc.register_fully_connected(w_0, x_0, pre_bias_0)
-    lc.register_fully_connected(
-        w_1, x_1, pre_bias_1, approx=layer_collection.APPROX_KRONECKER_NAME)
-    self.assertIsInstance(lc.fisher_blocks[w_0],
-                          fisher_blocks.FullyConnectedDiagonalFB)
-    self.assertIsInstance(lc.fisher_blocks[w_1],
-                          fisher_blocks.FullyConnectedKFACBasicFB)
-
-    lc.register_generic(b_0, batch_size=1)
-    lc.register_generic(
-        b_1, batch_size=1, approx=layer_collection.APPROX_DIAGONAL_NAME)
-    self.assertIsInstance(lc.fisher_blocks[b_0], fisher_blocks.FullFB)
-    self.assertIsInstance(lc.fisher_blocks[b_1], fisher_blocks.NaiveDiagonalFB)
-
-  def testDefaultLayerCollection(self):
-    with ops.Graph().as_default():
-      # Can't get default if there isn't one set.
-      with self.assertRaises(ValueError):
-        layer_collection.get_default_layer_collection()
-
-      # Can't set default twice.
-      lc = layer_collection.LayerCollection()
-      layer_collection.set_default_layer_collection(lc)
-      with self.assertRaises(ValueError):
-        layer_collection.set_default_layer_collection(lc)
-
-      # Same as one set.
-      self.assertTrue(lc is layer_collection.get_default_layer_collection())
-
-      # Can set to None.
-      layer_collection.set_default_layer_collection(None)
-      with self.assertRaises(ValueError):
-        layer_collection.get_default_layer_collection()
-
-      # as_default() is the same as setting/clearing.
-      with lc.as_default():
-        self.assertTrue(lc is layer_collection.get_default_layer_collection())
-      with self.assertRaises(ValueError):
-        layer_collection.get_default_layer_collection()
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
deleted file mode 100644
index c00af5593f..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.kfac.loss_functions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.kfac.python.ops import loss_functions
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class InsertSliceInZerosTest(test.TestCase):
-
-  def testBadShape(self):
-    bad_shaped_ones = array_ops.ones(shape=[1, 3])  # n.b. shape[1] != 1
-    with self.assertRaises(ValueError):
-      loss_functions.insert_slice_in_zeros(bad_shaped_ones, 1, 42, 17)
-
-  def test3d(self):
-    input_tensor = constant_op.constant([[[1, 2]], [[3, 4]]])
-    expected_output_array = [[[1, 2], [0, 0]], [[3, 4], [0, 0]]]
-    op = loss_functions.insert_slice_in_zeros(input_tensor, 1, 2, 0)
-    with self.test_session() as sess:
-      actual_output_array = sess.run(op)
-    self.assertAllEqual(expected_output_array, actual_output_array)
-
-
-class CategoricalLogitsNegativeLogProbLossTest(test.TestCase):
-
-  def testSample(self):
-    """Ensure samples can be drawn."""
-    with ops.Graph().as_default(), self.test_session() as sess:
-      logits = np.asarray([
-          [0., 0., 0.],  #
-          [1., -1., 0.]
-      ]).astype(np.float32)
-      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(
-          array_ops.constant(logits))
-      sample = loss.sample(42)
-      sample = sess.run(sample)
-      self.assertEqual(sample.shape, (2,))
-
-  def testEvaluateOnTargets(self):
-    """Ensure log probability can be evaluated correctly."""
-    with ops.Graph().as_default(), self.test_session() as sess:
-      logits = np.asarray([
-          [0., 0., 0.],  #
-          [1., -1., 0.]
-      ]).astype(np.float32)
-      targets = np.asarray([2, 1]).astype(np.int32)
-      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(
-          array_ops.constant(logits), targets=array_ops.constant(targets))
-      neg_log_prob = loss.evaluate()
-      neg_log_prob = sess.run(neg_log_prob)
-
-      # Calculate explicit log probability of targets.
-      probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
-      log_probs = np.log([
-          probs[0, targets[0]],  #
-          probs[1, targets[1]]
-      ])
-      expected_log_prob = np.sum(log_probs)
-
-      self.assertAllClose(neg_log_prob, -expected_log_prob)
-
-  def testEvaluateOnSample(self):
-    """Ensure log probability of a sample can be drawn."""
-    with ops.Graph().as_default(), self.test_session() as sess:
-      logits = np.asarray([
-          [0., 0., 0.],  #
-          [1., -1., 0.]
-      ]).astype(np.float32)
-      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(
-          array_ops.constant(logits))
-      neg_log_prob = loss.evaluate_on_sample(42)
-
-      # Simply ensure this doesn't crash. As the output is random, it's
-      # difficult to say if the output is correct or not...
-      neg_log_prob = sess.run(neg_log_prob)
-
-  def testMultiplyFisherSingleVector(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      logits = np.array([1., 2., 3.])
-      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(logits)
-
-      # the LossFunction.multiply_fisher docstring only says it supports the
-      # case where the vector is the same shape as the input natural parameters
-      # (i.e. the logits here), but here we also test leading dimensions
-      vector = np.array([1., 2., 3.])
-      vectors = [vector, vector.reshape(1, -1), np.stack([vector] * 4)]
-
-      probs = np.exp(logits - np.logaddexp.reduce(logits))
-      fisher = np.diag(probs) - np.outer(probs, probs)
-
-      for vector in vectors:
-        result = loss.multiply_fisher(vector)
-        expected_result = np.dot(vector, fisher)
-        self.assertAllClose(expected_result, sess.run(result))
-
-  def testMultiplyFisherBatch(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      logits = np.array([[1., 2., 3.], [4., 6., 8.]])
-      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(logits)
-
-      vector = np.array([[1., 2., 3.], [5., 3., 1.]])
-
-      na = np.newaxis
-      probs = np.exp(logits - np.logaddexp.reduce(logits, axis=-1,
-                                                  keepdims=True))
-      fishers = probs[..., na] * np.eye(3) - probs[..., na] * probs[..., na, :]
-
-      result = loss.multiply_fisher(vector)
-      expected_result = np.matmul(vector[..., na, :], fishers)[..., 0, :]
-      self.assertEqual(sess.run(result).shape, logits.shape)
-      self.assertAllClose(expected_result, sess.run(result))
-
-
-class OnehotCategoricalLogitsNegativeLogProbLossTest(test.TestCase):
-
-  def testSample(self):
-    """Ensure samples can be drawn."""
-    with ops.Graph().as_default(), self.test_session() as sess:
-      logits = np.asarray([
-          [0., 0., 0.],  #
-          [1., -1., 0.]
-      ]).astype(np.float32)
-      loss = loss_functions.OnehotCategoricalLogitsNegativeLogProbLoss(
-          array_ops.constant(logits))
-      sample = loss.sample(42)
-      sample = sess.run(sample)
-      self.assertEqual(sample.shape, (2, 3))
-
-  def testEvaluateOnTargets(self):
-    """Ensure log probability can be evaluated correctly."""
-    with ops.Graph().as_default(), self.test_session() as sess:
-      logits = np.asarray([
-          [0., 0., 0.],  #
-          [1., -1., 0.]
-      ]).astype(np.float32)
-      targets = np.asarray([2, 1]).astype(np.int32)
-      loss = loss_functions.OnehotCategoricalLogitsNegativeLogProbLoss(
-          array_ops.constant(logits), targets=array_ops.one_hot(targets, 3))
-      neg_log_prob = loss.evaluate()
-      neg_log_prob = sess.run(neg_log_prob)
-
-      # Calculate explicit log probability of targets.
-      probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
-      log_probs = np.log([
-          probs[0, targets[0]],  #
-          probs[1, targets[1]]
-      ])
-      expected_log_prob = np.sum(log_probs)
-
-      self.assertAllClose(neg_log_prob, -expected_log_prob)
-
-  def testEvaluateOnSample(self):
-    """Ensure log probability of a sample can be drawn."""
-    with ops.Graph().as_default(), self.test_session() as sess:
-      logits = np.asarray([
-          [0., 0., 0.],  #
-          [1., -1., 0.]
-      ]).astype(np.float32)
-      loss = loss_functions.OnehotCategoricalLogitsNegativeLogProbLoss(
-          array_ops.constant(logits))
-      neg_log_prob = loss.evaluate_on_sample(42)
-
-      # Simply ensure this doesn't crash. As the output is random, it's
-      # difficult to say if the output is correct or not...
-      neg_log_prob = sess.run(neg_log_prob)
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/op_queue_test.py b/tensorflow/contrib/kfac/python/kernel_tests/op_queue_test.py
deleted file mode 100644
index b20a70e4ca..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/op_queue_test.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.kfac.op_queue."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.kfac.python.ops import op_queue
-from tensorflow.python.framework import ops as tf_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class OpQueueTest(test.TestCase):
-
-  def testNextOp(self):
-    """Ensures all ops get selected eventually."""
-    with tf_ops.Graph().as_default():
-      ops = [
-          math_ops.add(1, 2),
-          math_ops.subtract(1, 2),
-          math_ops.reduce_mean([1, 2]),
-      ]
-      queue = op_queue.OpQueue(ops, seed=0)
-
-      with self.test_session() as sess:
-        # Ensure every inv update op gets selected.
-        selected_ops = set([queue.next_op(sess) for _ in ops])
-        self.assertEqual(set(ops), set(selected_ops))
-
-        # Ensure additional calls don't create any new ops.
-        selected_ops.add(queue.next_op(sess))
-        self.assertEqual(set(ops), set(selected_ops))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py b/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py
deleted file mode 100644
index 560a9b0b42..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.kfac.optimizer."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.kfac.python.ops import fisher_factors as ff
-from tensorflow.contrib.kfac.python.ops import layer_collection as lc
-from tensorflow.contrib.kfac.python.ops import optimizer
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.platform import test
-
-
-# We need to set these constants since the numerical values used in the tests
-# were chosen when these used to be the defaults.
-ff.set_global_constants(init_covariances_at_zero=False,
-                        zero_debias=False,
-                        init_inverses_at_zero=False)
-
-
-def dummy_layer_collection():
-  lcoll = lc.LayerCollection()
-  dummy = array_ops.constant([1., 2.])
-  lcoll.register_categorical_predictive_distribution(logits=dummy)
-  return lcoll
-
-
-class OptimizerTest(test.TestCase):
-
-  def testOptimizerInitInvalidMomentumRegistration(self):
-    with self.assertRaises(ValueError):
-      optimizer.KfacOptimizer(
-          0.1, 0.2, 0.3, lc.LayerCollection(), momentum_type='foo')
-
-  def testOptimizerInit(self):
-    with ops.Graph().as_default():
-      layer_collection = lc.LayerCollection()
-
-      inputs = array_ops.ones((2, 1)) * 2
-      weights_val = np.ones((1, 1), dtype=np.float32) * 3.
-      weights = variable_scope.get_variable(
-          'w', initializer=array_ops.constant(weights_val))
-      bias = variable_scope.get_variable(
-          'b', initializer=init_ops.zeros_initializer(), shape=(1, 1))
-      output = math_ops.matmul(inputs, weights) + bias
-
-      layer_collection.register_fully_connected((weights, bias), inputs, output)
-
-      logits = math_ops.tanh(output)
-      targets = array_ops.constant([[0.], [1.]])
-      output = math_ops.reduce_mean(
-          nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets))
-
-      layer_collection.register_categorical_predictive_distribution(logits)
-
-      optimizer.KfacOptimizer(
-          0.1,
-          0.2,
-          0.3,
-          layer_collection,
-          momentum=0.5,
-          momentum_type='regular')
-
-  def testSquaredFisherNorm(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      grads_and_vars = [(array_ops.constant([[1., 2.], [3., 4.]]), None),
-                        (array_ops.constant([[2., 3.], [4., 5.]]), None)]
-      pgrads_and_vars = [(array_ops.constant([[3., 4.], [5., 6.]]), None),
-                         (array_ops.constant([[7., 8.], [9., 10.]]), None)]
-      opt = optimizer.KfacOptimizer(0.1, 0.2, 0.3, dummy_layer_collection())
-      sq_norm = opt._squared_fisher_norm(grads_and_vars, pgrads_and_vars)
-      self.assertAlmostEqual(174., sess.run(sq_norm), places=5)
-
-  def testUpdateClipCoeff(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      grads_and_vars = [(array_ops.constant([[1., 2.], [3., 4.]]), None),
-                        (array_ops.constant([[2., 3.], [4., 5.]]), None)]
-      pgrads_and_vars = [(array_ops.constant([[3., 4.], [5., 6.]]), None),
-                         (array_ops.constant([[7., 8.], [9., 10.]]), None)]
-      lrate = 0.1
-
-      # Note: without rescaling, the squared Fisher norm of the update
-      # is 1.74
-
-      # If the update already satisfies the norm constraint, there should
-      # be no rescaling.
-      opt = optimizer.KfacOptimizer(
-          lrate, 0.2, 0.3, dummy_layer_collection(), norm_constraint=10.)
-      coeff = opt._update_clip_coeff(grads_and_vars, pgrads_and_vars)
-      self.assertAlmostEqual(1., sess.run(coeff), places=5)
-
-      # If the update violates the constraint, it should be rescaled to
-      # be on the constraint boundary.
-      opt = optimizer.KfacOptimizer(
-          lrate, 0.2, 0.3, dummy_layer_collection(), norm_constraint=0.5)
-      coeff = opt._update_clip_coeff(grads_and_vars, pgrads_and_vars)
-      sq_norm_pgrad = opt._squared_fisher_norm(grads_and_vars, pgrads_and_vars)
-      sq_norm_update = lrate**2 * coeff**2 * sq_norm_pgrad
-      self.assertAlmostEqual(0.5, sess.run(sq_norm_update), places=5)
-
-  def testComputeUpdateStepsRegular(self):
-    # TODO(olganw): implement this.
-    pass
-
-  def testComputeUpdateStepsAdam(self):
-    # TODO(olganw): implement this.
-    pass
-
-  def testUpdateVelocities(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      layers = lc.LayerCollection()
-      layers.register_categorical_predictive_distribution(
-          array_ops.constant([1.0]))
-      opt = optimizer.KfacOptimizer(
-          0.1, 0.2, 0.3, layers, momentum=0.5, momentum_type='regular')
-      x = variable_scope.get_variable('x', initializer=array_ops.ones((2, 2)))
-      y = variable_scope.get_variable(
-          'y', initializer=array_ops.ones((2, 2)) * 2)
-      vec1 = array_ops.ones((2, 2)) * 3
-      vec2 = array_ops.ones((2, 2)) * 4
-
-      model_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-      update_op = opt._update_velocities([(vec1, x), (vec2, y)], 0.5)
-      opt_vars = [
-          v for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-          if v not in model_vars
-      ]
-
-      sess.run(tf_variables.global_variables_initializer())
-      old_opt_vars = sess.run(opt_vars)
-
-      # Optimizer vars start out at 0.
-      for opt_var in old_opt_vars:
-        self.assertAllEqual(sess.run(array_ops.zeros_like(opt_var)), opt_var)
-
-      sess.run(update_op)
-      new_opt_vars = sess.run(opt_vars)
-      # After one update, the velocities are equal to the vectors.
-      for vec, opt_var in zip([vec1, vec2], new_opt_vars):
-        self.assertAllEqual(sess.run(vec), opt_var)
-
-      sess.run(update_op)
-      final_opt_vars = sess.run(opt_vars)
-      for first, second in zip(new_opt_vars, final_opt_vars):
-        self.assertFalse(np.equal(first, second).all())
-
-  def testApplyGradients(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      layer_collection = lc.LayerCollection()
-
-      inputs = array_ops.ones((2, 1)) * 2
-      weights_val = np.ones((1, 1), dtype=np.float32) * 3.
-      weights = variable_scope.get_variable(
-          'w', initializer=array_ops.constant(weights_val))
-      bias = variable_scope.get_variable(
-          'b', initializer=init_ops.zeros_initializer(), shape=(1, 1))
-      output = math_ops.matmul(inputs, weights) + bias
-
-      layer_collection.register_fully_connected((weights, bias), inputs, output)
-
-      logits = math_ops.tanh(output)
-      targets = array_ops.constant([[0.], [1.]])
-      output = math_ops.reduce_mean(
-          nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets))
-
-      layer_collection.register_categorical_predictive_distribution(logits)
-
-      opt = optimizer.KfacOptimizer(
-          0.1,
-          0.2,
-          0.3,
-          layer_collection,
-          momentum=0.5,
-          momentum_type='regular')
-      (cov_update_thunks,
-       inv_update_thunks) = opt.make_vars_and_create_op_thunks()
-      cov_update_ops = tuple(thunk() for thunk in cov_update_thunks)
-      inv_update_ops = tuple(thunk() for thunk in inv_update_thunks)
-
-      grads_and_vars = opt.compute_gradients(output, [weights, bias])
-      all_vars = [grad_and_var[1] for grad_and_var in grads_and_vars]
-
-      op = opt.apply_gradients(grads_and_vars)
-
-      sess.run(tf_variables.global_variables_initializer())
-      old_vars = sess.run(all_vars)
-      sess.run(cov_update_ops)
-      sess.run(inv_update_ops)
-      sess.run(op)
-      new_vars = sess.run(all_vars)
-
-      for old_var, new_var in zip(old_vars, new_vars):
-        self.assertNotEqual(old_var, new_var)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
deleted file mode 100644
index 2cee01212a..0000000000
--- a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
+++ /dev/null
@@ -1,410 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf.contrib.kfac.utils."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import numpy.random as npr
-
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class SequenceDictTest(test.TestCase):
-
-  def testSequenceDictInit(self):
-    seq_dict = utils.SequenceDict()
-    self.assertFalse(seq_dict._dict)
-
-  def testSequenceDictInitWithIterable(self):
-    reg_dict = {'a': 'foo', 'b': 'bar'}
-    itr = zip(reg_dict.keys(), reg_dict.values())
-    seq_dict = utils.SequenceDict(itr)
-    self.assertEqual(reg_dict, seq_dict._dict)
-
-  def testGetItemSingleKey(self):
-    seq_dict = utils.SequenceDict({'a': 'foo', 'b': 'bar'})
-    self.assertEqual('foo', seq_dict['a'])
-
-  def testGetItemMultipleKeys(self):
-    seq_dict = utils.SequenceDict({'a': 'foo', 'b': 'bar'})
-    self.assertEqual(['foo', 'bar'], seq_dict[('a', 'b')])
-
-  def testSetItemSingleKey(self):
-    seq_dict = utils.SequenceDict()
-    seq_dict['a'] = 'foo'
-    self.assertEqual([('a', 'foo')], seq_dict.items())
-
-  def testSetItemMultipleKeys(self):
-    seq_dict = utils.SequenceDict()
-    keys = ('a', 'b', 'c')
-    values = ('foo', 'bar', 'baz')
-    seq_dict[keys] = values
-    self.assertItemsEqual(list(zip(keys, values)), seq_dict.items())
-
-
-class SubGraphTest(test.TestCase):
-
-  def testBasicGraph(self):
-    a = array_ops.constant([[1., 2.], [3., 4.]])
-    b = array_ops.constant([[5., 6.], [7., 8.]])
-    c = a + b
-    d = a * b
-    sub_graph = utils.SubGraph((c,))
-    self.assertTrue(sub_graph.is_member(a))
-    self.assertTrue(sub_graph.is_member(b))
-    self.assertTrue(sub_graph.is_member(c))
-    self.assertFalse(sub_graph.is_member(d))
-
-  def testRepeatedAdds(self):
-    a = array_ops.constant([[1., 2.], [3., 4.]])
-    b = array_ops.constant([[5., 6.], [7., 8.]])
-    c = a + b + a  # note that a appears twice in this graph
-    sub_graph = utils.SubGraph((c,))
-    self.assertTrue(sub_graph.is_member(a))
-    self.assertTrue(sub_graph.is_member(b))
-    self.assertTrue(sub_graph.is_member(c))
-
-  def testFilterList(self):
-    a = array_ops.constant([[1., 2.], [3., 4.]])
-    b = array_ops.constant([[5., 6.], [7., 8.]])
-    c = a + b
-    d = a * b
-    sub_graph = utils.SubGraph((c,))
-    input_list = [b, d]
-    filtered_list = sub_graph.filter_list(input_list)
-    self.assertEqual(filtered_list, [b])
-
-  def testVariableUses(self):
-    with ops.Graph().as_default():
-      var = variable_scope.get_variable('var', shape=[10, 10])
-      resource_var = variable_scope.get_variable(
-          'resource_var', shape=[10, 10], use_resource=True)
-      x = array_ops.zeros([3, 10])
-      z0 = math_ops.matmul(x, var) + math_ops.matmul(x, var)
-      z1 = math_ops.matmul(x, resource_var)
-      sub_graph = utils.SubGraph((z0, z1))
-      self.assertEqual(2, sub_graph.variable_uses(var))
-      self.assertEqual(1, sub_graph.variable_uses(resource_var))
-
-
-class UtilsTest(test.TestCase):
-
-  def _fully_connected_layer_params(self):
-    weights_part = array_ops.constant([[1., 2.], [4., 3.]])
-    bias_part = array_ops.constant([1., 2.])
-    return (weights_part, bias_part)
-
-  def _conv_layer_params(self):
-    weights_shape = 2, 2, 3, 4
-    biases_shape = weights_shape[-1:]
-    weights = array_ops.constant(npr.RandomState(0).randn(*weights_shape))
-    biases = array_ops.constant(npr.RandomState(1).randn(*biases_shape))
-    return (weights, biases)
-
-  def testFullyConnectedLayerParamsTupleToMat2d(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      layer_params = self._fully_connected_layer_params()
-      output = utils.layer_params_to_mat2d(layer_params)
-      self.assertListEqual([3, 2], output.get_shape().as_list())
-      self.assertAllClose(
-          sess.run(output), np.array([[1., 2.], [4., 3.], [1., 2.]]))
-
-  def testFullyConnectedLayerParamsTensorToMat2d(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      layer_params = self._fully_connected_layer_params()
-      output = utils.layer_params_to_mat2d(layer_params[0])
-      self.assertListEqual([2, 2], output.get_shape().as_list())
-      self.assertAllClose(sess.run(output), np.array([[1., 2.], [4., 3.]]))
-
-  def testConvLayerParamsTupleToMat2d(self):
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(200)
-      layer_params = self._conv_layer_params()
-      output = utils.layer_params_to_mat2d(layer_params)
-      self.assertListEqual([2 * 2 * 3 + 1, 4], output.get_shape().as_list())
-
-  def testKron(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      mat1 = np.array([[1., 2.], [3., 4.]])
-      mat2 = np.array([[5., 6.], [7., 8.]])
-      mat1_tf = array_ops.constant(mat1)
-      mat2_tf = array_ops.constant(mat2)
-      ans_tf = sess.run(utils.kronecker_product(mat1_tf, mat2_tf))
-      ans_np = np.kron(mat1, mat2)
-      self.assertAllClose(ans_tf, ans_np)
-
-  def testMat2dToFullyConnectedLayerParamsTuple(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      vector_template = self._fully_connected_layer_params()
-      mat2d = array_ops.constant([[5., 4.], [3., 2.], [1., 0.]])
-
-      output = sess.run(utils.mat2d_to_layer_params(vector_template, mat2d))
-
-      self.assertIsInstance(output, tuple)
-      self.assertEqual(len(output), 2)
-      a, b = output
-      self.assertAllClose(a, np.array([[5., 4.], [3., 2.]]))
-      self.assertAllClose(b, np.array([1., 0.]))
-
-  def testMat2dToFullyConnectedLayerParamsTensor(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      vector_template = self._fully_connected_layer_params()[0]
-      mat2d = array_ops.constant([[5., 4.], [3., 2.]])
-
-      output = sess.run(utils.mat2d_to_layer_params(vector_template, mat2d))
-
-      self.assertAllClose(output, np.array([[5., 4.], [3., 2.]]))
-
-  def testTensorsToColumn(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-
-      vector = array_ops.constant(np.array([[0., 1.], [2., 3.]]))
-      output = utils.tensors_to_column(vector)
-      self.assertListEqual([4, 1], output.get_shape().as_list())
-      self.assertAllClose(sess.run(output), np.array([0., 1., 2., 3.])[:, None])
-
-      vector = self._fully_connected_layer_params()
-      output = utils.tensors_to_column(vector)
-      self.assertListEqual([6, 1], output.get_shape().as_list())
-      self.assertAllClose(
-          sess.run(output), np.array([1., 2., 4., 3., 1., 2.])[:, None])
-
-      vector = list(vector)
-      vector.append(array_ops.constant([[6.], [7.], [8.], [9.]]))
-
-      output = utils.tensors_to_column(vector)
-      self.assertListEqual([10, 1], output.get_shape().as_list())
-      self.assertAllClose(
-          sess.run(output),
-          np.array([1., 2., 4., 3., 1., 2., 6., 7., 8., 9.])[:, None])
-
-  def testColumnToTensors(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-
-      vector_template = array_ops.constant(np.array([[0., 1.], [2., 3.]]))
-      colvec = array_ops.constant(np.arange(4.)[:, None])
-      output = sess.run(utils.column_to_tensors(vector_template, colvec))
-      self.assertAllClose(output, np.array([[0., 1.], [2., 3.]]))
-
-      vector_template = self._fully_connected_layer_params()
-      colvec = array_ops.constant(np.arange(6.)[:, None])
-      output = sess.run(utils.column_to_tensors(vector_template, colvec))
-
-      self.assertIsInstance(output, tuple)
-      self.assertEqual(len(output), 2)
-      a, b = output
-      self.assertAllClose(a, np.array([[0., 1.], [2., 3.]]))
-      self.assertAllClose(b, np.array([4., 5.]))
-
-      vector_template = list(vector_template)
-      vector_template.append(array_ops.constant([[6.], [7.], [8.], [9.]]))
-      colvec = array_ops.constant(np.arange(10.)[:, None])
-      output = sess.run(utils.column_to_tensors(vector_template, colvec))
-      self.assertIsInstance(output, tuple)
-      self.assertEqual(len(output), 3)
-      a, b, c = output
-      self.assertAllClose(a, np.array([[0., 1.], [2., 3.]]))
-      self.assertAllClose(b, np.array([4., 5.]))
-      self.assertAllClose(c, np.array([[6.], [7.], [8.], [9.]]))
-
-  def testPosDefInvCholesky(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      npr.seed(0)
-      square = lambda x: np.dot(x, x.T)
-
-      size = 3
-      x = square(npr.randn(size, size))
-      damp = 0.1
-      identity = linalg_ops.eye(size, dtype=dtypes.float64)
-
-      tf_inv = utils.posdef_inv_cholesky(array_ops.constant(x), identity, damp)
-      np_inv = np.linalg.inv(x + damp * np.eye(size))
-      self.assertAllClose(sess.run(tf_inv), np_inv)
-
-  def testPosDefInvMatrixInverse(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      npr.seed(0)
-      square = lambda x: np.dot(x, x.T)
-
-      size = 3
-      x = square(npr.randn(size, size))
-      damp = 0.1
-      identity = linalg_ops.eye(size, dtype=dtypes.float64)
-
-      tf_inv = utils.posdef_inv_matrix_inverse(
-          array_ops.constant(x), identity, damp)
-      np_inv = np.linalg.inv(x + damp * np.eye(size))
-      self.assertAllClose(sess.run(tf_inv), np_inv)
-
-  def testCrossReplicaMean(self):
-    """Ensures that cross_replica_mean() executes only when num_shards > 1."""
-    with ops.Graph().as_default():
-      with tpu_function.tpu_shard_context(4):
-        tensor = array_ops.zeros([], dtype=dtypes.float32)
-        mean = utils.cross_replica_mean(tensor)
-      self.assertNotEqual(mean, tensor)
-
-    with ops.Graph().as_default():
-      with tpu_function.tpu_shard_context(1):
-        tensor = array_ops.zeros([], dtype=dtypes.float32)
-        mean = utils.cross_replica_mean(tensor)
-      self.assertEqual(mean, tensor)
-
-    with ops.Graph().as_default():
-      with self.assertRaises(ValueError):  # Outside of TPU context.
-        tensor = array_ops.zeros([], dtype=dtypes.float32)
-        mean = utils.cross_replica_mean(tensor)
-
-  def testBatchExecute(self):
-    """Ensure batch_execute runs in a round-robin fashion."""
-
-    def increment_var(var):
-      return lambda: var.assign_add(1)
-
-    with ops.Graph().as_default(), self.test_session() as sess:
-      i = variable_scope.get_variable('i', initializer=0)
-      accumulators = [
-          variable_scope.get_variable('var%d' % j, initializer=0)
-          for j in range(3)
-      ]
-      thunks = [increment_var(var) for var in accumulators]
-      increment_accumulators = utils.batch_execute(i, thunks, 2)
-      increment_i = i.assign_add(1)
-
-      sess.run(variables.global_variables_initializer())
-
-      # Ensure one op per thunk.
-      self.assertEqual(3, len(increment_accumulators))
-
-      # Ensure round-robin execution.
-      values = []
-      for _ in range(5):
-        sess.run(increment_accumulators)
-        sess.run(increment_i)
-        values.append(sess.run(accumulators))
-      self.assertAllClose(
-          [
-              [1, 1, 0],  #
-              [2, 1, 1],  #
-              [2, 2, 2],  #
-              [3, 3, 2],  #
-              [4, 3, 3]
-          ],
-          values)
-
-  def testExtractConvolutionPatches(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      batch_size = 10
-      image_spatial_shape = [9, 10, 11]
-      in_channels = out_channels = 32
-      kernel_spatial_shape = [5, 3, 3]
-      spatial_strides = [1, 2, 1]
-      spatial_dilation = [1, 1, 1]
-      padding = 'SAME'
-
-      images = random_ops.random_uniform(
-          [batch_size] + image_spatial_shape + [in_channels], seed=0)
-      kernel_shape = kernel_spatial_shape + [in_channels, out_channels]
-      kernel = random_ops.random_uniform(kernel_shape, seed=1)
-
-      # Ensure shape matches expectation.
-      patches = utils.extract_convolution_patches(
-          images,
-          kernel_shape,
-          padding,
-          strides=spatial_strides,
-          dilation_rate=spatial_dilation)
-      result_spatial_shape = (
-          patches.shape.as_list()[1:1 + len(image_spatial_shape)])
-      self.assertEqual(patches.shape.as_list(),
-                       [batch_size] + result_spatial_shape +
-                       kernel_spatial_shape + [in_channels])
-
-      # Ensure extract...patches() + matmul() and convolution() implementation
-      # give the same answer.
-      outputs = nn_ops.convolution(
-          images,
-          kernel,
-          padding,
-          strides=spatial_strides,
-          dilation_rate=spatial_dilation)
-
-      patches_flat = array_ops.reshape(
-          patches, [-1, np.prod(kernel_spatial_shape) * in_channels])
-      kernel_flat = array_ops.reshape(kernel, [-1, out_channels])
-      outputs_flat = math_ops.matmul(patches_flat, kernel_flat)
-
-      outputs_, outputs_flat_ = sess.run([outputs, outputs_flat])
-      self.assertAllClose(outputs_.flatten(), outputs_flat_.flatten())
-
-  def testExtractPointwiseConv2dPatches(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      batch_size = 10
-      image_height = image_width = 8
-      in_channels = out_channels = 3
-      kernel_height = kernel_width = 1
-      strides = [1, 1, 1, 1]
-      padding = 'VALID'
-
-      images = random_ops.random_uniform(
-          [batch_size, image_height, image_width, in_channels], seed=0)
-      kernel_shape = [kernel_height, kernel_width, in_channels, out_channels]
-      kernel = random_ops.random_uniform(kernel_shape, seed=1)
-
-      # Ensure shape matches expectation.
-      patches = utils.extract_pointwise_conv2d_patches(images, kernel_shape)
-      self.assertEqual(patches.shape.as_list(), [
-          batch_size, image_height, image_width, kernel_height, kernel_width,
-          in_channels
-      ])
-
-      # Ensure extract...patches() + matmul() and conv2d() implementation
-      # give the same answer.
-      outputs = nn_ops.conv2d(images, kernel, strides, padding)
-
-      patches_flat = array_ops.reshape(
-          patches, [-1, kernel_height * kernel_width * in_channels])
-      kernel_flat = array_ops.reshape(kernel, [-1, out_channels])
-      outputs_flat = math_ops.matmul(patches_flat, kernel_flat)
-
-      outputs_, outputs_flat_ = sess.run([outputs, outputs_flat])
-      self.assertAllClose(outputs_.flatten(), outputs_flat_.flatten())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD
deleted file mode 100644
index 3c01eb65e7..0000000000
--- a/tensorflow/contrib/kfac/python/ops/BUILD
+++ /dev/null
@@ -1,263 +0,0 @@
-package(default_visibility = [
-    "//tensorflow/contrib/kfac:__pkg__",
-    "//tensorflow/contrib/kfac/python/kernel_tests:__pkg__",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "fisher_blocks",
-    srcs = ["fisher_blocks.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":fisher_factors",
-        ":utils",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "fisher_blocks_lib",
-    srcs = ["fisher_blocks_lib.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":fisher_blocks",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "fisher_factors",
-    srcs = ["fisher_factors.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":linear_operator",
-        ":utils",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:special_math_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "fisher_factors_lib",
-    srcs = ["fisher_factors_lib.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":fisher_factors",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "linear_operator",
-    srcs = ["linear_operator.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":utils",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python/ops/linalg",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "loss_functions",
-    srcs = ["loss_functions.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/ops/distributions",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "loss_functions_lib",
-    srcs = ["loss_functions_lib.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":loss_functions",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "curvature_matrix_vector_products",
-    srcs = ["curvature_matrix_vector_products.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":utils",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "curvature_matrix_vector_products_lib",
-    srcs = ["curvature_matrix_vector_products_lib.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":curvature_matrix_vector_products",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "layer_collection",
-    srcs = ["layer_collection.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":fisher_blocks",
-        ":loss_functions",
-        ":utils",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "layer_collection_lib",
-    srcs = ["layer_collection_lib.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":layer_collection",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "kfac_optimizer",
-    srcs = [
-        "optimizer.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":curvature_matrix_vector_products",
-        ":fisher_estimator",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-    ],
-)
-
-py_library(
-    name = "kfac_optimizer_lib",
-    srcs = [
-        "optimizer_lib.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":kfac_optimizer",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "fisher_estimator",
-    srcs = [
-        "estimator.py",
-        "placement.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":utils",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "fisher_estimator_lib",
-    srcs = [
-        "estimator_lib.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":fisher_estimator",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "utils",
-    srcs = ["utils.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/tpu",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:linalg_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "utils_lib",
-    srcs = ["utils_lib.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":utils",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "op_queue",
-    srcs = ["op_queue.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/python:framework_ops",
-    ],
-)
-
-py_library(
-    name = "op_queue_lib",
-    srcs = ["op_queue_lib.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":op_queue",
-        "//tensorflow/python:util",
-    ],
-)
diff --git a/tensorflow/contrib/kfac/python/ops/curvature_matrix_vector_products.py b/tensorflow/contrib/kfac/python/ops/curvature_matrix_vector_products.py
deleted file mode 100644
index 21b5cde9b9..0000000000
--- a/tensorflow/contrib/kfac/python/ops/curvature_matrix_vector_products.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Curvature matrix-vector multiplication."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-from tensorflow.python.util import nest
-
-
-class CurvatureMatrixVectorProductComputer(object):
-  """Class for computing matrix-vector products for Fishers, GGNs and Hessians.
-
-  In other words we compute M*v where M is the matrix, v is the vector, and
-  * refers to standard matrix/vector multiplication (not element-wise
-  multiplication).
-
-  The matrices are defined in terms of some differential quantity of the total
-  loss function with respect to a provided list of tensors ("wrt_tensors").
-  For example, the Fisher associated with a log-prob loss w.r.t. the
-  parameters.
-
-  The 'vecs' argument to each method are lists of tensors that must be the
-  size as the corresponding ones from "wrt_tensors".  They represent
-  the vector being multiplied.
-
-  "factors" of the matrix M are defined as matrices B such that B*B^T = M.
-  Methods that multiply by the factor B take a 'loss_inner_vecs' argument
-  instead of 'vecs', which must be a list of tensors with shapes given by the
-  corresponding XXX_inner_shapes property.
-
-  Note that matrix-vector products are not normalized by the batch size, nor
-  are any damping terms added to the results.  These things can be easily
-  applied externally, if desired.
-
-  See for example: www.cs.utoronto.ca/~jmartens/docs/HF_book_chapter.pdf
-  and https://arxiv.org/abs/1412.1193 for more information about the
-  generalized Gauss-Newton, Fisher, etc., and how to compute matrix-vector
-  products.
-  """
-
-  def __init__(self, losses, wrt_tensors):
-    """Create a CurvatureMatrixVectorProductComputer object.
-
-    Args:
-      losses: A list of LossFunction instances whose sum defines the total loss.
-      wrt_tensors: A list of Tensors to compute the differential quantities
-        (defining the matrices) with respect to.  See class description for more
-        info.
-    """
-    self._losses = losses
-    self._inputs_to_losses = list(loss.inputs for loss in losses)
-    self._inputs_to_losses_flat = nest.flatten(self._inputs_to_losses)
-    self._wrt_tensors = wrt_tensors
-
-  @property
-  def _total_loss(self):
-    return math_ops.add_n(tuple(loss.evaluate() for loss in self._losses))
-
-  # Jacobian multiplication functions:
-  def _multiply_jacobian(self, vecs):
-    """Multiply vecs by the Jacobian of losses."""
-    # We stop gradients at wrt_tensors to produce partial derivatives (which is
-    # what we want for Jacobians).
-    jacobian_vecs_flat = utils.fwd_gradients(
-        self._inputs_to_losses_flat, self._wrt_tensors, grad_xs=vecs,
-        stop_gradients=self._wrt_tensors)
-    return nest.pack_sequence_as(self._inputs_to_losses, jacobian_vecs_flat)
-
-  def _multiply_jacobian_transpose(self, loss_vecs):
-    """Multiply vecs by the transpose Jacobian of losses."""
-    loss_vecs_flat = nest.flatten(loss_vecs)
-    # We stop gradients at wrt_tensors to produce partial derivatives (which is
-    # what we want for Jacobians).
-    return gradients_impl.gradients(
-        self._inputs_to_losses_flat, self._wrt_tensors, grad_ys=loss_vecs_flat,
-        stop_gradients=self._wrt_tensors)
-
-  # Losses Fisher/Hessian multiplication functions:
-  def _multiply_loss_fisher(self, loss_vecs):
-    """Multiply loss_vecs by Fisher of total loss."""
-    return tuple(
-        loss.multiply_fisher(loss_vec)
-        for loss, loss_vec in zip(self._losses, loss_vecs))
-
-  def _multiply_loss_fisher_factor(self, loss_inner_vecs):
-    """Multiply loss_inner_vecs by factor of Fisher of total loss."""
-    return tuple(
-        loss.multiply_fisher_factor(loss_vec)
-        for loss, loss_vec in zip(self._losses, loss_inner_vecs))
-
-  def _multiply_loss_fisher_factor_transpose(self, loss_vecs):
-    """Multiply loss_vecs by transpose factor of Fisher of total loss."""
-    return tuple(
-        loss.multiply_fisher_factor_transpose(loss_vec)
-        for loss, loss_vec in zip(self._losses, loss_vecs))
-
-  def _multiply_loss_hessian(self, loss_vecs):
-    """Multiply loss_vecs by Hessian of total loss."""
-    return tuple(
-        loss.multiply_hessian(loss_vec)
-        for loss, loss_vec in zip(self._losses, loss_vecs))
-
-  def _multiply_loss_hessian_factor(self, loss_inner_vecs):
-    """Multiply loss_inner_vecs by factor of Hessian of total loss."""
-    return tuple(
-        loss.multiply_hessian_factor(loss_vec)
-        for loss, loss_vec in zip(self._losses, loss_inner_vecs))
-
-  def _multiply_loss_hessian_factor_transpose(self, loss_vecs):
-    """Multiply loss_vecs by transpose factor of Hessian of total loss."""
-    return tuple(
-        loss.multiply_hessian_factor_transpose(loss_vec)
-        for loss, loss_vec in zip(self._losses, loss_vecs))
-
-  # Matrix-vector product functions:
-  def multiply_fisher(self, vecs):
-    """Multiply vecs by Fisher of total loss."""
-    jacobian_vecs = self._multiply_jacobian(vecs)
-    loss_fisher_jacobian_vecs = self._multiply_loss_fisher(jacobian_vecs)
-    return self._multiply_jacobian_transpose(loss_fisher_jacobian_vecs)
-
-  def multiply_fisher_factor_transpose(self, vecs):
-    """Multiply vecs by transpose of factor of Fisher of total loss."""
-    jacobian_vecs = self._multiply_jacobian(vecs)
-    return self._multiply_loss_fisher_factor_transpose(jacobian_vecs)
-
-  def multiply_fisher_factor(self, loss_inner_vecs):
-    """Multiply loss_inner_vecs by factor of Fisher of total loss."""
-    fisher_factor_transpose_vecs = self._multiply_loss_fisher_factor_transpose(
-        loss_inner_vecs)
-    return self._multiply_jacobian_transpose(fisher_factor_transpose_vecs)
-
-  def multiply_hessian(self, vecs):
-    """Multiply vecs by Hessian of total loss."""
-    return gradients_impl.gradients(
-        gradients_impl.gradients(self._total_loss, self._wrt_tensors),
-        self._wrt_tensors,
-        grad_ys=vecs)
-
-  def multiply_generalized_gauss_newton(self, vecs):
-    """Multiply vecs by generalized Gauss-Newton of total loss."""
-    jacobian_vecs = self._multiply_jacobian(vecs)
-    loss_hessian_jacobian_vecs = self._multiply_loss_hessian(jacobian_vecs)
-    return self._multiply_jacobian_transpose(loss_hessian_jacobian_vecs)
-
-  def multiply_generalized_gauss_newton_factor_transpose(self, vecs):
-    """Multiply vecs by transpose of factor of GGN of total loss."""
-    jacobian_vecs = self._multiply_jacobian(vecs)
-    return self._multiply_loss_hessian_factor_transpose(jacobian_vecs)
-
-  def multiply_generalized_gauss_newton_factor(self, loss_inner_vecs):
-    """Multiply loss_inner_vecs by factor of GGN of total loss."""
-    hessian_factor_transpose_vecs = (
-        self._multiply_loss_hessian_factor_transpose(loss_inner_vecs))
-    return self._multiply_jacobian_transpose(hessian_factor_transpose_vecs)
-
-  # Shape properties for multiply_XXX_factor methods:
-  @property
-  def fisher_factor_inner_shapes(self):
-    """Shapes required by multiply_fisher_factor."""
-    return tuple(loss.fisher_factor_inner_shape for loss in self._losses)
-
-  @property
-  def generalized_gauss_newton_factor_inner_shapes(self):
-    """Shapes required by multiply_generalized_gauss_newton_factor."""
-    return tuple(loss.hessian_factor_inner_shape for loss in self._losses)
diff --git a/tensorflow/contrib/kfac/python/ops/curvature_matrix_vector_products_lib.py b/tensorflow/contrib/kfac/python/ops/curvature_matrix_vector_products_lib.py
deleted file mode 100644
index 6e8c6404dc..0000000000
--- a/tensorflow/contrib/kfac/python/ops/curvature_matrix_vector_products_lib.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Curvature matrix-vector multiplication."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.curvature_matrix_vector_products import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    'CurvatureMatrixVectorProductComputer',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
deleted file mode 100644
index 854f885c26..0000000000
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ /dev/null
@@ -1,516 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Defines the high-level Fisher estimator class."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import numpy as np
-import six
-
-from tensorflow.contrib.kfac.python.ops import placement
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import ops as tf_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.util import nest
-
-
-# The linter is confused.
-# pylint: disable=abstract-class-instantiated
-def make_fisher_estimator(placement_strategy=None, **kwargs):
-  """Creates Fisher estimator instances based on the placement strategy.
-
-  For example if the `placement_strategy` is 'round_robin' then
-  `FisherEstimatorRoundRobin` instance is returned.
-
-  Args:
-    placement_strategy: `string`, Strategy to be used for placing covariance
-      variables, covariance ops and inverse ops. Check
-      `placement.FisherEstimatorRoundRobin` for a concrete example.
-   **kwargs: Arguments to be passed into `FisherEstimator` class initializer.
-
-  Returns:
-    An instance of class which inherits from `FisherEstimator` and the mixin
-    which implements specific placement strategy. See,
-    `FisherEstimatorRoundRobin` which inherits from `FisherEstimator` and
-    `RoundRobinPlacementMixin`.
-
-  Raises:
-    ValueError: If the `placement_strategy` is not equal to 'round_robin'.
-  """
-  if placement_strategy in [None, "round_robin"]:
-    return FisherEstimatorRoundRobin(**kwargs)
-  else:
-    raise ValueError("Unimplemented vars and ops "
-                     "placement strategy : {}".format(placement_strategy))
-# pylint: enable=abstract-class-instantiated
-
-
-@six.add_metaclass(abc.ABCMeta)
-class FisherEstimator(object):
-  """Fisher estimator class supporting various approximations of the Fisher.
-
-  This is an abstract base class which does not implement a strategy for
-  placing covariance variables, covariance update ops and inverse update ops.
-  The placement strategies are implemented in `placement.py`. See
-  `FisherEstimatorRoundRobin` for example of a concrete subclass with
-  a round-robin placement strategy.
-  """
-
-  def __init__(self,
-               variables,
-               cov_ema_decay,
-               damping,
-               layer_collection,
-               exps=(-1,),
-               estimation_mode="gradients",
-               colocate_gradients_with_ops=True,
-               name="FisherEstimator",
-               compute_cholesky=False,
-               compute_cholesky_inverse=False):
-    """Create a FisherEstimator object.
-
-    Args:
-      variables: A `list` of variables or `callable` which returns the variables
-          for which to estimate the Fisher. This must match the variables
-          registered in layer_collection (if it is not None).
-      cov_ema_decay: The decay factor used when calculating the covariance
-          estimate moving averages.
-      damping: float. The damping factor used to stabilize training due to
-          errors in the local approximation with the Fisher information matrix,
-          and to regularize the update direction by making it closer to the
-          gradient. (Higher damping means the update looks more like a standard
-          gradient update - see Tikhonov regularization.)
-      layer_collection: The layer collection object, which holds the fisher
-          blocks, kronecker factors, and losses associated with the
-          graph.
-      exps: List of floats or ints. These represent the different matrix
-          powers of the approximate Fisher that the FisherEstimator will be able
-          to multiply vectors by. If the user asks for a matrix power other
-          one of these (or 1, which is always supported), there will be a
-          failure. (Default: (-1,))
-      estimation_mode: The type of estimator to use for the Fishers.  Can be
-          'gradients', 'empirical', 'curvature_prop', or 'exact'.
-          (Default: 'gradients').  'gradients' is the basic estimation approach
-          from the original K-FAC paper.  'empirical' computes the 'empirical'
-          Fisher information matrix (which uses the data's distribution for the
-          targets, as opposed to the true Fisher which uses the model's
-          distribution) and requires that each registered loss have specified
-          targets. 'curvature_propagation' is a method which estimates the
-          Fisher using self-products of random 1/-1 vectors times "half-factors"
-          of the Fisher, as described here: https://arxiv.org/abs/1206.6464 .
-          Finally, 'exact' is the obvious generalization of Curvature
-          Propagation to compute the exact Fisher (modulo any additional
-          diagonal or Kronecker approximations) by looping over one-hot vectors
-          for each coordinate of the output instead of using 1/-1 vectors.  It
-          is more expensive to compute than the other three options by a factor
-          equal to the output dimension, roughly speaking.
-      colocate_gradients_with_ops: Whether we should request gradients be
-          colocated with their respective ops. (Default: True)
-      name: A string. A name given to this estimator, which is added to the
-          variable scope when constructing variables and ops.
-          (Default: "FisherEstimator")
-      compute_cholesky: Bool. Whether or not the FisherEstimator will be
-          able to multiply vectors by the Cholesky factor.
-          (Default: False)
-      compute_cholesky_inverse: Bool. Whether or not the FisherEstimator
-          will be able to multiply vectors by the Cholesky factor inverse.
-          (Default: False)
-    Raises:
-      ValueError: If no losses have been registered with layer_collection.
-    """
-    self._variables = variables
-    self._cov_ema_decay = cov_ema_decay
-    self._damping = damping
-    self._estimation_mode = estimation_mode
-    self._layers = layer_collection
-    self._gradient_fns = {
-        "gradients": self._get_grads_lists_gradients,
-        "empirical": self._get_grads_lists_empirical,
-        "curvature_prop": self._get_grads_lists_curvature_prop,
-        "exact": self._get_grads_lists_exact
-    }
-    self._colocate_gradients_with_ops = colocate_gradients_with_ops
-
-    self._made_vars = False
-    self._exps = exps
-    self._compute_cholesky = compute_cholesky
-    self._compute_cholesky_inverse = compute_cholesky_inverse
-
-    self._name = name
-
-  @property
-  def variables(self):
-    if callable(self._variables):
-      return self._variables()
-    else:
-      return self._variables
-
-  @property
-  def damping(self):
-    return self._damping
-
-  @property
-  def blocks(self):
-    """All registered FisherBlocks."""
-    return self._layers.get_blocks()
-
-  @property
-  def factors(self):
-    """All registered FisherFactors."""
-    return self._layers.get_factors()
-
-  @property
-  def name(self):
-    return self._name
-
-  @abc.abstractmethod
-  def make_vars_and_create_op_thunks(self, scope=None):
-    """Make vars and create op thunks with a specific placement strategy.
-
-    For each factor, all of that factor's cov variables and their associated
-    update ops will be placed on a particular device.  A new device is chosen
-    for each factor by cycling through list of devices in the cov_devices
-    argument. If cov_devices is None then no explicit device placement occurs.
-
-    An analogous strategy is followed for inverse update ops, with the list of
-    devices being given by the inv_devices argument.
-
-    Inverse variables on the other hand are not placed on any specific device
-    (they will just use the current the device placement context, whatever
-    that happens to be).  The idea is that the inverse variable belong where
-    they will be accessed most often, which is the device that actually applies
-    the preconditioner to the gradient. The user will be responsible for setting
-    the device context for this.
-
-    Args:
-      scope: A string or None.  If None it will be set to the name of this
-        estimator (given by the name property). All variables will be created,
-        and all thunks will execute, inside of a variable scope of the given
-        name. (Default: None)
-
-    Returns:
-      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
-        the list of factors given by the "factors" property.
-      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
-        the list of factors given by the "factors" property.
-    """
-    pass
-
-  def _apply_transformation(self, vecs_and_vars, transform):
-    """Applies an block-wise transformation to the corresponding vectors.
-
-    Args:
-      vecs_and_vars: List of (vector, variable) pairs.
-      transform: A function of the form f(fb, vec), where vec is the vector
-          to transform and fb is its corresponding block in the matrix, that
-          returns the transformed vector.
-
-    Returns:
-      A list of (transformed vector, var) pairs in the same order as
-      vecs_and_vars.
-    """
-
-    vecs = utils.SequenceDict((var, vec) for vec, var in vecs_and_vars)
-
-    trans_vecs = utils.SequenceDict()
-
-    for params, fb in self._layers.fisher_blocks.items():
-      trans_vecs[params] = transform(fb, vecs[params])
-
-    return [(trans_vecs[var], var) for _, var in vecs_and_vars]
-
-  def multiply_inverse(self, vecs_and_vars):
-    """Multiplies the vecs by the corresponding (damped) inverses of the blocks.
-
-    Args:
-      vecs_and_vars: List of (vector, variable) pairs.
-
-    Returns:
-      A list of (transformed vector, var) pairs in the same order as
-      vecs_and_vars.
-    """
-    return self.multiply_matpower(-1, vecs_and_vars)
-
-  def multiply(self, vecs_and_vars):
-    """Multiplies the vectors by the corresponding (damped) blocks.
-
-    Args:
-      vecs_and_vars: List of (vector, variable) pairs.
-
-    Returns:
-      A list of (transformed vector, var) pairs in the same order as
-      vecs_and_vars.
-    """
-    return self.multiply_matpower(1, vecs_and_vars)
-
-  def multiply_matpower(self, exp, vecs_and_vars):
-    """Multiplies the vecs by the corresponding matrix powers of the blocks.
-
-    Args:
-      exp: A float representing the power to raise the blocks by before
-        multiplying it by the vector.
-      vecs_and_vars: List of (vector, variable) pairs.
-
-    Returns:
-      A list of (transformed vector, var) pairs in the same order as
-      vecs_and_vars.
-    """
-    assert exp in self._exps
-
-    fcn = lambda fb, vec: fb.multiply_matpower(vec, exp)
-    return self._apply_transformation(vecs_and_vars, fcn)
-
-  def multiply_cholesky(self, vecs_and_vars, transpose=False):
-    """Multiplies the vecs by the corresponding Cholesky factors.
-
-    Args:
-      vecs_and_vars: List of (vector, variable) pairs.
-      transpose: Bool. If true the Cholesky factors are transposed before
-        multiplying the vecs. (Default: False)
-
-    Returns:
-      A list of (transformed vector, var) pairs in the same order as
-      vecs_and_vars.
-    """
-    assert self._compute_cholesky
-
-    fcn = lambda fb, vec: fb.multiply_cholesky(vec, transpose=transpose)
-    return self._apply_transformation(vecs_and_vars, fcn)
-
-  def multiply_cholesky_inverse(self, vecs_and_vars, transpose=False):
-    """Mults the vecs by the inverses of the corresponding Cholesky factors.
-
-      Note: if you are using Cholesky inverse multiplication to sample from
-      a matrix-variate Gaussian you will want to multiply by the transpose.
-      Let L be the Cholesky factor of F and observe that
-
-        L^-T * L^-1 = (L * L^T)^-1 = F^-1 .
-
-      Thus we want to multiply by L^-T in order to sample from Gaussian with
-      covariance F^-1.
-
-    Args:
-      vecs_and_vars: List of (vector, variable) pairs.
-      transpose: Bool. If true the Cholesky factor inverses are transposed
-        before multiplying the vecs. (Default: False)
-
-    Returns:
-      A list of (transformed vector, var) pairs in the same order as
-      vecs_and_vars.
-    """
-    assert self._compute_cholesky_inverse
-
-    fcn = lambda fb, vec: fb.multiply_cholesky_inverse(vec, transpose=transpose)
-    return self._apply_transformation(vecs_and_vars, fcn)
-
-  def _instantiate_factors(self):
-    """Instantiates FisherFactors' variables.
-
-    Raises:
-      ValueError: If estimation_mode was improperly specified at construction.
-    """
-    blocks = self.blocks
-    tensors_to_compute_grads = [
-        block.tensors_to_compute_grads() for block in blocks
-    ]
-
-    try:
-      grads_lists = self._gradient_fns[self._estimation_mode](
-          tensors_to_compute_grads)
-    except KeyError:
-      raise ValueError("Unrecognized value {} for estimation_mode.".format(
-          self._estimation_mode))
-
-    for grads_list, block in zip(grads_lists, blocks):
-      block.instantiate_factors(grads_list, self.damping)
-
-  def _check_vars_unmade_and_set_made_flag(self):
-    if self._made_vars:
-      raise Exception("Already made variables.")
-    self._made_vars = True
-
-  def made_vars(self):
-    return self._made_vars
-
-  def _register_matrix_functions(self):
-    for block in self.blocks:
-      for exp in self._exps:
-        block.register_matpower(exp)
-      if self._compute_cholesky:
-        block.register_cholesky()
-      if self._compute_cholesky_inverse:
-        block.register_cholesky_inverse()
-
-  def _finalize_layer_collection(self):
-    self._layers.create_subgraph()
-    self._layers.check_registration(self.variables)
-    self._instantiate_factors()
-    self._register_matrix_functions()
-
-  def create_ops_and_vars_thunks(self, scope=None):
-    """Create thunks that make the ops and vars on demand.
-
-    This function returns 4 lists of thunks: cov_variable_thunks,
-    cov_update_thunks, inv_variable_thunks, and inv_update_thunks.
-
-    The length of each list is the number of factors and the i-th element of
-    each list corresponds to the i-th factor (given by the "factors" property).
-
-    Note that the execution of these thunks must happen in a certain
-    partial order.  The i-th element of cov_variable_thunks must execute
-    before the i-th element of cov_update_thunks (and also the i-th element
-    of inv_update_thunks).  Similarly, the i-th element of inv_variable_thunks
-    must execute before the i-th element of inv_update_thunks.
-
-    TL;DR (oversimplified): Execute the thunks according to the order that
-    they are returned.
-
-    Args:
-      scope: A string or None.  If None it will be set to the name of this
-        estimator (given by the name property). All thunks will execute inside
-        of a variable scope of the given name. (Default: None)
-    Returns:
-      cov_variable_thunks: A list of thunks that make the cov variables.
-      cov_update_thunks: A list of thunks that make the cov update ops.
-      inv_variable_thunks: A list of thunks that make the inv variables.
-      inv_update_thunks: A list of thunks that make the inv update ops.
-    """
-    self._check_vars_unmade_and_set_made_flag()
-
-    self._finalize_layer_collection()
-
-    scope = self.name if scope is None else scope
-
-    cov_variable_thunks = [
-        self._create_cov_variable_thunk(factor, scope)
-        for factor in self.factors
-    ]
-    cov_update_thunks = [
-        self._create_cov_update_thunk(factor, scope) for factor in self.factors
-    ]
-    inv_variable_thunks = [
-        self._create_inv_variable_thunk(factor, scope)
-        for factor in self.factors
-    ]
-    inv_update_thunks = [
-        self._create_inv_update_thunk(factor, scope) for factor in self.factors
-    ]
-
-    return (cov_variable_thunks, cov_update_thunks,
-            inv_variable_thunks, inv_update_thunks)
-
-  def _create_cov_variable_thunk(self, factor, scope):
-    """Constructs a covariance variable thunk for a single FisherFactor."""
-
-    def thunk():
-      with variable_scope.variable_scope(scope):
-        return factor.instantiate_cov_variables()
-
-    return thunk
-
-  def _create_cov_update_thunk(self, factor, scope):
-    """Constructs a covariance update thunk for a single FisherFactor."""
-
-    def thunk():
-      with variable_scope.variable_scope(scope):
-        return factor.make_covariance_update_op(self._cov_ema_decay)
-
-    return thunk
-
-  def _create_inv_variable_thunk(self, factor, scope):
-    """Constructs a inverse variable thunk for a single FisherFactor."""
-
-    def thunk():
-      with variable_scope.variable_scope(scope):
-        return factor.instantiate_inv_variables()
-
-    return thunk
-
-  def _create_inv_update_thunk(self, factor, scope):
-    """Constructs an inverse update thunk for a single FisherFactor."""
-
-    def thunk():
-      with variable_scope.variable_scope(scope):
-        return control_flow_ops.group(factor.make_inverse_update_ops())
-
-    return thunk
-
-  def _get_grads_lists_gradients(self, tensors):
-    # Passing in a list of loss values is better than passing in the sum as
-    # the latter creates unnessesary ops on the default device
-    grads_flat = gradients_impl.gradients(
-        self._layers.eval_losses_on_samples(),
-        nest.flatten(tensors),
-        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
-    grads_all = nest.pack_sequence_as(tensors, grads_flat)
-    return tuple((grad,) for grad in grads_all)
-
-  def _get_grads_lists_empirical(self, tensors):
-    # Passing in a list of loss values is better than passing in the sum as
-    # the latter creates unnessesary ops on the default device
-    grads_flat = gradients_impl.gradients(
-        self._layers.eval_losses(),
-        nest.flatten(tensors),
-        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
-    grads_all = nest.pack_sequence_as(tensors, grads_flat)
-    return tuple((grad,) for grad in grads_all)
-
-  def _get_transformed_random_signs(self):
-    transformed_random_signs = []
-    for loss in self._layers.losses:
-      with tf_ops.colocate_with(self._layers.loss_colocation_ops[loss]):
-        transformed_random_signs.append(
-            loss.multiply_fisher_factor(
-                utils.generate_random_signs(loss.fisher_factor_inner_shape)))
-    return transformed_random_signs
-
-  def _get_grads_lists_curvature_prop(self, tensors):
-    loss_inputs = list(loss.inputs for loss in self._layers.losses)
-    transformed_random_signs = self._get_transformed_random_signs()
-    grads_flat = gradients_impl.gradients(
-        nest.flatten(loss_inputs),
-        nest.flatten(tensors),
-        grad_ys=nest.flatten(transformed_random_signs),
-        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
-    grads_all = nest.pack_sequence_as(tensors, grads_flat)
-    return tuple((grad,) for grad in grads_all)
-
-  def _get_grads_lists_exact(self, tensors):
-    """No docstring required."""
-    # Loop over all coordinates of all losses.
-    grads_all = []
-    for loss in self._layers.losses:
-      with tf_ops.colocate_with(self._layers.loss_colocation_ops[loss]):
-        for index in np.ndindex(*loss.fisher_factor_inner_static_shape[1:]):
-          transformed_one_hot = loss.multiply_fisher_factor_replicated_one_hot(
-              index)
-          grads_flat = gradients_impl.gradients(
-              loss.inputs,
-              nest.flatten(tensors),
-              grad_ys=transformed_one_hot,
-              colocate_gradients_with_ops=self._colocate_gradients_with_ops)
-          grads_all.append(nest.pack_sequence_as(tensors, grads_flat))
-    return zip(*grads_all)
-
-
-class FisherEstimatorRoundRobin(placement.RoundRobinPlacementMixin,
-                                FisherEstimator):
-  """Fisher estimator which provides round robin device placement strategy."""
-  pass
diff --git a/tensorflow/contrib/kfac/python/ops/estimator_lib.py b/tensorflow/contrib/kfac/python/ops/estimator_lib.py
deleted file mode 100644
index 9c9fef471f..0000000000
--- a/tensorflow/contrib/kfac/python/ops/estimator_lib.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Defines the high-level Fisher estimator class."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.estimator import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    'FisherEstimator',
-    'make_fisher_estimator',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
deleted file mode 100644
index 3a5c8eb5f9..0000000000
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ /dev/null
@@ -1,1752 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""FisherBlock definitions.
-
-This library contains classes for estimating blocks in a model's Fisher
-Information matrix. Suppose one has a model that parameterizes a posterior
-distribution over 'y' given 'x' with parameters 'params', p(y | x, params). Its
-Fisher Information matrix is given by,
-
-  $$F(params) = E[ v(x, y, params) v(x, y, params)^T ]$$
-
-where,
-
-  $$v(x, y, params) = (d / d params) log p(y | x, params)$$
-
-and the expectation is taken with respect to the data's distribution for 'x' and
-the model's posterior distribution for 'y',
-
-  x ~ p(x)
-  y ~ p(y | x, params)
-
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import enum  # pylint: disable=g-bad-import-order
-
-import numpy as np
-import six
-
-from tensorflow.contrib.kfac.python.ops import fisher_factors
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.util import nest
-
-# For blocks corresponding to convolutional layers, or any type of block where
-# the parameters can be thought of as being replicated in time or space,
-# we want to adjust the scale of the damping by
-#   damping /= num_replications ** NORMALIZE_DAMPING_POWER
-NORMALIZE_DAMPING_POWER = 1.0
-
-# Methods for adjusting damping for FisherBlocks. See
-# compute_pi_adjusted_damping() for details.
-PI_OFF_NAME = "off"
-PI_TRACENORM_NAME = "tracenorm"
-PI_TYPE = PI_TRACENORM_NAME
-
-
-def set_global_constants(normalize_damping_power=None, pi_type=None):
-  """Sets various global constants used by the classes in this module."""
-  global NORMALIZE_DAMPING_POWER
-  global PI_TYPE
-
-  if normalize_damping_power is not None:
-    NORMALIZE_DAMPING_POWER = normalize_damping_power
-
-  if pi_type is not None:
-    PI_TYPE = pi_type
-
-
-def normalize_damping(damping, num_replications):
-  """Normalize damping after adjusting scale by NORMALIZE_DAMPING_POWER."""
-  if NORMALIZE_DAMPING_POWER:
-    return damping / (num_replications ** NORMALIZE_DAMPING_POWER)
-  return damping
-
-
-def compute_pi_tracenorm(left_cov, right_cov):
-  r"""Computes the scalar constant pi for Tikhonov regularization/damping.
-
-  $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$
-  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
-
-  Args:
-    left_cov: A LinearOperator object. The left Kronecker factor "covariance".
-    right_cov: A LinearOperator object. The right Kronecker factor "covariance".
-
-  Returns:
-    The computed scalar constant pi for these Kronecker Factors (as a Tensor).
-  """
-  # Instead of dividing by the dim of the norm, we multiply by the dim of the
-  # other norm. This works out the same in the ratio.
-  left_norm = left_cov.trace() * int(right_cov.domain_dimension)
-  right_norm = right_cov.trace() * int(left_cov.domain_dimension)
-  return math_ops.sqrt(left_norm / right_norm)
-
-
-def compute_pi_adjusted_damping(left_cov, right_cov, damping):
-
-  if PI_TYPE == PI_TRACENORM_NAME:
-    pi = compute_pi_tracenorm(left_cov, right_cov)
-    return (damping * pi, damping / pi)
-
-  elif PI_TYPE == PI_OFF_NAME:
-    return (damping, damping)
-
-
-class PackagedFunc(object):
-  """A Python thunk with a stable ID.
-
-  Enables stable names for lambdas.
-  """
-
-  def __init__(self, func, func_id):
-    """Initializes PackagedFunc.
-
-    Args:
-      func: a zero-arg Python function.
-      func_id: a hashable, function that produces a hashable, or a list/tuple
-        thereof.
-    """
-    self._func = func
-    func_id = func_id if isinstance(func_id, (tuple, list)) else (func_id,)
-    self._func_id = func_id
-
-  def __call__(self):
-    return self._func()
-
-  @property
-  def func_id(self):
-    """A hashable identifier for this function."""
-    return tuple(elt() if callable(elt) else elt for elt in self._func_id)
-
-
-def _package_func(func, func_id):
-  return PackagedFunc(func, func_id)
-
-
-@six.add_metaclass(abc.ABCMeta)
-class FisherBlock(object):
-  """Abstract base class for objects modeling approximate Fisher matrix blocks.
-
-  Subclasses must implement register_matpower, multiply_matpower,
-  instantiate_factors, tensors_to_compute_grads, and num_registered_towers
-  methods.
-  """
-
-  def __init__(self, layer_collection):
-    self._layer_collection = layer_collection
-
-  @abc.abstractmethod
-  def instantiate_factors(self, grads_list, damping):
-    """Creates and registers the component factors of this Fisher block.
-
-    Args:
-      grads_list: A list gradients (each a Tensor or tuple of Tensors) with
-          respect to the tensors returned by tensors_to_compute_grads() that
-          are to be used to estimate the block.
-      damping: The damping factor (float or Tensor).
-    """
-    pass
-
-  @abc.abstractmethod
-  def register_matpower(self, exp):
-    """Registers a matrix power to be computed by the block.
-
-    Args:
-      exp: A float representing the power to raise the block by.
-    """
-    pass
-
-  @abc.abstractmethod
-  def register_cholesky(self):
-    """Registers a Cholesky factor to be computed by the block."""
-    pass
-
-  @abc.abstractmethod
-  def register_cholesky_inverse(self):
-    """Registers an inverse Cholesky factor to be computed by the block."""
-    pass
-
-  def register_inverse(self):
-    """Registers a matrix inverse to be computed by the block."""
-    self.register_matpower(-1)
-
-  @abc.abstractmethod
-  def multiply_matpower(self, vector, exp):
-    """Multiplies the vector by the (damped) matrix-power of the block.
-
-    Args:
-      vector: The vector (a Tensor or tuple of Tensors) to be multiplied.
-      exp: A float representing the power to raise the block by before
-        multiplying it by the vector.
-
-    Returns:
-      The vector left-multiplied by the (damped) matrix-power of the block.
-    """
-    pass
-
-  def multiply_inverse(self, vector):
-    """Multiplies the vector by the (damped) inverse of the block.
-
-    Args:
-      vector: The vector (a Tensor or tuple of Tensors) to be multiplied.
-
-    Returns:
-      The vector left-multiplied by the (damped) inverse of the block.
-    """
-    return self.multiply_matpower(vector, -1)
-
-  def multiply(self, vector):
-    """Multiplies the vector by the (damped) block.
-
-    Args:
-      vector: The vector (a Tensor or tuple of Tensors) to be multiplied.
-
-    Returns:
-      The vector left-multiplied by the (damped) block.
-    """
-    return self.multiply_matpower(vector, 1)
-
-  @abc.abstractmethod
-  def multiply_cholesky(self, vector, transpose=False):
-    """Multiplies the vector by the (damped) Cholesky-factor of the block.
-
-    Args:
-      vector: The vector (a Tensor or tuple of Tensors) to be multiplied.
-      transpose: Bool. If true the Cholesky factor is transposed before
-        multiplying the vector. (Default: False)
-
-    Returns:
-      The vector left-multiplied by the (damped) Cholesky-factor of the block.
-    """
-    pass
-
-  @abc.abstractmethod
-  def multiply_cholesky_inverse(self, vector, transpose=False):
-    """Multiplies vector by the (damped) inverse Cholesky-factor of the block.
-
-    Args:
-      vector: The vector (a Tensor or tuple of Tensors) to be multiplied.
-      transpose: Bool. If true the Cholesky factor inverse is transposed
-        before multiplying the vector. (Default: False)
-    Returns:
-      Vector left-multiplied by (damped) inverse Cholesky-factor of the block.
-    """
-    pass
-
-  @abc.abstractmethod
-  def tensors_to_compute_grads(self):
-    """Returns the Tensor(s) with respect to which this FisherBlock needs grads.
-    """
-    pass
-
-  @abc.abstractproperty
-  def num_registered_towers(self):
-    """Number of towers registered for this FisherBlock.
-
-    Typically equal to the number of towers in a multi-tower setup.
-    """
-    pass
-
-
-class FullFB(FisherBlock):
-  """FisherBlock using a full matrix estimate (no approximations).
-
-  FullFB uses a full matrix estimate (no approximations), and should only ever
-  be used for very low dimensional parameters.
-
-  Note that this uses the naive "square the sum estimator", and so is applicable
-  to any type of parameter in principle, but has very high variance.
-  """
-
-  def __init__(self, layer_collection, params):
-    """Creates a FullFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      params: The parameters of this layer (Tensor or tuple of Tensors).
-    """
-    self._batch_sizes = []
-    self._params = params
-
-    super(FullFB, self).__init__(layer_collection)
-
-  def instantiate_factors(self, grads_list, damping):
-    self._damping_func = _package_func(lambda: damping, (damping,))
-
-    self._factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullFactor, (grads_list, self._batch_size))
-
-  def register_matpower(self, exp):
-    self._factor.register_matpower(exp, self._damping_func)
-
-  def register_cholesky(self):
-    self._factor.register_cholesky(self._damping_func)
-
-  def register_cholesky_inverse(self):
-    self._factor.register_cholesky_inverse(self._damping_func)
-
-  def _multiply_matrix(self, matrix, vector, transpose=False):
-    vector_flat = utils.tensors_to_column(vector)
-    out_flat = matrix.matmul(vector_flat, adjoint=transpose)
-    return utils.column_to_tensors(vector, out_flat)
-
-  def multiply_matpower(self, vector, exp):
-    matrix = self._factor.get_matpower(exp, self._damping_func)
-    return self._multiply_matrix(matrix, vector)
-
-  def multiply_cholesky(self, vector, transpose=False):
-    matrix = self._factor.get_cholesky(self._damping_func)
-    return self._multiply_matrix(matrix, vector, transpose=transpose)
-
-  def multiply_cholesky_inverse(self, vector, transpose=False):
-    matrix = self._factor.get_cholesky_inverse(self._damping_func)
-    return self._multiply_matrix(matrix, vector, transpose=transpose)
-
-  def full_fisher_block(self):
-    """Explicitly constructs the full Fisher block."""
-    return self._factor.get_cov_as_linear_operator().to_dense()
-
-  def tensors_to_compute_grads(self):
-    return self._params
-
-  def register_additional_tower(self, batch_size):
-    """Register an additional tower.
-
-    Args:
-      batch_size: The batch size, used in the covariance estimator.
-    """
-    self._batch_sizes.append(batch_size)
-
-  @property
-  def num_registered_towers(self):
-    return len(self._batch_sizes)
-
-  @property
-  def _batch_size(self):
-    return math_ops.reduce_sum(self._batch_sizes)
-
-
-@six.add_metaclass(abc.ABCMeta)
-class DiagonalFB(FisherBlock):
-  """A base class for FisherBlocks that use diagonal approximations."""
-
-  def register_matpower(self, exp):
-    # Not needed for this.  Matrix powers are computed on demand in the
-    # diagonal case
-    pass
-
-  def register_cholesky(self):
-    # Not needed for this.  Cholesky's are computed on demand in the
-    # diagonal case
-    pass
-
-  def register_cholesky_inverse(self):
-    # Not needed for this.  Cholesky inverses's are computed on demand in the
-    # diagonal case
-    pass
-
-  def _multiply_matrix(self, matrix, vector):
-    vector_flat = utils.tensors_to_column(vector)
-    out_flat = matrix.matmul(vector_flat)
-    return utils.column_to_tensors(vector, out_flat)
-
-  def multiply_matpower(self, vector, exp):
-    matrix = self._factor.get_matpower(exp, self._damping_func)
-    return self._multiply_matrix(matrix, vector)
-
-  def multiply_cholesky(self, vector, transpose=False):
-    matrix = self._factor.get_cholesky(self._damping_func)
-    return self._multiply_matrix(matrix, vector)
-
-  def multiply_cholesky_inverse(self, vector, transpose=False):
-    matrix = self._factor.get_cholesky_inverse(self._damping_func)
-    return self._multiply_matrix(matrix, vector)
-
-  def full_fisher_block(self):
-    return self._factor.get_cov_as_linear_operator().to_dense()
-
-
-class NaiveDiagonalFB(DiagonalFB):
-  """FisherBlock using a diagonal matrix approximation.
-
-  This type of approximation is generically applicable but quite primitive.
-
-  Note that this uses the naive "square the sum estimator", and so is applicable
-  to any type of parameter in principle, but has very high variance.
-  """
-
-  def __init__(self, layer_collection, params):
-    """Creates a NaiveDiagonalFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      params: The parameters of this layer (Tensor or tuple of Tensors).
-    """
-    self._params = params
-    self._batch_sizes = []
-
-    super(NaiveDiagonalFB, self).__init__(layer_collection)
-
-  def instantiate_factors(self, grads_list, damping):
-    self._damping_func = _package_func(lambda: damping, (damping,))
-
-    self._factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.NaiveDiagonalFactor, (grads_list, self._batch_size))
-
-  def tensors_to_compute_grads(self):
-    return self._params
-
-  def register_additional_tower(self, batch_size):
-    """Register an additional tower.
-
-    Args:
-      batch_size: The batch size, used in the covariance estimator.
-    """
-    self._batch_sizes.append(batch_size)
-
-  @property
-  def num_registered_towers(self):
-    return len(self._batch_sizes)
-
-  @property
-  def _batch_size(self):
-    return math_ops.reduce_sum(self._batch_sizes)
-
-
-class InputOutputMultiTower(object):
-  """Mix-in class for blocks with inputs & outputs and multiple mini-batches."""
-
-  def __init__(self, *args, **kwargs):
-    self.__inputs = []
-    self.__outputs = []
-    super(InputOutputMultiTower, self).__init__(*args, **kwargs)
-
-  def _process_data(self, grads_list):
-    """Process data into the format used by the factors.
-
-    This function takes inputs and grads_lists data and processes it into
-    one of the formats expected by the FisherFactor classes (depending on
-    the value of the global configuration variable TOWER_STRATEGY).
-
-    The initial format of self._inputs is expected to be a list of Tensors
-    over towers. Similarly grads_lists is expected to be a list over sources
-    of such lists.
-
-    If TOWER_STRATEGY is "concat", 'inputs' becomes a tuple containing a single
-    tensor (represented as a PartitionedTensor object) equal to the
-    concatenation (across towers) of all of the elements of self._inputs. And
-    similarly grads_list is formatted into a tuple (over sources) of such
-    tensors (also represented as PartitionedTensors).
-
-    If TOWER_STRATEGY is "separate", formatting of inputs and grads_list
-    remains unchanged from the initial format (although possibly converting
-    from lists into tuples).
-
-    Args:
-      grads_list: grads_list in its initial format (see above).
-
-    Returns:
-      inputs: self._inputs transformed into the appropriate format (see
-        above).
-      grads_list: grads_list transformed into the appropriate format (see
-        above).
-
-    Raises:
-      ValueError: if TOWER_STRATEGY is not one of "separate" or "concat".
-    """
-    inputs = self._inputs
-    # inputs is a list over towers of Tensors
-    # grads_list is a list of list with the first index being sources and the
-    # second being towers.
-    if fisher_factors.TOWER_STRATEGY == "concat":
-      # Merge towers together into a PartitionedTensor. We package it in
-      # a singleton tuple since the factors will expect a list over towers
-      inputs = (utils.PartitionedTensor(inputs),)
-      # Do the same for grads_list but preserve leading sources dimension
-      grads_list = tuple((utils.PartitionedTensor(grads),)
-                         for grads in grads_list)
-    elif fisher_factors.TOWER_STRATEGY == "separate":
-      inputs = tuple(inputs)
-      grads_list = tuple(grads_list)
-
-    else:
-      raise ValueError("Global config variable TOWER_STRATEGY must be one of "
-                       "'concat' or 'separate'.")
-
-    return inputs, grads_list
-
-  def tensors_to_compute_grads(self):
-    """Tensors to compute derivative of loss with respect to."""
-    return tuple(self._outputs)
-
-  def register_additional_tower(self, inputs, outputs):
-    self._inputs.append(inputs)
-    self._outputs.append(outputs)
-
-  @property
-  def num_registered_towers(self):
-    result = len(self._inputs)
-    assert result == len(self._outputs)
-    return result
-
-  @property
-  def _inputs(self):
-    return self.__inputs
-
-  @property
-  def _outputs(self):
-    return self.__outputs
-
-
-class FullyConnectedDiagonalFB(InputOutputMultiTower, DiagonalFB):
-  """FisherBlock for fully-connected (dense) layers using a diagonal approx.
-
-  Estimates the Fisher Information matrix's diagonal entries for a fully
-  connected layer. Unlike NaiveDiagonalFB this uses the low-variance "sum of
-  squares" estimator.
-
-  Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
-  into it. We are interested in Fisher(params)[i, i]. This is,
-
-    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]$$
-
-  Consider fully connected layer in this model with (unshared) weight matrix
-  'w'. For an example 'x' that produces layer inputs 'a' and output
-  preactivations 's',
-
-    $$v(x, y, w) = vec( a (d loss / d s)^T )$$
-
-  This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding
-  to the layer's parameters 'w'.
-  """
-
-  def __init__(self, layer_collection, has_bias=False):
-    """Creates a FullyConnectedDiagonalFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      has_bias: Whether the component Kronecker factors have an additive bias.
-          (Default: False)
-    """
-    self._has_bias = has_bias
-
-    super(FullyConnectedDiagonalFB, self).__init__(layer_collection)
-
-  def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._process_data(grads_list)
-
-    self._factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedDiagonalFactor,
-        (inputs, grads_list, self._has_bias))
-
-    self._damping_func = _package_func(lambda: damping, (damping,))
-
-
-class ConvDiagonalFB(InputOutputMultiTower, DiagonalFB):
-  """FisherBlock for 2-D convolutional layers using a diagonal approx.
-
-  Estimates the Fisher Information matrix's diagonal entries for a convolutional
-  layer. Unlike NaiveDiagonalFB this uses the low-variance "sum of squares"
-  estimator.
-
-  Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
-  into it. We are interested in Fisher(params)[i, i]. This is,
-
-    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]$$
-
-  Consider a convoluational layer in this model with (unshared) filter matrix
-  'w'. For an example image 'x' that produces layer inputs 'a' and output
-  preactivations 's',
-
-    $$v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )$$
-
-  where 'loc' is a single (x, y) location in an image.
-
-  This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding
-  to the layer's parameters 'w'.
-  """
-
-  def __init__(self,
-               layer_collection,
-               params,
-               strides,
-               padding,
-               data_format=None,
-               dilations=None):
-    """Creates a ConvDiagonalFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      params: The parameters (Tensor or tuple of Tensors) of this layer. If
-        kernel alone, a Tensor of shape [kernel_height, kernel_width,
-        in_channels, out_channels]. If kernel and bias, a tuple of 2 elements
-        containing the previous and a Tensor of shape [out_channels].
-      strides: The stride size in this layer (1-D Tensor of length 4).
-      padding: The padding in this layer (e.g. "SAME").
-      data_format: str or None. Format of input data.
-      dilations: List of 4 ints or None. Rate for dilation along all dimensions.
-
-    Raises:
-      ValueError: if strides is not length-4.
-      ValueError: if dilations is not length-4.
-      ValueError: if channel is not last dimension.
-    """
-    if len(strides) != 4:
-      raise ValueError("strides must contain 4 numbers.")
-
-    if dilations is None:
-      dilations = [1, 1, 1, 1]
-
-    if len(dilations) != 4:
-      raise ValueError("dilations must contain 4 numbers.")
-
-    if not utils.is_data_format_channel_last(data_format):
-      raise ValueError("data_format must be channels-last.")
-
-    self._strides = maybe_tuple(strides)
-    self._padding = padding
-    self._data_format = data_format
-    self._dilations = maybe_tuple(dilations)
-    self._has_bias = isinstance(params, (tuple, list))
-
-    fltr = params[0] if self._has_bias else params
-    self._filter_shape = tuple(fltr.shape.as_list())
-
-    if len(self._filter_shape) != 4:
-      raise ValueError(
-          "Convolution filter must be of shape"
-          " [filter_height, filter_width, in_channels, out_channels].")
-
-    super(ConvDiagonalFB, self).__init__(layer_collection)
-
-  def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._process_data(grads_list)
-
-    # Infer number of locations upon which convolution is applied.
-    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
-                                             self._strides)
-
-    self._factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.ConvDiagonalFactor,
-        (inputs, grads_list, self._filter_shape, self._strides, self._padding,
-         self._data_format, self._dilations, self._has_bias))
-
-    def damping_func():
-      return self._num_locations * normalize_damping(damping,
-                                                     self._num_locations)
-
-    damping_id = (self._num_locations, "mult", "normalize_damping", damping,
-                  self._num_locations)
-    self._damping_func = _package_func(damping_func, damping_id)
-
-
-class KroneckerProductFB(FisherBlock):
-  """A base class for blocks with separate input and output Kronecker factors.
-
-  The Fisher block is approximated as a Kronecker product of the input and
-  output factors.
-  """
-
-  def _setup_damping(self, damping, normalization=None):
-    """Makes functions that compute the damping values for both factors."""
-    def compute_damping():
-      if normalization is not None:
-        maybe_normalized_damping = normalize_damping(damping, normalization)
-      else:
-        maybe_normalized_damping = damping
-
-      return compute_pi_adjusted_damping(
-          self._input_factor.get_cov_as_linear_operator(),
-          self._output_factor.get_cov_as_linear_operator(),
-          maybe_normalized_damping**0.5)
-
-    if normalization is not None:
-      damping_id = ("compute_pi_adjusted_damping",
-                    "cov", self._input_factor.name,
-                    "cov", self._output_factor.name,
-                    "normalize_damping", damping, normalization, "power", 0.5)
-    else:
-      damping_id = ("compute_pi_adjusted_damping",
-                    "cov", self._input_factor.name,
-                    "cov", self._output_factor.name,
-                    damping, "power", 0.5)
-
-    self._input_damping_func = _package_func(lambda: compute_damping()[0],
-                                             damping_id + ("ref", 0))
-    self._output_damping_func = _package_func(lambda: compute_damping()[1],
-                                              damping_id + ("ref", 1))
-
-  def register_matpower(self, exp):
-    self._input_factor.register_matpower(exp, self._input_damping_func)
-    self._output_factor.register_matpower(exp, self._output_damping_func)
-
-  def register_cholesky(self):
-    self._input_factor.register_cholesky(self._input_damping_func)
-    self._output_factor.register_cholesky(self._output_damping_func)
-
-  def register_cholesky_inverse(self):
-    self._input_factor.register_cholesky_inverse(self._input_damping_func)
-    self._output_factor.register_cholesky_inverse(self._output_damping_func)
-
-  @property
-  def _renorm_coeff(self):
-    """Kronecker factor multiplier coefficient.
-
-    If this FisherBlock is represented as 'FB = c * kron(left, right)', then
-    this is 'c'.
-
-    Returns:
-      0-D Tensor.
-    """
-    return 1.0
-
-  def _multiply_factored_matrix(self, left_factor, right_factor, vector,
-                                extra_scale=1.0, transpose_left=False,
-                                transpose_right=False):
-    reshaped_vector = utils.layer_params_to_mat2d(vector)
-    reshaped_out = right_factor.matmul_right(reshaped_vector,
-                                             adjoint=transpose_right)
-    reshaped_out = left_factor.matmul(reshaped_out,
-                                      adjoint=transpose_left)
-    if extra_scale != 1.0:
-      reshaped_out *= math_ops.cast(extra_scale, dtype=reshaped_out.dtype)
-    return utils.mat2d_to_layer_params(vector, reshaped_out)
-
-  def multiply_matpower(self, vector, exp):
-    left_factor = self._input_factor.get_matpower(
-        exp, self._input_damping_func)
-    right_factor = self._output_factor.get_matpower(
-        exp, self._output_damping_func)
-    extra_scale = float(self._renorm_coeff)**exp
-    return self._multiply_factored_matrix(left_factor, right_factor, vector,
-                                          extra_scale=extra_scale)
-
-  def multiply_cholesky(self, vector, transpose=False):
-    left_factor = self._input_factor.get_cholesky(self._input_damping_func)
-    right_factor = self._output_factor.get_cholesky(self._output_damping_func)
-    extra_scale = float(self._renorm_coeff)**0.5
-    return self._multiply_factored_matrix(left_factor, right_factor, vector,
-                                          extra_scale=extra_scale,
-                                          transpose_left=transpose,
-                                          transpose_right=not transpose)
-
-  def multiply_cholesky_inverse(self, vector, transpose=False):
-    left_factor = self._input_factor.get_cholesky_inverse(
-        self._input_damping_func)
-    right_factor = self._output_factor.get_cholesky_inverse(
-        self._output_damping_func)
-    extra_scale = float(self._renorm_coeff)**-0.5
-    return self._multiply_factored_matrix(left_factor, right_factor, vector,
-                                          extra_scale=extra_scale,
-                                          transpose_left=transpose,
-                                          transpose_right=not transpose)
-
-  def full_fisher_block(self):
-    """Explicitly constructs the full Fisher block.
-
-    Used for testing purposes. (In general, the result may be very large.)
-
-    Returns:
-      The full Fisher block.
-    """
-    left_factor = self._input_factor.get_cov_as_linear_operator().to_dense()
-    right_factor = self._output_factor.get_cov_as_linear_operator().to_dense()
-    return self._renorm_coeff * utils.kronecker_product(left_factor,
-                                                        right_factor)
-
-
-class EmbeddingKFACFB(InputOutputMultiTower, KroneckerProductFB):
-  """K-FAC FisherBlock for embedding layers.
-
-  This FisherBlock is similar to FullyConnectedKFACBasicFB, except that its
-  input factor is approximated by a diagonal matrix. In the case that each
-  example references exactly one embedding, this approximation is exact.
-
-  Does not support bias parameters.
-  """
-
-  def __init__(self, layer_collection, vocab_size):
-    """Creates a EmbeddingKFACFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      vocab_size: int. Size of vocabulary for this embedding layer.
-    """
-    self._vocab_size = vocab_size
-
-    super(EmbeddingKFACFB, self).__init__(layer_collection)
-
-  def instantiate_factors(self, grads_list, damping):
-    """Instantiate Kronecker Factors for this FisherBlock.
-
-    Args:
-      grads_list: List of list of Tensors. grads_list[i][j] is the
-        gradient of the loss with respect to 'outputs' from source 'i' and
-        tower 'j'. Each Tensor has shape [tower_minibatch_size, output_size].
-      damping: 0-D Tensor or float. 'damping' * identity is approximately added
-        to this FisherBlock's Fisher approximation.
-    """
-    inputs, grads_list = self._process_data(grads_list)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.EmbeddingInputKroneckerFactor,
-        (inputs, self._vocab_size))
-    self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedKroneckerFactor, (grads_list,))
-    self._setup_damping(damping)
-
-
-class FullyConnectedKFACBasicFB(InputOutputMultiTower, KroneckerProductFB):
-  """K-FAC FisherBlock for fully-connected (dense) layers.
-
-  This uses the Kronecker-factorized approximation from the original
-  K-FAC paper (https://arxiv.org/abs/1503.05671)
-  """
-
-  def __init__(self, layer_collection, has_bias=False):
-    """Creates a FullyConnectedKFACBasicFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      has_bias: Whether the component Kronecker factors have an additive bias.
-          (Default: False)
-    """
-    self._has_bias = has_bias
-
-    super(FullyConnectedKFACBasicFB, self).__init__(layer_collection)
-
-  def instantiate_factors(self, grads_list, damping):
-    """Instantiate Kronecker Factors for this FisherBlock.
-
-    Args:
-      grads_list: List of list of Tensors. grads_list[i][j] is the
-        gradient of the loss with respect to 'outputs' from source 'i' and
-        tower 'j'. Each Tensor has shape [tower_minibatch_size, output_size].
-      damping: 0-D Tensor or float. 'damping' * identity is approximately added
-        to this FisherBlock's Fisher approximation.
-    """
-    inputs, grads_list = self._process_data(grads_list)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedKroneckerFactor,
-        ((inputs,), self._has_bias))
-    self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedKroneckerFactor,
-        (grads_list,))
-    self._setup_damping(damping)
-
-
-class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB):
-  r"""FisherBlock for convolutional layers using the basic KFC approx.
-
-  Estimates the Fisher Information matrix's blog for a convolutional
-  layer.
-
-  Consider a convoluational layer in this model with (unshared) filter matrix
-  'w'. For a minibatch that produces inputs 'a' and output preactivations 's',
-  this FisherBlock estimates,
-
-    $$F(w) = \#locations * kronecker(E[flat(a) flat(a)^T],
-                                  E[flat(ds) flat(ds)^T])$$
-
-  where
-
-    $$ds = (d / ds) log p(y | x, w)$$
-    #locations = number of (x, y) locations where 'w' is applied.
-
-  where the expectation is taken over all examples and locations and flat()
-  concatenates an array's leading dimensions.
-
-  See equation 23 in https://arxiv.org/abs/1602.01407 for details.
-  """
-
-  def __init__(self,
-               layer_collection,
-               params,
-               padding,
-               strides=None,
-               dilation_rate=None,
-               data_format=None,
-               extract_patches_fn=None):
-    """Creates a ConvKFCBasicFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      params: The parameters (Tensor or tuple of Tensors) of this layer. If
-        kernel alone, a Tensor of shape [..spatial_filter_shape..,
-        in_channels, out_channels]. If kernel and bias, a tuple of 2 elements
-        containing the previous and a Tensor of shape [out_channels].
-      padding: str. Padding method.
-      strides: List of ints or None. Contains [..spatial_filter_strides..] if
-        'extract_patches_fn' is compatible with tf.nn.convolution(), else
-        [1, ..spatial_filter_strides, 1].
-      dilation_rate: List of ints or None. Rate for dilation along each spatial
-        dimension if 'extract_patches_fn' is compatible with
-        tf.nn.convolution(), else [1, ..spatial_dilation_rates.., 1].
-      data_format: str or None. Format of input data.
-      extract_patches_fn: str or None. Name of function that extracts image
-        patches. One of "extract_convolution_patches", "extract_image_patches",
-        "extract_pointwise_conv2d_patches".
-    """
-    self._padding = padding
-    self._strides = maybe_tuple(strides)
-    self._dilation_rate = maybe_tuple(dilation_rate)
-    self._data_format = data_format
-    self._extract_patches_fn = extract_patches_fn
-    self._has_bias = isinstance(params, (tuple, list))
-
-    fltr = params[0] if self._has_bias else params
-    self._filter_shape = tuple(fltr.shape.as_list())
-
-    super(ConvKFCBasicFB, self).__init__(layer_collection)
-
-  def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._process_data(grads_list)
-
-    # Infer number of locations upon which convolution is applied.
-    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
-                                             self._strides)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.ConvInputKroneckerFactor,
-        (inputs, self._filter_shape, self._padding, self._strides,
-         self._dilation_rate, self._data_format, self._extract_patches_fn,
-         self._has_bias))
-    self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.ConvOutputKroneckerFactor, (grads_list,))
-
-    self._setup_damping(damping, normalization=self._num_locations)
-
-  @property
-  def _renorm_coeff(self):
-    return self._num_locations
-
-
-class DepthwiseConvDiagonalFB(ConvDiagonalFB):
-  """FisherBlock for depthwise_conv2d().
-
-  Equivalent to ConvDiagonalFB applied to each input channel in isolation.
-  """
-
-  def __init__(self,
-               layer_collection,
-               params,
-               strides,
-               padding,
-               rate=None,
-               data_format=None):
-    """Creates a DepthwiseConvKFCBasicFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      params: Tensor of shape [filter_height, filter_width, in_channels,
-        channel_multiplier].
-      strides: List of 4 ints. Strides along all dimensions.
-      padding: str. Padding method.
-      rate: List of 4 ints or None. Rate for dilation along all dimensions.
-      data_format: str or None. Format of input data.
-
-    Raises:
-      NotImplementedError: If parameters contains bias.
-      ValueError: If filter is not 4-D.
-      ValueError: If strides is not length-4.
-      ValueError: If rates is not length-2.
-      ValueError: If channels are not last dimension.
-    """
-    if isinstance(params, (tuple, list)):
-      raise NotImplementedError("Bias not yet supported.")
-
-    if params.shape.ndims != 4:
-      raise ValueError("Filter must be 4-D.")
-
-    if len(strides) != 4:
-      raise ValueError("strides must account for 4 dimensions.")
-
-    if rate is not None:
-      if len(rate) != 2:
-        raise ValueError("rate must only account for spatial dimensions.")
-      rate = [1, rate[0], rate[1], 1]  # conv2d expects 4-element rate.
-
-    if not utils.is_data_format_channel_last(data_format):
-      raise ValueError("data_format must be channels-last.")
-
-    super(DepthwiseConvDiagonalFB, self).__init__(
-        layer_collection=layer_collection,
-        params=params,
-        strides=strides,
-        padding=padding,
-        dilations=rate,
-        data_format=data_format)
-
-    # This is a hack to overwrite the same setting in ConvKFCBasicFB.__init__().
-    filter_height, filter_width, in_channels, channel_multiplier = (
-        params.shape.as_list())
-    self._filter_shape = (filter_height, filter_width, in_channels,
-                          in_channels * channel_multiplier)
-
-  def _multiply_matrix(self, matrix, vector):
-    conv2d_vector = depthwise_conv2d_filter_to_conv2d_filter(vector)
-    conv2d_result = super(
-        DepthwiseConvDiagonalFB, self)._multiply_matrix(matrix, conv2d_vector)
-    return conv2d_filter_to_depthwise_conv2d_filter(conv2d_result)
-
-
-class DepthwiseConvKFCBasicFB(ConvKFCBasicFB):
-  """FisherBlock for depthwise_conv2d().
-
-  Equivalent to ConvKFCBasicFB applied to each input channel in isolation.
-  """
-
-  def __init__(self,
-               layer_collection,
-               params,
-               strides,
-               padding,
-               rate=None,
-               data_format=None):
-    """Creates a DepthwiseConvKFCBasicFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      params: Tensor of shape [filter_height, filter_width, in_channels,
-        channel_multiplier].
-      strides: List of 4 ints. Strides along all dimensions.
-      padding: str. Padding method.
-      rate: List of 4 ints or None. Rate for dilation along all dimensions.
-      data_format: str or None. Format of input data.
-
-    Raises:
-      NotImplementedError: If parameters contains bias.
-      ValueError: If filter is not 4-D.
-      ValueError: If strides is not length-4.
-      ValueError: If rates is not length-2.
-      ValueError: If channels are not last dimension.
-    """
-    if isinstance(params, (tuple, list)):
-      raise NotImplementedError("Bias not yet supported.")
-
-    if params.shape.ndims != 4:
-      raise ValueError("Filter must be 4-D.")
-
-    if len(strides) != 4:
-      raise ValueError("strides must account for 4 dimensions.")
-
-    if rate is not None:
-      if len(rate) != 2:
-        raise ValueError("rate must only account for spatial dimensions.")
-      rate = [1, rate[0], rate[1], 1]  # conv2d expects 4-element rate.
-
-    if not utils.is_data_format_channel_last(data_format):
-      raise ValueError("data_format must be channels-last.")
-
-    super(DepthwiseConvKFCBasicFB, self).__init__(
-        layer_collection=layer_collection,
-        params=params,
-        padding=padding,
-        strides=strides,
-        dilation_rate=rate,
-        data_format=data_format,
-        extract_patches_fn="extract_image_patches")
-
-    # This is a hack to overwrite the same setting in ConvKFCBasicFB.__init__().
-    filter_height, filter_width, in_channels, channel_multiplier = (
-        params.shape.as_list())
-    self._filter_shape = (filter_height, filter_width, in_channels,
-                          in_channels * channel_multiplier)
-
-  def _multiply_factored_matrix(self, left_factor, right_factor, vector,
-                                extra_scale=1.0, transpose_left=False,
-                                transpose_right=False):
-    conv2d_vector = depthwise_conv2d_filter_to_conv2d_filter(vector)
-    conv2d_result = super(
-        DepthwiseConvKFCBasicFB, self)._multiply_factored_matrix(
-            left_factor, right_factor, conv2d_vector, extra_scale=extra_scale,
-            transpose_left=transpose_left, transpose_right=transpose_right)
-    return conv2d_filter_to_depthwise_conv2d_filter(conv2d_result)
-
-
-def depthwise_conv2d_filter_to_conv2d_filter(filter, name=None):  # pylint: disable=redefined-builtin
-  """Converts a convolution filter for use with conv2d.
-
-  Transforms a filter for use with tf.nn.depthwise_conv2d() to one that's
-  compatible with tf.nn.conv2d().
-
-  Args:
-    filter: Tensor of shape [height, width, in_channels, channel_multiplier].
-    name: None or str. Name of Op.
-
-  Returns:
-    Tensor of shape [height, width, in_channels, out_channels].
-
-  """
-  with ops.name_scope(name, "depthwise_conv2d_filter_to_conv2d_filter",
-                      [filter]):
-    filter = ops.convert_to_tensor(filter)
-    filter_height, filter_width, in_channels, channel_multiplier = (
-        filter.shape.as_list())
-
-    results = []
-    for i in range(in_channels):
-      # Slice out one in_channel's filter. Insert zeros around it to force it
-      # to affect that channel and that channel alone.
-      elements = []
-      if i > 0:
-        elements.append(
-            array_ops.zeros(
-                [filter_height, filter_width, i, channel_multiplier]))
-      elements.append(filter[:, :, i:(i + 1), :])
-      if i + 1 < in_channels:
-        elements.append(
-            array_ops.zeros([
-                filter_height, filter_width, in_channels - (i + 1),
-                channel_multiplier
-            ]))
-
-      # Concat along in_channel.
-      results.append(
-          array_ops.concat(elements, axis=-2, name="in_channel_%d" % i))
-
-    # Concat along out_channel.
-    return array_ops.concat(results, axis=-1, name="out_channel")
-
-
-def conv2d_filter_to_depthwise_conv2d_filter(filter, name=None):  # pylint: disable=redefined-builtin
-  """Converts a convolution filter for use with depthwise_conv2d.
-
-  Transforms a filter for use with tf.nn.conv2d() to one that's
-  compatible with tf.nn.depthwise_conv2d(). Ignores all filters but those along
-  the diagonal.
-
-  Args:
-    filter: Tensor of shape [height, width, in_channels, out_channels].
-    name: None or str. Name of Op.
-
-  Returns:
-    Tensor of shape,
-      [height, width, in_channels, channel_multiplier]
-
-  Raises:
-    ValueError: if out_channels is not evenly divisible by in_channels.
-  """
-  with ops.name_scope(name, "conv2d_filter_to_depthwise_conv2d_filter",
-                      [filter]):
-    filter = ops.convert_to_tensor(filter)
-    filter_height, filter_width, in_channels, out_channels = (
-        filter.shape.as_list())
-
-    if out_channels % in_channels != 0:
-      raise ValueError("out_channels must be evenly divisible by in_channels.")
-    channel_multiplier = out_channels // in_channels
-
-    results = []
-    filter = array_ops.reshape(filter, [
-        filter_height, filter_width, in_channels, in_channels,
-        channel_multiplier
-    ])
-    for i in range(in_channels):
-      # Slice out output corresponding to the correct filter.
-      filter_slice = array_ops.reshape(
-          filter[:, :, i, i, :],
-          [filter_height, filter_width, 1, channel_multiplier])
-      results.append(filter_slice)
-
-    # Concat along out_channel.
-    return array_ops.concat(results, axis=-2, name="in_channels")
-
-
-def maybe_tuple(obj):
-  if not isinstance(obj, list):
-    return obj
-  return tuple(obj)
-
-
-def num_conv_locations(input_shape, strides):
-  """Returns the number of spatial locations a 2D Conv kernel is applied to.
-
-  Args:
-    input_shape: List of ints representing shape of inputs to
-      tf.nn.convolution().
-    strides: List of ints representing strides along spatial dimensions as
-      passed in to tf.nn.convolution().
-
-  Returns:
-    A scalar |T| denoting the number of spatial locations for the Conv layer.
-  """
-  spatial_input_locations = np.prod(input_shape[1:-1])
-
-  if strides is None:
-    spatial_strides_divisor = 1
-  else:
-    spatial_strides_divisor = np.prod(strides)
-
-  return spatial_input_locations // spatial_strides_divisor
-
-
-class InputOutputMultiTowerMultiUse(InputOutputMultiTower):
-  """Adds methods for multi-use/time-step case to InputOutputMultiTower."""
-
-  def __init__(self, num_uses=None, *args, **kwargs):
-    self._num_uses = num_uses
-    super(InputOutputMultiTowerMultiUse, self).__init__(*args, **kwargs)
-
-  def _process_data(self, grads_list):
-    """Process temporal/multi-use data into the format used by the factors.
-
-    This function takes inputs and grads_lists data and processes it into
-    one of the formats expected by the FisherFactor classes (depending on
-    the value of the global configuration variable TOWER_STRATEGY).
-
-    It accepts the data in one of two initial formats. The first possible
-    format is where self._inputs is a list of list of Tensors. The first index
-    is tower, the second is use/time-step. grads_list, meanwhile, is a list
-    over sources of such lists of lists.
-
-    The second possible data format is where self._inputs is a Tensor with
-    uses/times-steps folded into the batch dimension.  i.e. it is a Tensor
-    of shape [num_uses * size_batch, ...] which represents a reshape of a
-    Tensor of shape [num_uses, size_batch, ...].  And similarly grads_list is
-    a list over sources of such Tensors.
-
-    There are two possible formats which inputs and grads_list are transformed
-    into.
-
-    If TOWER_STRATEGY is "concat", 'inputs' becomes a tuple containing
-    a single tensor (represented as a PartitionedTensor object) with all of
-    the data from the towers, as well as the uses/time-steps, concatenated
-    together. In this tensor the leading dimension is the batch and
-    use/time-step dimensions folded together (with 'use' being the major of
-    these two, so that the tensors can be thought of as reshapes of ones of
-    shape [num_uses, batch_size, ...]). grads_list is similarly formatted as a
-    tuple over sources of such tensors.
-
-    If TOWER_STRATEGY is "separate" the inputs are formatted into lists of
-    tensors over towers. Each of these tensors has a similar format to
-    the tensor produced by the "concat" option, except that each contains
-    only the data from a single tower.  grads_list is similarly formatted
-    into a tuple over sources of such tuples.
-
-    Args:
-      grads_list: grads_list in its initial format (see above).
-
-    Returns:
-      inputs: self._inputs transformed into the appropriate format (see
-        above).
-      grads_list: grads_list transformed into the appropriate format (see
-        above).
-
-    Raises:
-      ValueError: If TOWER_STRATEGY is not one of "separate" or "concat".
-      ValueError: If the given/initial format of self._inputs and grads_list
-        isn't recognized, or doesn't agree with self._num_uses.
-    """
-
-    inputs = self._inputs
-
-    if isinstance(inputs[0], (list, tuple)):
-      num_uses = len(inputs[0])
-      if self._num_uses is not None and self._num_uses != num_uses:
-        raise ValueError("num_uses argument doesn't match length of inputs.")
-      else:
-        self._num_uses = num_uses
-
-      # Check that all mini-batches/towers have the same number of uses
-      if not all(len(input_) == num_uses for input_ in inputs):
-        raise ValueError("Length of inputs argument is inconsistent across "
-                         "towers.")
-
-      if fisher_factors.TOWER_STRATEGY == "concat":
-        # Reverse the tower and use/time-step indices, so that use is now first,
-        # and towers is second
-        inputs = tuple(zip(*inputs))
-
-        # Flatten the two dimensions
-        inputs = nest.flatten(inputs)
-
-        # Merge everything together into a PartitionedTensor. We package it in
-        # a singleton tuple since the factors will expect a list over towers
-        inputs = (utils.PartitionedTensor(inputs),)
-
-      elif fisher_factors.TOWER_STRATEGY == "separate":
-        # Merge together the uses/time-step dimension into PartitionedTensors,
-        # but keep the leading dimension (towers) intact for the factors to
-        # process individually.
-        inputs = tuple(utils.PartitionedTensor(input_) for input_ in inputs)
-
-      else:
-        raise ValueError("Global config variable TOWER_STRATEGY must be one of "
-                         "'concat' or 'separate'.")
-    else:
-      inputs = tuple(inputs)
-
-    # Now we perform the analogous processing for grads_list
-    if isinstance(grads_list[0][0], (list, tuple)):
-      num_uses = len(grads_list[0][0])
-      if self._num_uses is not None and self._num_uses != num_uses:
-        raise ValueError("num_uses argument doesn't match length of outputs, "
-                         "or length of outputs is inconsistent with length of "
-                         "inputs.")
-      else:
-        self._num_uses = num_uses
-
-      if not all(len(grad) == num_uses for grads in grads_list
-                 for grad in grads):
-        raise ValueError("Length of outputs argument is inconsistent across "
-                         "towers.")
-
-      if fisher_factors.TOWER_STRATEGY == "concat":
-        # Reverse the tower and use/time-step indices, so that use is now first,
-        # and towers is second
-        grads_list = tuple(tuple(zip(*grads)) for grads in grads_list)
-
-        # Flatten the two dimensions, leaving the leading dimension (source)
-        # intact
-        grads_list = tuple(nest.flatten(grads) for grads in grads_list)
-
-        # Merge inner dimensions together into PartitionedTensors. We package
-        # them in a singleton tuple since the factors will expect a list over
-        # towers
-        grads_list = tuple((utils.PartitionedTensor(grads),)
-                           for grads in grads_list)
-
-      elif fisher_factors.TOWER_STRATEGY == "separate":
-        # Merge together the uses/time-step dimension into PartitionedTensors,
-        # but keep the leading dimension (towers) intact for the factors to
-        # process individually.
-        grads_list = tuple(tuple(utils.PartitionedTensor(grad)
-                                 for grad in grads)
-                           for grads in grads_list)
-
-      else:
-        raise ValueError("Global config variable TOWER_STRATEGY must be one of "
-                         "'concat' or 'separate'.")
-    else:
-      grads_list = tuple(tuple(grads) for grads in grads_list)
-
-    if self._num_uses is None:
-      raise ValueError("You must supply a value for the num_uses argument if "
-                       "the number of uses cannot be inferred from inputs or "
-                       "outputs arguments (e.g. if they are both given in the "
-                       "single Tensor format, instead of as lists of Tensors.")
-
-    return inputs, grads_list
-
-
-class FullyConnectedMultiIndepFB(InputOutputMultiTowerMultiUse,
-                                 KroneckerProductFB):
-  """FisherBlock for fully-connected layers that share parameters.
-
-  This class implements the "independence across time" approximation from the
-  following paper:
-    https://openreview.net/pdf?id=HyMTkQZAb
-  """
-
-  def __init__(self, layer_collection, has_bias=False, num_uses=None):
-    """Creates a FullyConnectedMultiIndepFB block.
-
-    Args:
-      layer_collection: LayerCollection instance.
-      has_bias: bool. If True, estimates Fisher with respect to a bias
-        parameter as well as the layer's parameters.
-      num_uses: int or None. Number of uses of the layer in the model's graph.
-        Only required if the data is formatted with uses/time folded into the
-        batch dimension (instead of uses/time being a list dimension).
-        (Default: None)
-    """
-    self._has_bias = has_bias
-
-    super(FullyConnectedMultiIndepFB, self).__init__(
-        layer_collection=layer_collection,
-        num_uses=num_uses)
-
-  def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._process_data(grads_list)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF,
-        ((inputs,), self._num_uses, self._has_bias))
-
-    self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
-
-    self._setup_damping(damping, normalization=self._num_uses)
-
-  @property
-  def _renorm_coeff(self):
-    return float(self._num_uses)
-
-
-class ConvKFCBasicMultiIndepFB(InputOutputMultiTowerMultiUse,
-                               KroneckerProductFB):
-  """FisherBlock for 2D convolutional layers using the basic KFC approx.
-
-  Similar to ConvKFCBasicFB except that this version supports multiple
-  uses/time-steps via a standard independence approximation.  Similar to the
-  "independence across time" used in FullyConnectedMultiIndepFB but generalized
-  in the obvious way to conv layers.
-  """
-
-  def __init__(self,
-               layer_collection,
-               params,
-               padding,
-               strides=None,
-               dilation_rate=None,
-               data_format=None,
-               extract_patches_fn=None,
-               num_uses=None):
-    """Creates a ConvKFCBasicMultiIndepFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      params: The parameters (Tensor or tuple of Tensors) of this layer. If
-        kernel alone, a Tensor of shape [..spatial_filter_shape..,
-        in_channels, out_channels]. If kernel and bias, a tuple of 2 elements
-        containing the previous and a Tensor of shape [out_channels].
-      padding: str. Padding method.
-      strides: List of ints or None. Contains [..spatial_filter_strides..] if
-        'extract_patches_fn' is compatible with tf.nn.convolution(), else
-        [1, ..spatial_filter_strides, 1].
-      dilation_rate: List of ints or None. Rate for dilation along each spatial
-        dimension if 'extract_patches_fn' is compatible with
-        tf.nn.convolution(), else [1, ..spatial_dilation_rates.., 1].
-      data_format: str or None. Format of input data.
-      extract_patches_fn: str or None. Name of function that extracts image
-        patches. One of "extract_convolution_patches", "extract_image_patches",
-        "extract_pointwise_conv2d_patches".
-      num_uses: int or None. Number of uses of the layer in the model's graph.
-        Only required if the data is formatted with uses/time folded into the
-        batch dimension (instead of uses/time being a list dimension).
-        (Default: None)
-    """
-    self._padding = padding
-    self._strides = maybe_tuple(strides)
-    self._dilation_rate = maybe_tuple(dilation_rate)
-    self._data_format = data_format
-    self._extract_patches_fn = extract_patches_fn
-    self._has_bias = isinstance(params, (tuple, list))
-
-    fltr = params[0] if self._has_bias else params
-    self._filter_shape = tuple(fltr.shape.as_list())
-
-    super(ConvKFCBasicMultiIndepFB, self).__init__(
-        layer_collection=layer_collection,
-        num_uses=num_uses)
-
-  def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._process_data(grads_list)
-
-    # Infer number of locations upon which convolution is applied.
-    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
-                                             self._strides)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.ConvInputKroneckerFactor,
-        (inputs, self._filter_shape, self._padding, self._strides,
-         self._dilation_rate, self._data_format, self._extract_patches_fn,
-         self._has_bias))
-    self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.ConvOutputKroneckerFactor, (grads_list,))
-
-    self._setup_damping(damping, normalization=
-                        (self._num_locations * self._num_uses))
-
-  @property
-  def _renorm_coeff(self):
-    return self._num_locations * self._num_uses
-
-
-class EmbeddingKFACMultiIndepFB(InputOutputMultiTowerMultiUse,
-                                KroneckerProductFB):
-  """K-FAC FisherBlock for embedding layers used multiple times in the graph.
-
-  Similar to EmbeddingKFACFB except that this version supports multiple uses
-  of the parameter within a single model. These uses could correspond to time
-  steps in an RNN architecture, but they don't have to.
-
-  Does not support bias parameters.
-  """
-
-  def __init__(self, layer_collection, vocab_size, num_uses=None):
-    """Creates a EmbeddingKFACMultiIndepFB block.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-          Fisher information matrix to which this FisherBlock belongs.
-      vocab_size: int. Size of vocabulary for this embedding layer.
-      num_uses: int or None. Number of uses of the layer in the model's graph.
-        Only required if the data is formatted with time folded into the batch
-        dimension (instead of time being a list dimension). (Default: None)
-    """
-    self._vocab_size = vocab_size
-
-    super(EmbeddingKFACMultiIndepFB, self).__init__(
-        layer_collection=layer_collection,
-        num_uses=num_uses)
-
-  def instantiate_factors(self, grads_list, damping):
-    """Instantiate Kronecker Factors for this FisherBlock.
-
-    Args:
-      grads_list: List of list of list of Tensors. grads_list[i][j][k] is the
-        gradient of the loss with respect to 'outputs' from source 'i',
-        tower/mini-batch 'j', and use/time-step 'k'. Each Tensor has shape
-        [tower_minibatch_size, output_size].
-      damping: 0-D Tensor or float. 'damping' * identity is approximately added
-        to this FisherBlock's Fisher approximation.
-    """
-    inputs, grads_list = self._process_data(grads_list)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.EmbeddingInputKroneckerFactor,
-        (inputs, self._vocab_size))
-    self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
-    self._setup_damping(damping, normalization=self._num_uses)
-
-  @property
-  def _renorm_coeff(self):
-    return float(self._num_uses)
-
-
-class SeriesFBApproximation(enum.IntEnum):
-  """See FullyConnectedSeriesFB.__init__ for description and usage."""
-  option1 = 1
-  option2 = 2
-
-
-class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
-                             KroneckerProductFB):
-  """FisherBlock for fully-connected layers that share parameters across time.
-
-  This class implements the "Option 1" and "Option 2" approximation from the
-  following paper:
-    https://openreview.net/pdf?id=HyMTkQZAb
-
-  See the end of the appendix of the paper for a pseudo-code of the
-  algorithm being implemented by multiply_matpower here.  Note that we are
-  using pre-computed versions of certain matrix-matrix products to speed
-  things up.  This is explicitly explained wherever it is done.
-  """
-
-  def __init__(self,
-               layer_collection,
-               has_bias=False,
-               num_uses=None,
-               option=SeriesFBApproximation.option2):
-    """Constructs a new `FullyConnectedSeriesFB`.
-
-    Args:
-      layer_collection: The collection of all layers in the K-FAC approximate
-        Fisher information matrix to which this FisherBlock belongs.
-      has_bias: Whether the layer includes a bias parameter.
-      num_uses: int or None. Number of time-steps over which the layer
-        is used. Only required if the data is formatted with time folded into
-        the batch dimension (instead of time being a list dimension).
-        (Default: None)
-      option: A `SeriesFBApproximation` specifying the simplifying assumption
-        to be used in this block. `option1` approximates the cross-covariance
-        over time as a symmetric matrix, while `option2` makes
-        the assumption that training sequences are infinitely long. See section
-        3.5 of the paper for more details.
-    """
-
-    self._has_bias = has_bias
-    self._option = option
-
-    super(FullyConnectedSeriesFB, self).__init__(
-        layer_collection=layer_collection,
-        num_uses=num_uses)
-
-  @property
-  def _num_timesteps(self):
-    return self._num_uses
-
-  @property
-  def _renorm_coeff(self):
-    # This should no longer be used since the multiply_X functions from the base
-    # class have been overridden
-    assert False
-
-  def instantiate_factors(self, grads_list, damping):
-    inputs, grads_list = self._process_data(grads_list)
-
-    self._input_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF,
-        ((inputs,), self._num_uses, self._has_bias))
-    self._input_factor.register_cov_dt1()
-
-    self._output_factor = self._layer_collection.make_or_get_factor(
-        fisher_factors.FullyConnectedMultiKF, (grads_list, self._num_uses))
-    self._output_factor.register_cov_dt1()
-
-    self._setup_damping(damping, normalization=self._num_uses)
-
-  def register_matpower(self, exp):
-    if exp != -1:
-      raise NotImplementedError("FullyConnectedSeriesFB only supports inverse"
-                                "multiplications.")
-
-    if self._option == SeriesFBApproximation.option1:
-      self._input_factor.register_option1quants(self._input_damping_func)
-      self._output_factor.register_option1quants(self._output_damping_func)
-    elif self._option == SeriesFBApproximation.option2:
-      self._input_factor.register_option2quants(self._input_damping_func)
-      self._output_factor.register_option2quants(self._output_damping_func)
-    else:
-      raise ValueError(
-          "Unrecognized FullyConnectedSeriesFB approximation: {}".format(
-              self._option))
-
-  def multiply_matpower(self, vector, exp):
-    if exp != -1:
-      raise NotImplementedError("FullyConnectedSeriesFB only supports inverse"
-                                "multiplications.")
-
-    # pylint: disable=invalid-name
-
-    Z = utils.layer_params_to_mat2d(vector)
-
-    # Derivations were done for "batch_dim==1" case so we need to convert to
-    # that orientation:
-    Z = array_ops.transpose(Z)
-
-    if self._option == SeriesFBApproximation.option1:
-
-      # Note that \\(L_A = A0^{-1/2} * U_A and L_G = G0^{-1/2} * U_G.\\)
-      L_A, psi_A = self._input_factor.get_option1quants(
-          self._input_damping_func)
-      L_G, psi_G = self._output_factor.get_option1quants(
-          self._output_damping_func)
-
-      def gamma(x):
-        # We are assuming that each case has the same number of time-steps.
-        # If this stops being the case one shouldn't simply replace this T
-        # with its average value.  Instead, one needs to go back to the
-        # definition of the gamma function from the paper.
-        T = self._num_timesteps
-        return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
-
-      # \\(Y = \gamma( psi_G*psi_A^T )\\) (computed element-wise)
-      # Even though Y is Z-independent we are recomputing it from the psi's
-      # each since Y depends on both A and G quantities, and it is relatively
-      # cheap to compute.
-      Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
-
-      # \\(Z = L_G^T * Z * L_A\\)
-      # This is equivalent to the following computation from the original
-      # pseudo-code:
-      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
-      # \\(Z = U_G^T * Z * U_A\\)
-      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
-
-      # \\(Z = Z .* Y\\)
-      Z *= Y
-
-      # \\(Z = L_G * Z * L_A^T\\)
-      # This is equivalent to the following computation from the original
-      # pseudo-code:
-      # \\(Z = U_G * Z * U_A^T\\)
-      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
-      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
-
-    elif self._option == SeriesFBApproximation.option2:
-
-      # Note that \\(P_A = A_1^T * A_0^{-1} and P_G = G_1^T * G_0^{-1}\\),
-      # and \\(K_A = A_0^{-1/2} * E_A\ and\ K_G = G_0^{-1/2} * E_G.\\)
-      P_A, K_A, mu_A = self._input_factor.get_option2quants(
-          self._input_damping_func)
-      P_G, K_G, mu_G = self._output_factor.get_option2quants(
-          self._output_damping_func)
-
-      # Our approach differs superficially from the pseudo-code in the paper
-      # in order to reduce the total number of matrix-matrix multiplies.
-      # In particular, the first three computations in the pseudo code are
-      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
-      # \\(Z = Z - hPsi_G^T * Z * hPsi_A\\)
-      # \\(Z = E_G^T * Z * E_A\\)
-      # Noting that hPsi = C0^{-1/2} * C1 * C0^{-1/2}\\), so that
-      # \\(C0^{-1/2} * hPsi = C0^{-1} * C1 * C0^{-1/2} = P^T * C0^{-1/2}\\)
-      # the entire computation can be written as
-      # \\(Z = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
-      # \\(    - hPsi_G^T * G0^{-1/2} * Z * A0^{-1/2} * hPsi_A) * E_A\\)
-      # \\(  = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
-      # \\(    - G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2}) * E_A\\)
-      # \\(  = E_G^T * G0^{-1/2} * Z * A0^{-1/2} * E_A\\)
-      # \\(    -  E_G^T* G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2} * E_A\\)
-      # \\(  = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A\\)
-      # This final expression is computed by the following two lines:
-      # \\(Z = Z - P_G * Z * P_A^T\\)
-      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
-      # \\(Z = K_G^T * Z * K_A\\)
-      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
-
-      # \\(Z = Z ./ (1*1^T - mu_G*mu_A^T)\\)
-      # Be careful with the outer product.  We don't want to accidentally
-      # make it an inner-product instead.
-      tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
-      # Prevent some numerical issues by setting any 0.0 eigs to 1.0
-      tmp += 1.0 * math_ops.cast(math_ops.equal(tmp, 0.0), dtype=tmp.dtype)
-      Z /= tmp
-
-      # We now perform the transpose/reverse version of the operations
-      # derived above, whose derivation from the original pseudo-code is
-      # analgous.
-      # \\(Z = K_G * Z * K_A^T\\)
-      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
-
-      # \\(Z = Z - P_G^T * Z * P_A\\)
-      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
-
-      # \\(Z = normalize (1/E[T]) * Z\\)
-      # Note that this normalization is done because we compute the statistics
-      # by averaging, not summing, over time. (And the gradient is presumably
-      # summed over time, not averaged, and thus their scales are different.)
-      Z /= math_ops.cast(self._num_timesteps, Z.dtype)
-
-    # Convert back to the "batch_dim==0" orientation.
-    Z = array_ops.transpose(Z)
-
-    return utils.mat2d_to_layer_params(vector, Z)
-
-    # pylint: enable=invalid-name
-
-  def multiply_cholesky(self, vector):
-    raise NotImplementedError("FullyConnectedSeriesFB does not support "
-                              "Cholesky computations.")
-
-  def multiply_cholesky_inverse(self, vector):
-    raise NotImplementedError("FullyConnectedSeriesFB does not support "
-                              "Cholesky computations.")
-
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks_lib.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks_lib.py
deleted file mode 100644
index c04cf727fa..0000000000
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks_lib.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""FisherBlock definitions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.fisher_blocks import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    'FisherBlock',
-    'FullFB',
-    'NaiveDiagonalFB',
-    'FullyConnectedDiagonalFB',
-    'KroneckerProductFB',
-    'EmbeddingKFACFB',
-    'FullyConnectedKFACBasicFB',
-    'ConvKFCBasicFB',
-    'ConvDiagonalFB',
-    'set_global_constants',
-    'compute_pi_tracenorm',
-    'compute_pi_adjusted_damping',
-    'num_conv_locations',
-    'normalize_damping',
-    'LEFT_MULTIPLY',
-    'RIGHT_MULTIPLY',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
deleted file mode 100644
index b43232dfaf..0000000000
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ /dev/null
@@ -1,1830 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""FisherFactor definitions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import contextlib
-
-import numpy as np
-import six
-
-from tensorflow.contrib.kfac.python.ops import linear_operator as lo
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops as tf_ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import special_math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.training import moving_averages
-from tensorflow.python.util import nest
-
-
-# Whether to initialize covariance estimators at a zero matrix (or the identity
-# matrix).
-INIT_COVARIANCES_AT_ZERO = True
-
-# Whether to zero-debias the moving averages.
-ZERO_DEBIAS = True
-
-# Whether to initialize inverse (and other such matrices computed from the cov
-# matrices) to the zero matrix (or the identity matrix).
-INIT_INVERSES_AT_ZERO = True
-
-# When the number of inverses requested from a FisherFactor exceeds this value,
-# the inverses are computed using an eigenvalue decomposition.
-EIGENVALUE_DECOMPOSITION_THRESHOLD = 2
-
-# Numerical eigenvalues computed from covariance matrix estimates are clipped to
-# be at least as large as this value before they are used to compute inverses or
-# matrix powers. Must be nonnegative.
-EIGENVALUE_CLIPPING_THRESHOLD = 0.0
-
-# Used to subsample the flattened extracted image patches. The number of
-# outer products per row of the covariance matrix should not exceed this
-# value. This parameter is used only if `_SUB_SAMPLE_OUTER_PRODUCTS` is True.
-_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = 1
-
-# Used to subsample the inputs passed to the extract image patches. The batch
-# size of number of inputs to extract image patches is multiplied by this
-# factor. This parameter is used only if `_SUB_SAMPLE_INPUTS` is True.
-_INPUTS_TO_EXTRACT_PATCHES_FACTOR = 0.5
-
-# If True, then subsamples the tensor passed to compute the covaraince matrix.
-_SUB_SAMPLE_OUTER_PRODUCTS = False
-
-# If True, then subsamples the tensor passed to compute the covaraince matrix.
-_SUB_SAMPLE_INPUTS = False
-
-# TOWER_STRATEGY can be one of "concat" or "separate".  If "concat", the data
-# passed to the factors from the blocks will be concatenated across towers
-# (lazilly via PartitionedTensor objects).  Otherwise a tuple of tensors over
-# towers will be passed in, and the factors will iterate over this and do the
-# cov computations separately for each one, averaging the results together.
-TOWER_STRATEGY = "concat"
-
-
-def set_global_constants(init_covariances_at_zero=None,
-                         zero_debias=None,
-                         init_inverses_at_zero=None,
-                         eigenvalue_decomposition_threshold=None,
-                         eigenvalue_clipping_threshold=None,
-                         max_num_outer_products_per_cov_row=None,
-                         sub_sample_outer_products=None,
-                         inputs_to_extract_patches_factor=None,
-                         sub_sample_inputs=None,
-                         tower_strategy=None):
-  """Sets various global constants used by the classes in this module."""
-  global INIT_COVARIANCES_AT_ZERO
-  global ZERO_DEBIAS
-  global INIT_INVERSES_AT_ZERO
-  global EIGENVALUE_DECOMPOSITION_THRESHOLD
-  global EIGENVALUE_CLIPPING_THRESHOLD
-  global _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW
-  global _SUB_SAMPLE_OUTER_PRODUCTS
-  global _INPUTS_TO_EXTRACT_PATCHES_FACTOR
-  global _SUB_SAMPLE_INPUTS
-  global TOWER_STRATEGY
-
-  if init_covariances_at_zero is not None:
-    INIT_COVARIANCES_AT_ZERO = init_covariances_at_zero
-  if zero_debias is not None:
-    ZERO_DEBIAS = zero_debias
-  if init_inverses_at_zero is not None:
-    INIT_INVERSES_AT_ZERO = init_inverses_at_zero
-  if eigenvalue_decomposition_threshold is not None:
-    EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold
-  if eigenvalue_clipping_threshold is not None:
-    EIGENVALUE_CLIPPING_THRESHOLD = eigenvalue_clipping_threshold
-  if max_num_outer_products_per_cov_row is not None:
-    _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = max_num_outer_products_per_cov_row
-  if sub_sample_outer_products is not None:
-    _SUB_SAMPLE_OUTER_PRODUCTS = sub_sample_outer_products
-  if inputs_to_extract_patches_factor is not None:
-    _INPUTS_TO_EXTRACT_PATCHES_FACTOR = inputs_to_extract_patches_factor
-  if sub_sample_inputs is not None:
-    _SUB_SAMPLE_INPUTS = sub_sample_inputs
-  if tower_strategy is not None:
-    TOWER_STRATEGY = tower_strategy
-
-
-def inverse_initializer(shape, dtype, partition_info=None):  # pylint: disable=unused-argument
-  if INIT_INVERSES_AT_ZERO:
-    return array_ops.zeros(shape, dtype=dtype)
-  return linalg_ops.eye(num_rows=shape[0], dtype=dtype)
-
-
-def covariance_initializer(shape, dtype, partition_info=None):  # pylint: disable=unused-argument
-  if INIT_COVARIANCES_AT_ZERO:
-    return array_ops.zeros(shape, dtype=dtype)
-  return linalg_ops.eye(num_rows=shape[0], dtype=dtype)
-
-
-def diagonal_covariance_initializer(shape, dtype, partition_info=None):  # pylint: disable=unused-argument
-  if INIT_COVARIANCES_AT_ZERO:
-    return array_ops.zeros(shape, dtype=dtype)
-  return array_ops.ones(shape, dtype=dtype)
-
-
-@contextlib.contextmanager
-def place_on_device(device):
-  if device is not None and len(device):
-    with tf_ops.device(device):
-      yield
-  else:
-    yield
-
-
-def compute_cov(tensor, tensor_right=None, normalizer=None):
-  """Compute the empirical second moment of the rows of a 2D Tensor.
-
-  This function is meant to be applied to random matrices for which the true row
-  mean is zero, so that the true second moment equals the true covariance.
-
-  Args:
-    tensor: A 2D Tensor.
-    tensor_right: An optional 2D Tensor. If provided, this function computes
-      the matrix product tensor^T * tensor_right instead of tensor^T * tensor.
-    normalizer: optional scalar for the estimator (by default, the normalizer is
-        the number of rows of tensor).
-
-  Returns:
-    A square 2D Tensor with as many rows/cols as the number of input columns.
-  """
-  if normalizer is None:
-    normalizer = array_ops.shape(tensor)[0]
-  if tensor_right is None:
-    cov = (
-        math_ops.matmul(tensor, tensor, transpose_a=True) / math_ops.cast(
-            normalizer, tensor.dtype))
-    return (cov + array_ops.transpose(cov)) / math_ops.cast(2.0, cov.dtype)
-  else:
-    return (math_ops.matmul(tensor, tensor_right, transpose_a=True) /
-            math_ops.cast(normalizer, tensor.dtype))
-
-
-def append_homog(tensor):
-  """Appends a homogeneous coordinate to the last dimension of a Tensor.
-
-  Args:
-    tensor: A Tensor.
-
-  Returns:
-    A Tensor identical to the input but one larger in the last dimension.  The
-    new entries are filled with ones.
-  """
-  rank = len(tensor.shape.as_list())
-  shape = array_ops.concat([array_ops.shape(tensor)[:-1], [1]], axis=0)
-  ones = array_ops.ones(shape, dtype=tensor.dtype)
-  return array_ops.concat([tensor, ones], axis=rank - 1)
-
-
-def scope_string_from_params(params):
-  """Builds a variable scope string name from the given parameters.
-
-  Supported parameters are:
-    * tensors
-    * booleans
-    * ints
-    * strings
-    * depth-1 tuples/lists of ints
-    * any depth tuples/lists of tensors
-  Other parameter types will throw an error.
-
-  Args:
-    params: A parameter or list of parameters.
-
-  Returns:
-    A string to use for the variable scope.
-
-  Raises:
-    ValueError: if params includes an unsupported type.
-  """
-  params = params if isinstance(params, (tuple, list)) else (params,)
-
-  name_parts = []
-  for param in params:
-    if param is None:
-      name_parts.append("None")
-    elif isinstance(param, (tuple, list)):
-      if all([isinstance(p, int) for p in param]):
-        name_parts.append("-".join([str(p) for p in param]))
-      else:
-        name_parts.append(scope_string_from_name(param))
-    elif isinstance(param, (str, int, bool)):
-      name_parts.append(str(param))
-    elif isinstance(param, (tf_ops.Tensor, variables.Variable)):
-      name_parts.append(scope_string_from_name(param))
-    elif isinstance(param, utils.PartitionedTensor):
-      name_parts.append(scope_string_from_name(param.tensors))
-    else:
-      raise ValueError("Encountered an unsupported param type {}".format(
-          type(param)))
-  return "_".join(name_parts)
-
-
-def scope_string_from_name(tensor):
-  if isinstance(tensor, (tuple, list)):
-    return "__".join([scope_string_from_name(t) for t in tensor])
-  # "gradients/add_4_grad/Reshape:0" -> "gradients_add_4_grad_Reshape"
-  return tensor.name.split(":")[0].replace("/", "_")
-
-
-def scalar_or_tensor_to_string(val):
-  return repr(val) if np.isscalar(val) else scope_string_from_name(val)
-
-
-def list_to_string(lst):
-  return "_".join(val if isinstance(val, six.string_types)
-                  else scalar_or_tensor_to_string(val) for val in lst)
-
-
-def graph_func_to_id(func):
-  """Returns a hashable object that represents func's computation."""
-  # TODO(b/74201126): replace with Topohash of func's output
-  return func.func_id
-
-
-def graph_func_to_string(func):
-  # TODO(b/74201126): replace with Topohash of func's output
-  return list_to_string(func.func_id)
-
-
-def _subsample_for_cov_computation(array, name=None):
-  """Subsamples the first dimension of the array.
-
-  `array`(A) is a tensor of shape `[batch_size, dim_2]`. Then the covariance
-  matrix(A^TA) is of shape `dim_2 ** 2`. Subsample only if the number of outer
-  products per row of the covariance matrix is greater than
-  `_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW`.
-
-  Args:
-    array: Tensor, of shape `[batch_size, dim_2]`.
-    name: `string`, Default(None)
-
-  Returns:
-    A tensor of shape `[max_samples, dim_2]`.
-
-  Raises:
-    ValueError: If array's is not matrix-shaped.
-    ValueError: If array's batch_size cannot be inferred.
-
-  """
-  with tf_ops.name_scope(name, "subsample", [array]):
-    array = tf_ops.convert_to_tensor(array)
-    if len(array.shape) != 2:
-      raise ValueError("Input param array must be a matrix.")
-
-    batch_size = array.shape.as_list()[0]
-    if batch_size is None:
-      raise ValueError("Unable to get batch_size from input param array.")
-
-    num_cov_rows = array.shape.as_list()[-1]
-    max_batch_size = int(_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW * num_cov_rows)
-    if batch_size <= max_batch_size:
-      return array
-
-    return _random_tensor_gather(array, max_batch_size)
-
-
-def _random_tensor_gather(array, max_size):
-  """Generates a random set of indices and gathers the value at the indcices.
-
-  Args:
-    array: Tensor, of shape `[batch_size, dim_2]`.
-    max_size: int, Number of indices to sample.
-
-  Returns:
-    A tensor of shape `[max_size, ...]`.
-  """
-  batch_size = array.shape.as_list()[0]
-  indices = random_ops.random_shuffle(math_ops.range(0, batch_size))[:max_size]
-  return array_ops.gather(array, indices)
-
-
-@six.add_metaclass(abc.ABCMeta)
-class FisherFactor(object):
-  """Base class for objects modeling factors of approximate Fisher blocks.
-
-  A FisherFactor represents part of an approximate Fisher Information matrix.
-  For example, one approximation to the Fisher uses the Kronecker product of two
-  FisherFactors A and B, F = kron(A, B). FisherFactors are composed with
-  FisherBlocks to construct a block-diagonal approximation to the full Fisher.
-
-  FisherFactors are backed by a single, non-trainable variable that is updated
-  by running FisherFactor.make_covariance_update_op(). The shape and type of
-  this variable is implementation specific.
-
-  Note that for blocks that aren't based on approximations, a 'factor' can
-  be the entire block itself, as is the case for the diagonal and full
-  representations.
-  """
-
-  def __init__(self):
-    self._cov = None
-
-  @abc.abstractproperty
-  def _var_scope(self):
-    """Variable scope for this FisherFactor instance.
-
-    Returns:
-      string that unique identifies this FisherFactor instance.
-    """
-    pass
-
-  @property
-  def name(self):
-    return self._var_scope
-
-  @abc.abstractproperty
-  def _cov_shape(self):
-    """The shape of the variable backing this FisherFactor."""
-    pass
-
-  @abc.abstractproperty
-  def _num_sources(self):
-    """The number of things to sum over when updating covariance variable.
-
-    The default make_covariance_update_op function will call _compute_new_cov
-    with indices ranging from 0 to _num_sources-1. The typical situation is
-    where the factor wants to sum the statistics it computes over multiple
-    backpropped "gradients" (typically passed in via "tensors" or
-    "outputs_grads" arguments).
-    """
-    pass
-
-  @abc.abstractproperty
-  def _num_towers(self):
-    pass
-
-  @abc.abstractproperty
-  def _dtype(self):
-    """dtype for variable backing this factor."""
-    pass
-
-  @property
-  def _cov_initializer(self):
-    """Function for initializing covariance variable."""
-    return covariance_initializer
-
-  def instantiate_cov_variables(self):
-    """Makes the internal cov variable(s)."""
-    assert self._cov is None
-    with variable_scope.variable_scope(self._var_scope):
-      self._cov = variable_scope.get_variable(
-          "cov",
-          initializer=self._cov_initializer,
-          shape=self._cov_shape,
-          trainable=False,
-          dtype=self._dtype)
-
-  @abc.abstractmethod
-  def _compute_new_cov(self, source, tower):
-    """Computes minibatch-estimated covariance for a single source.
-
-    Args:
-      source: int in [0, self._num_sources). Which source to use when computing
-        the cov update.
-      tower: int in [0, self._num_towers). Which tower to use when computing
-        the cov update.
-
-    Returns:
-      Tensor of same shape as self.get_cov().
-    """
-    pass
-
-  def make_covariance_update_op(self, ema_decay):
-    """Constructs and returns the covariance update Op.
-
-    Args:
-      ema_decay: The exponential moving average decay (float or Tensor).
-    Returns:
-      An Op for updating the covariance Variable referenced by _cov.
-    """
-    new_cov_contribs = []
-    for source in range(self._num_sources):
-      for tower in range(self._num_towers):
-        device = (self._get_data_device(tower)
-                  if TOWER_STRATEGY == "separate" else None)
-        with place_on_device(device):
-          new_cov_contribs.append(self._compute_new_cov(source, tower))
-
-    new_cov = math_ops.add_n(new_cov_contribs) / float(self._num_towers)
-
-    # Compute average of 'new_cov' across all TPU cores. On a TPU, each
-    # instance of 'new_cov' will be based on a different minibatch. This ensures
-    # that by the end of assign_moving_average(), all TPU cores see the same
-    # value for self._cov.
-    #
-    # Other implementations of make_covariance_update_op() that accumulate
-    # statistics in other variables should mimic this behavior.
-    if utils.on_tpu():
-      new_cov = utils.cross_replica_mean(new_cov)
-
-    return moving_averages.assign_moving_average(
-        self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
-
-  @abc.abstractmethod
-  def _get_data_device(self, tower):
-    pass
-
-  @abc.abstractmethod
-  def instantiate_inv_variables(self):
-    """Makes the internal "inverse" variable(s)."""
-    pass
-
-  @abc.abstractmethod
-  def make_inverse_update_ops(self):
-    """Create and return update ops corresponding to registered computations."""
-    pass
-
-  def get_cov(self):
-    return self._cov
-
-  @abc.abstractmethod
-  def get_cov_as_linear_operator(self):
-    pass
-
-  @abc.abstractmethod
-  def register_matpower(self, exp, damping_func):
-    pass
-
-  @abc.abstractmethod
-  def register_cholesky(self, damping_func):
-    pass
-
-  @abc.abstractmethod
-  def register_cholesky_inverse(self, damping_func):
-    pass
-
-  @abc.abstractmethod
-  def get_matpower(self, exp, damping_func):
-    pass
-
-  @abc.abstractmethod
-  def get_cholesky(self, damping_func):
-    pass
-
-  @abc.abstractmethod
-  def get_cholesky_inverse(self, damping_func):
-    pass
-
-
-class DenseSquareMatrixFactor(FisherFactor):
-  """Base class for FisherFactors that are stored as dense square matrices.
-
-  This class explicitly calculates and stores inverses of their `cov` matrices,
-  which must be square dense matrices.
-
-  Subclasses must implement the _compute_new_cov method, and the _var_scope and
-  _cov_shape properties.
-  """
-
-  # TODO(b/69108481): This class (and its subclasses) should be refactored to
-  # serve the matrix quantities it computes as both (potentially stale)
-  # variables, updated by the inverse update ops, and fresh values stored in
-  # tensors that recomputed once every session.run() call.  Currently matpower
-  # and damp_inverse have the former behavior, while eigendecomposition has
-  # the latter.
-
-  def __init__(self):
-    self._matpower_by_exp_and_damping = {}  # { (float, hashable): variable }
-    self._matpower_registrations = set()  # { (float, hashable) }
-    self._eigendecomp = None
-    self._damping_funcs_by_id = {}  # {hashable: lambda}
-
-    self._cholesky_registrations = set()  # { hashable }
-    self._cholesky_inverse_registrations = set()  # { hashable }
-
-    self._cholesky_by_damping = {}  # { hashable: variable }
-    self._cholesky_inverse_by_damping = {}  # { hashable: variable }
-
-    super(DenseSquareMatrixFactor, self).__init__()
-
-  def get_cov_as_linear_operator(self):
-    assert self.get_cov().shape.ndims == 2
-    return lo.LinearOperatorFullMatrix(self.get_cov(),
-                                       is_self_adjoint=True,
-                                       is_square=True)
-
-  def _register_damping(self, damping_func):
-    damping_id = graph_func_to_id(damping_func)
-    if damping_id not in self._damping_funcs_by_id:
-      self._damping_funcs_by_id[damping_id] = damping_func
-    return damping_id
-
-  def register_inverse(self, damping_func):
-    # Just for backwards compatibility of some old code and tests
-    self.register_matpower(-1, damping_func)
-
-  def register_matpower(self, exp, damping_func):
-    """Registers a matrix power to be maintained and served on demand.
-
-    This creates a variable and signals make_inverse_update_ops to make the
-    corresponding update op.  The variable can be read via the method
-    get_matpower.
-
-    Args:
-      exp: float.  The exponent to use in the matrix power.
-      damping_func: A function that computes a 0-D Tensor or a float which will
-        be the damping value used.  i.e. damping = damping_func().
-    """
-    if exp == 1.0:
-      return
-
-    damping_id = self._register_damping(damping_func)
-
-    if (exp, damping_id) not in self._matpower_registrations:
-      self._matpower_registrations.add((exp, damping_id))
-
-  def register_cholesky(self, damping_func):
-    """Registers a Cholesky factor to be maintained and served on demand.
-
-    This creates a variable and signals make_inverse_update_ops to make the
-    corresponding update op.  The variable can be read via the method
-    get_cholesky.
-
-    Args:
-      damping_func: A function that computes a 0-D Tensor or a float which will
-        be the damping value used.  i.e. damping = damping_func().
-    """
-    damping_id = self._register_damping(damping_func)
-
-    if damping_id not in self._cholesky_registrations:
-      self._cholesky_registrations.add(damping_id)
-
-  def register_cholesky_inverse(self, damping_func):
-    """Registers an inverse Cholesky factor to be maintained/served on demand.
-
-    This creates a variable and signals make_inverse_update_ops to make the
-    corresponding update op.  The variable can be read via the method
-    get_cholesky_inverse.
-
-    Args:
-      damping_func: A function that computes a 0-D Tensor or a float which will
-        be the damping value used.  i.e. damping = damping_func().
-    """
-    damping_id = self._register_damping(damping_func)
-
-    if damping_id not in self._cholesky_inverse_registrations:
-      self._cholesky_inverse_registrations.add(damping_id)
-
-  def instantiate_inv_variables(self):
-    """Makes the internal "inverse" variable(s)."""
-
-    for (exp, damping_id) in self._matpower_registrations:
-      exp_string = scalar_or_tensor_to_string(exp)
-      damping_func = self._damping_funcs_by_id[damping_id]
-      damping_string = graph_func_to_string(damping_func)
-      with variable_scope.variable_scope(self._var_scope):
-        matpower = variable_scope.get_variable(
-            "matpower_exp{}_damp{}".format(exp_string, damping_string),
-            initializer=inverse_initializer,
-            shape=self._cov_shape,
-            trainable=False,
-            dtype=self._dtype)
-      assert (exp, damping_id) not in self._matpower_by_exp_and_damping
-      self._matpower_by_exp_and_damping[(exp, damping_id)] = matpower
-
-    for damping_id in self._cholesky_registrations:
-      damping_func = self._damping_funcs_by_id[damping_id]
-      damping_string = graph_func_to_string(damping_func)
-      with variable_scope.variable_scope(self._var_scope):
-        chol = variable_scope.get_variable(
-            "cholesky_damp{}".format(damping_string),
-            initializer=inverse_initializer,
-            shape=self._cov_shape,
-            trainable=False,
-            dtype=self._dtype)
-      assert damping_id not in self._cholesky_by_damping
-      self._cholesky_by_damping[damping_id] = chol
-
-    for damping_id in self._cholesky_inverse_registrations:
-      damping_func = self._damping_funcs_by_id[damping_id]
-      damping_string = graph_func_to_string(damping_func)
-      with variable_scope.variable_scope(self._var_scope):
-        cholinv = variable_scope.get_variable(
-            "cholesky_inverse_damp{}".format(damping_string),
-            initializer=inverse_initializer,
-            shape=self._cov_shape,
-            trainable=False,
-            dtype=self._dtype)
-      assert damping_id not in self._cholesky_inverse_by_damping
-      self._cholesky_inverse_by_damping[damping_id] = cholinv
-
-  def make_inverse_update_ops(self):
-    """Create and return update ops corresponding to registered computations."""
-    ops = []
-
-    num_inverses = sum(1 for (exp, _) in self._matpower_by_exp_and_damping
-                       if exp == -1)
-
-    num_other_matpower = len(self._matpower_by_exp_and_damping) - num_inverses
-
-    other_matrix_power_registered = num_other_matpower >= 1
-
-    use_eig = (
-        self._eigendecomp or other_matrix_power_registered or
-        num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)
-
-    # We precompute these so we don't need to evaluate them multiple times (for
-    # each matrix power that uses them)
-    damping_value_by_id = {damping_id: math_ops.cast(
-        self._damping_funcs_by_id[damping_id](), self._dtype)
-                           for damping_id in self._damping_funcs_by_id}
-
-    if use_eig:
-      eigenvalues, eigenvectors = self.get_eigendecomp()  # pylint: disable=unpacking-non-sequence
-
-      for (exp, damping_id), matpower in (
-          self._matpower_by_exp_and_damping.items()):
-        damping = damping_value_by_id[damping_id]
-        ops.append(
-            matpower.assign(
-                math_ops.matmul(eigenvectors *
-                                (eigenvalues + damping)**exp,
-                                array_ops.transpose(eigenvectors))))
-      # These ops share computation and should be run on a single device.
-      ops = [control_flow_ops.group(*ops)]
-    else:
-      for (exp, damping_id), matpower in (
-          self._matpower_by_exp_and_damping.items()):
-        assert exp == -1
-        damping = damping_value_by_id[damping_id]
-        ops.append(matpower.assign(utils.posdef_inv(self.get_cov(), damping)))
-
-    # TODO(b/77902055): If inverses are being computed with Cholesky's
-    # we can share the work. Instead this code currently just computes the
-    # Cholesky a second time. It does at least share work between requests for
-    # Cholesky's and Cholesky inverses with the same damping id.
-    for damping_id, cholesky_inv in self._cholesky_inverse_by_damping.items():
-      cholesky_ops = []
-
-      damping = damping_value_by_id[damping_id]
-      cholesky_value = utils.cholesky(self.get_cov(), damping)
-
-      if damping_id in self._cholesky_by_damping:
-        cholesky = self._cholesky_by_damping[damping_id]
-        cholesky_ops.append(cholesky.assign(cholesky_value))
-
-      identity = linalg_ops.eye(cholesky_value.shape.as_list()[0],
-                                dtype=cholesky_value.dtype)
-      cholesky_inv_value = linalg_ops.matrix_triangular_solve(cholesky_value,
-                                                              identity)
-      cholesky_ops.append(cholesky_inv.assign(cholesky_inv_value))
-
-      ops.append(control_flow_ops.group(*cholesky_ops))
-
-    for damping_id, cholesky in self._cholesky_by_damping.items():
-      if damping_id not in self._cholesky_inverse_by_damping:
-        damping = damping_value_by_id[damping_id]
-        cholesky_value = utils.cholesky(self.get_cov(), damping)
-        ops.append(cholesky.assign(cholesky_value))
-
-    self._eigendecomp = False
-    return ops
-
-  def get_inverse(self, damping_func):
-    # Just for backwards compatibility of some old code and tests
-    return self.get_matpower(-1, damping_func)
-
-  def get_matpower(self, exp, damping_func):
-    # Note that this function returns a variable which gets updated by the
-    # inverse ops.  It may be stale / inconsistent with the latest value of
-    # get_cov().
-    if exp != 1:
-      damping_id = graph_func_to_id(damping_func)
-      matpower = self._matpower_by_exp_and_damping[(exp, damping_id)]
-    else:
-      matpower = self.get_cov()
-      identity = linalg_ops.eye(matpower.shape.as_list()[0],
-                                dtype=matpower.dtype)
-      matpower += math_ops.cast(damping_func(), dtype=matpower.dtype)*identity
-
-    assert matpower.shape.ndims == 2
-    return lo.LinearOperatorFullMatrix(matpower,
-                                       is_non_singular=True,
-                                       is_self_adjoint=True,
-                                       is_positive_definite=True,
-                                       is_square=True)
-
-  def get_cholesky(self, damping_func):
-    # Note that this function returns a variable which gets updated by the
-    # inverse ops.  It may be stale / inconsistent with the latest value of
-    # get_cov().
-    damping_id = graph_func_to_id(damping_func)
-    cholesky = self._cholesky_by_damping[damping_id]
-    assert cholesky.shape.ndims == 2
-    return lo.LinearOperatorFullMatrix(cholesky,
-                                       is_non_singular=True,
-                                       is_square=True)
-
-  def get_cholesky_inverse(self, damping_func):
-    # Note that this function returns a variable which gets updated by the
-    # inverse ops.  It may be stale / inconsistent with the latest value of
-    # get_cov().
-    damping_id = graph_func_to_id(damping_func)
-    cholesky_inv = self._cholesky_inverse_by_damping[damping_id]
-    assert cholesky_inv.shape.ndims == 2
-    return lo.LinearOperatorFullMatrix(cholesky_inv,
-                                       is_non_singular=True,
-                                       is_square=True)
-
-  def get_eigendecomp(self):
-    """Creates or retrieves eigendecomposition of self._cov."""
-    # Unlike get_matpower this doesn't retrieve a stored variable, but instead
-    # always computes a fresh version from the current value of get_cov().
-    if not self._eigendecomp:
-      eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(self.get_cov())
-
-      # The matrix self._cov is positive semidefinite by construction, but the
-      # numerical eigenvalues could be negative due to numerical errors, so here
-      # we clip them to be at least FLAGS.eigenvalue_clipping_threshold
-      clipped_eigenvalues = math_ops.maximum(eigenvalues,
-                                             EIGENVALUE_CLIPPING_THRESHOLD)
-      self._eigendecomp = (clipped_eigenvalues, eigenvectors)
-
-    return self._eigendecomp
-
-
-class FullFactor(DenseSquareMatrixFactor):
-  """FisherFactor for a full matrix representation of the Fisher of a parameter.
-
-  Note that this uses the naive "square the sum estimator", and so is applicable
-  to any type of parameter in principle, but has very high variance.
-  """
-
-  def __init__(self,
-               params_grads,
-               batch_size):
-    self._batch_size = batch_size
-    self._params_grads = tuple(utils.ensure_sequence(params_grad)
-                               for params_grad in params_grads)
-    super(FullFactor, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_full_" + scope_string_from_params(
-        [self._params_grads, self._batch_size])
-
-  @property
-  def _cov_shape(self):
-    size = sum(param_grad.shape.num_elements()
-               for param_grad in self._params_grads[0])
-    return (size, size)
-
-  @property
-  def _num_sources(self):
-    return len(self._params_grads)
-
-  @property
-  def _num_towers(self):
-    return 1
-
-  @property
-  def _dtype(self):
-    return self._params_grads[0][0].dtype
-
-  def _compute_new_cov(self, source, tower):
-    assert tower == 0
-
-    # This will be a very basic rank 1 estimate
-    params_grads_flat = utils.tensors_to_column(self._params_grads[source])
-    return ((params_grads_flat * array_ops.transpose(
-        params_grads_flat)) / math_ops.cast(self._batch_size,
-                                            params_grads_flat.dtype))
-
-  def _get_data_device(self, tower):
-    return None
-
-
-class DiagonalFactor(FisherFactor):
-  """A base class for FisherFactors that use diagonal approximations.
-
-  A DiagonalFactor's covariance variable can be of any shape, but must contain
-  exactly one entry per parameter.
-  """
-
-  def __init__(self):
-    super(DiagonalFactor, self).__init__()
-
-  def get_cov_as_linear_operator(self):
-    assert self._matrix_diagonal.shape.ndims == 1
-    return lo.LinearOperatorDiag(self._matrix_diagonal,
-                                 is_self_adjoint=True,
-                                 is_square=True)
-
-  @property
-  def _cov_initializer(self):
-    return diagonal_covariance_initializer
-
-  @property
-  def _matrix_diagonal(self):
-    return array_ops.reshape(self.get_cov(), [-1])
-
-  def make_inverse_update_ops(self):
-    return []
-
-  def instantiate_inv_variables(self):
-    pass
-
-  def register_matpower(self, exp, damping_func):
-    pass
-
-  def register_cholesky(self, damping_func):
-    pass
-
-  def register_cholesky_inverse(self, damping_func):
-    pass
-
-  def get_matpower(self, exp, damping_func):
-    matpower_diagonal = (self._matrix_diagonal
-                         + math_ops.cast(damping_func(), self._dtype))**exp
-    return lo.LinearOperatorDiag(matpower_diagonal,
-                                 is_non_singular=True,
-                                 is_self_adjoint=True,
-                                 is_positive_definite=True,
-                                 is_square=True)
-
-  def get_cholesky(self, damping_func):
-    return self.get_matpower(0.5, damping_func)
-
-  def get_cholesky_inverse(self, damping_func):
-    return self.get_matpower(-0.5, damping_func)
-
-
-class NaiveDiagonalFactor(DiagonalFactor):
-  """FisherFactor for a diagonal approximation of any type of param's Fisher.
-
-  Note that this uses the naive "square the sum estimator", and so is applicable
-  to any type of parameter in principle, but has very high variance.
-  """
-
-  def __init__(self,
-               params_grads,
-               batch_size):
-    """Initializes NaiveDiagonalFactor instance.
-
-    Args:
-      params_grads: Sequence of Tensors, each with same shape as parameters this
-        FisherFactor corresponds to. For example, the gradient of the loss with
-        respect to parameters.
-      batch_size: int or 0-D Tensor. Size
-    """
-    self._params_grads = tuple(utils.ensure_sequence(params_grad)
-                               for params_grad in params_grads)
-    self._batch_size = batch_size
-    super(NaiveDiagonalFactor, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_naivediag_" + scope_string_from_params(
-        [self._params_grads, self._batch_size])
-
-  @property
-  def _cov_shape(self):
-    size = sum(param_grad.shape.num_elements()
-               for param_grad in self._params_grads[0])
-    return [size, 1]
-
-  @property
-  def _num_sources(self):
-    return len(self._params_grads)
-
-  @property
-  def _num_towers(self):
-    return 1
-
-  @property
-  def _dtype(self):
-    return self._params_grads[0][0].dtype
-
-  def _compute_new_cov(self, source, tower):
-    assert tower == 0
-
-    params_grads_flat = utils.tensors_to_column(self._params_grads[source])
-    return (math_ops.square(params_grads_flat) / math_ops.cast(
-        self._batch_size, params_grads_flat.dtype))
-
-  def _get_data_device(self, tower):
-    return None
-
-
-class EmbeddingInputKroneckerFactor(DiagonalFactor):
-  r"""FisherFactor for input to an embedding layer.
-
-  Given input_ids = [batch_size, input_size] representing indices into an
-  [vocab_size, embedding_size] embedding matrix, approximate input covariance by
-  a diagonal matrix,
-
-    Cov(input_ids, input_ids) =
-        (1/batch_size) sum_{i} diag(n_hot(input[i]) ** 2).
-
-  where n_hot() constructs an n-hot binary vector and diag() constructs a
-  diagonal matrix of size [vocab_size, vocab_size].
-  """
-
-  def __init__(self, input_ids, vocab_size, dtype=None):
-    """Instantiate EmbeddingInputKroneckerFactor.
-
-    Args:
-      input_ids: List of Tensors of shape [batch_size, input_size] and dtype
-        int32. Indices into embedding matrix. List index is tower.
-      vocab_size: int or 0-D Tensor. Maximum value for entries in 'input_ids'.
-      dtype: dtype for covariance statistics. Must be a floating point type.
-        Defaults to float32.
-    """
-    self._input_ids = input_ids
-    self._vocab_size = vocab_size
-    self._cov_dtype = dtype or dtypes.float32
-
-    super(EmbeddingInputKroneckerFactor, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_diag_embedding_" + scope_string_from_params(self._input_ids)
-
-  @property
-  def _cov_shape(self):
-    return [self._vocab_size]
-
-  @property
-  def _num_sources(self):
-    return 1
-
-  @property
-  def _num_towers(self):
-    return len(self._input_ids)
-
-  @property
-  def _dtype(self):
-    return self._cov_dtype
-
-  def _compute_new_cov(self, source, tower):
-    assert source == 0
-
-    input_ids = self._input_ids[tower]
-
-    if len(input_ids.shape) > 2:
-      raise ValueError(
-          "Input to embeddings must have rank <= 2. Found rank %d." % len(
-              input_ids.shape))
-
-    batch_size = array_ops.shape(input_ids)[0]
-
-    # Transform indices into one-hot vectors.
-    #
-    # TODO(b/72714822): There must be a faster way to construct the diagonal
-    # covariance matrix! This operation is O(batch_size * vocab_size), where
-    # it should be O(batch_size * input_size).
-    flat_input_ids = array_ops.reshape(input_ids, [-1])
-    one_hots = array_ops.one_hot(flat_input_ids,
-                                 self._vocab_size)  # [?, vocab_size]
-
-    # Take average across examples. Note that, because all entries have
-    # magnitude zero or one, there's no need to square the entries.
-    #
-    # TODO(b/72714822): Support for SparseTensor, other kinds of aggregation
-    # within an example such as average.
-    #
-    # TODO(b/72714822): Support for partitioned embeddings.
-    new_cov = math_ops.reduce_sum(one_hots, axis=0)  # [vocab_size]
-    new_cov /= math_ops.cast(batch_size, new_cov.dtype)
-
-    return new_cov
-
-  def _get_data_device(self, tower):
-    return self._input_ids[tower].device
-
-
-class FullyConnectedDiagonalFactor(DiagonalFactor):
-  r"""FisherFactor for a diagonal approx of a fully-connected layer's Fisher.
-
-  Given in = [batch_size, input_size] and out_grad = [batch_size, output_size],
-  approximates the covariance as,
-
-    Cov(in, out) = (1/batch_size) sum_{i} outer(in[i], out_grad[i]) ** 2.0
-
-  where the square is taken element-wise.
-  """
-
-  def __init__(self,
-               inputs,
-               outputs_grads,
-               has_bias=False):
-    """Instantiate FullyConnectedDiagonalFactor.
-
-    Args:
-      inputs: List of Tensors of shape [batch_size, input_size]. Inputs to this
-        layer.  List index is towers.
-      outputs_grads: List of Tensors, each of shape [batch_size, output_size],
-        which are the gradients of the loss with respect to the layer's
-        outputs. First index is source, second is tower.
-
-      has_bias: bool. If True, append '1' to each input.
-    """
-    self._inputs = inputs
-    self._has_bias = has_bias
-    self._outputs_grads = outputs_grads
-    self._squared_inputs = None
-
-    super(FullyConnectedDiagonalFactor, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_diagfc_" + scope_string_from_params(
-        tuple(self._inputs) + tuple(nest.flatten(self._outputs_grads)))
-
-  @property
-  def _cov_shape(self):
-    input_size = self._inputs[0].shape[1] + self._has_bias
-    output_size = self._outputs_grads[0][0].shape[1]
-    return [input_size, output_size]
-
-  @property
-  def _num_sources(self):
-    return len(self._outputs_grads)
-
-  @property
-  def _num_towers(self):
-    return len(self._inputs)
-
-  @property
-  def _dtype(self):
-    return self._outputs_grads[0][0].dtype
-
-  def make_covariance_update_op(self, ema_decay):
-
-    self._squared_inputs = []
-    for tower in range(self._num_towers):
-      inputs = self._inputs[tower]
-
-      with place_on_device(self._get_data_device(tower)):
-        if self._has_bias:
-          inputs = append_homog(inputs)
-        self._squared_inputs.append(math_ops.square(inputs))
-
-    return super(FullyConnectedDiagonalFactor, self).make_covariance_update_op(
-        ema_decay)
-
-  def _compute_new_cov(self, source, tower):
-    batch_size = array_ops.shape(self._squared_inputs[tower])[0]
-    outputs_grad = self._outputs_grads[source][tower]
-
-    # The well-known special formula that uses the fact that the entry-wise
-    # square of an outer product is the outer-product of the entry-wise squares.
-    # The gradient is the outer product of the input and the output gradients,
-    # so we just square both and then take their outer-product.
-    new_cov = math_ops.matmul(
-        self._squared_inputs[tower],
-        math_ops.square(outputs_grad),
-        transpose_a=True)
-    new_cov /= math_ops.cast(batch_size, new_cov.dtype)
-    return new_cov
-
-  def _get_data_device(self, tower):
-    return self._inputs[tower].device
-
-
-class ConvDiagonalFactor(DiagonalFactor):
-  """FisherFactor for a diagonal approx of a convolutional layer's Fisher."""
-
-  def __init__(self,
-               inputs,
-               outputs_grads,
-               filter_shape,
-               strides,
-               padding,
-               data_format=None,
-               dilations=None,
-               has_bias=False):
-    """Creates a ConvDiagonalFactor object.
-
-    Args:
-      inputs: List of Tensors of shape [batch_size, height, width, in_channels].
-        Input activations to this layer.  List index is towers.
-      outputs_grads: List of Tensors, each of shape [batch_size,
-        height, width, out_channels], which are the gradients of the loss
-        with respect to the layer's outputs.  First index is source, second
-        index is tower.
-      filter_shape: Tuple of 4 ints: (kernel_height, kernel_width, in_channels,
-        out_channels). Represents shape of kernel used in this layer.
-      strides: The stride size in this layer (1-D Tensor of length 4).
-      padding: The padding in this layer (1-D of Tensor length 4).
-      data_format: None or str. Format of conv2d inputs.
-      dilations: None or tuple of 4 ints.
-      has_bias: Python bool. If True, the layer is assumed to have a bias
-        parameter in addition to its filter parameter.
-
-    Raises:
-      ValueError: If inputs, output_grads, and filter_shape do not agree on
-        in_channels or out_channels.
-      ValueError: If strides, dilations are not length-4 lists of ints.
-      ValueError: If data_format does not put channel last.
-    """
-    if not utils.is_data_format_channel_last(data_format):
-      raise ValueError("Channel must be last.")
-    if any(input_.shape.ndims != 4 for input_ in inputs):
-      raise ValueError("inputs must be a list of 4-D Tensors.")
-    if any(input_.shape.as_list()[-1] != filter_shape[-2] for input_ in inputs):
-      raise ValueError("inputs and filter_shape must agree on in_channels.")
-    for i, outputs_grad in enumerate(outputs_grads):
-      if any(output_grad.shape.ndims != 4 for output_grad in outputs_grad):
-        raise ValueError("outputs[%d] must be 4-D Tensor." % i)
-      if any(output_grad.shape.as_list()[-1] != filter_shape[-1]
-             for output_grad in outputs_grad):
-        raise ValueError(
-            "outputs[%d] and filter_shape must agree on out_channels." % i)
-    if len(strides) != 4:
-      raise ValueError("strides must be length-4 list of ints.")
-    if dilations is not None and len(dilations) != 4:
-      raise ValueError("dilations must be length-4 list of ints.")
-
-    self._inputs = inputs
-    self._outputs_grads = outputs_grads
-    self._filter_shape = filter_shape
-    self._strides = strides
-    self._padding = padding
-    self._data_format = data_format
-    self._dilations = dilations
-    self._has_bias = has_bias
-    self._patches = None
-
-    super(ConvDiagonalFactor, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_convdiag_" + scope_string_from_params(
-        tuple(self._inputs) + tuple(nest.flatten(self._outputs_grads)))
-
-  @property
-  def _cov_shape(self):
-    filter_height, filter_width, in_channels, out_channels = self._filter_shape
-    return [
-        filter_height * filter_width * in_channels + self._has_bias,
-        out_channels
-    ]
-
-  @property
-  def _num_sources(self):
-    return len(self._outputs_grads)
-
-  @property
-  def _num_towers(self):
-    return len(self._inputs)
-
-  @property
-  def _dtype(self):
-    return self._inputs[0].dtype
-
-  def make_covariance_update_op(self, ema_decay):
-    filter_height, filter_width, _, _ = self._filter_shape
-
-    # TODO(b/64144716): there is potential here for a big savings in terms
-    # of memory use.
-    if self._dilations is None:
-      rates = (1, 1, 1, 1)
-    else:
-      rates = tuple(self._dilations)
-
-    self._patches = []
-    for tower in range(self._num_towers):
-      with place_on_device(self._get_data_device(tower)):
-        patches = array_ops.extract_image_patches(
-            self._inputs[tower],
-            ksizes=[1, filter_height, filter_width, 1],
-            strides=self._strides,
-            rates=rates,
-            padding=self._padding)
-
-        if self._has_bias:
-          patches = append_homog(patches)
-
-        self._patches.append(patches)
-
-    return super(ConvDiagonalFactor, self).make_covariance_update_op(ema_decay)
-
-  def _compute_new_cov(self, source, tower):
-    patches = self._patches[tower]
-    batch_size = array_ops.shape(patches)[0]
-    outputs_grad = self._outputs_grads[source][tower]
-
-    new_cov = self._convdiag_sum_of_squares(patches, outputs_grad)
-    new_cov /= math_ops.cast(batch_size, new_cov.dtype)
-
-    return new_cov
-
-  def _convdiag_sum_of_squares(self, patches, outputs_grad):
-    # This computes the sum of the squares of the per-training-case "gradients".
-    # It does this simply by computing a giant tensor containing all of these,
-    # doing an entry-wise square, and them summing along the batch dimension.
-    case_wise_gradients = special_math_ops.einsum("bijk,bijl->bkl", patches,
-                                                  outputs_grad)
-    return math_ops.reduce_sum(math_ops.square(case_wise_gradients), axis=0)
-
-  def _get_data_device(self, tower):
-    return self._inputs[tower].device
-
-
-class FullyConnectedKroneckerFactor(DenseSquareMatrixFactor):
-  """Kronecker factor for the input or output side of a fully-connected layer.
-  """
-
-  def __init__(self,
-               tensors,
-               has_bias=False):
-    """Instantiate FullyConnectedKroneckerFactor.
-
-    Args:
-      tensors: List of list of Tensors, each of shape [batch_size, n]. The
-        Tensors are typically either a layer's inputs or its output's gradients.
-        The first list index is source, the second is tower.
-      has_bias: bool. If True, append '1' to each row.
-    """
-    # The tensor argument is either a tensor of input activations or a tensor of
-    # output pre-activation gradients.
-    self._has_bias = has_bias
-    self._tensors = tensors
-    super(FullyConnectedKroneckerFactor, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_fckron_" + scope_string_from_params(
-        tuple(nest.flatten(self._tensors)) + (self._has_bias,))
-
-  @property
-  def _cov_shape(self):
-    size = self._tensors[0][0].shape[1] + self._has_bias
-    return [size, size]
-
-  @property
-  def _num_sources(self):
-    return len(self._tensors)
-
-  @property
-  def _num_towers(self):
-    return len(self._tensors[0])
-
-  @property
-  def _dtype(self):
-    return self._tensors[0][0].dtype
-
-  def _compute_new_cov(self, source, tower):
-    tensor = self._tensors[source][tower]
-    if self._has_bias:
-      tensor = append_homog(tensor)
-    return compute_cov(tensor)
-
-  def _get_data_device(self, tower):
-    return self._tensors[0][tower].device
-
-
-class ConvInputKroneckerFactor(DenseSquareMatrixFactor):
-  r"""Kronecker factor for the input side of a convolutional layer.
-
-  Estimates E[ a a^T ] where a is the inputs to a convolutional layer given
-  example x. Expectation is taken over all examples and locations.
-
-  Equivalent to Omega in https://arxiv.org/abs/1602.01407 for details. See
-  Section 3.1 Estimating the factors.
-  """
-
-  def __init__(self,
-               inputs,
-               filter_shape,
-               padding,
-               strides=None,
-               dilation_rate=None,
-               data_format=None,
-               extract_patches_fn=None,
-               has_bias=False,
-               sub_sample_inputs=None,
-               sub_sample_patches=None):
-    """Initializes ConvInputKroneckerFactor.
-
-    Args:
-      inputs: List of Tensors of shape [batch_size, ..spatial_input_size..,
-        in_channels]. Inputs to layer. List index is tower.
-      filter_shape: List of ints. Contains [..spatial_filter_size..,
-        in_channels, out_channels]. Shape of convolution kernel.
-      padding: str. Padding method for layer. "SAME" or "VALID".
-      strides: List of ints or None. Contains [..spatial_filter_strides..] if
-        'extract_patches_fn' is compatible with tf.nn.convolution(), else
-        [1, ..spatial_filter_strides, 1].
-      dilation_rate: List of ints or None. Rate for dilation along each spatial
-        dimension if 'extract_patches_fn' is compatible with
-        tf.nn.convolution(), else [1, ..spatial_dilation_rates.., 1].
-      data_format: str or None. Format of input data.
-      extract_patches_fn: str or None. Name of function that extracts image
-        patches. One of "extract_convolution_patches", "extract_image_patches",
-        "extract_pointwise_conv2d_patches".
-      has_bias: bool. If True, append 1 to in_channel.
-      sub_sample_inputs: `bool`. If True, then subsample the inputs from which
-        the image patches are extracted. (Default: None)
-      sub_sample_patches: `bool`, If `True` then subsample the extracted
-        patches.(Default: None)
-    """
-    self._inputs = inputs
-    self._filter_shape = filter_shape
-    self._strides = strides
-    self._padding = padding
-    self._dilation_rate = dilation_rate
-    self._data_format = data_format
-    self._extract_patches_fn = extract_patches_fn
-    self._has_bias = has_bias
-    if sub_sample_inputs is None:
-      self._sub_sample_inputs = _SUB_SAMPLE_INPUTS
-    else:
-      self._sub_sample_inputs = sub_sample_inputs
-
-    if sub_sample_patches is None:
-      self._sub_sample_patches = _SUB_SAMPLE_OUTER_PRODUCTS
-    else:
-      self._sub_sample_patches = sub_sample_patches
-    super(ConvInputKroneckerFactor, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_convinkron_" + scope_string_from_params(
-        tuple(self._inputs) +
-        tuple((self._filter_shape, self._strides, self._padding,
-               self._dilation_rate, self._data_format, self._has_bias)))
-
-  @property
-  def _cov_shape(self):
-    spatial_filter_shape = self._filter_shape[0:-2]
-    in_channels = self._filter_shape[-2]
-    size = np.prod(spatial_filter_shape) * in_channels + self._has_bias
-    return [size, size]
-
-  @property
-  def _num_sources(self):
-    return 1
-
-  @property
-  def _num_towers(self):
-    return len(self._inputs)
-
-  @property
-  def _dtype(self):
-    return self._inputs[0].dtype
-
-  def _compute_new_cov(self, source, tower):
-    assert source == 0
-
-    inputs = self._inputs[tower]
-    if self._sub_sample_inputs:
-      batch_size = inputs.shape.as_list()[0]
-      max_size = int(batch_size * _INPUTS_TO_EXTRACT_PATCHES_FACTOR)
-      inputs = _random_tensor_gather(inputs, max_size)
-
-    # TODO(b/64144716): there is potential here for a big savings in terms of
-    # memory use.
-    if self._extract_patches_fn in [None, "extract_convolution_patches"]:
-      patches = utils.extract_convolution_patches(
-          inputs,
-          self._filter_shape,
-          padding=self._padding,
-          strides=self._strides,
-          dilation_rate=self._dilation_rate,
-          data_format=self._data_format)
-
-    elif self._extract_patches_fn == "extract_image_patches":
-      assert inputs.shape.ndims == 4
-      assert len(self._filter_shape) == 4
-      assert len(self._strides) == 4, self._strides
-      if self._dilation_rate is None:
-        rates = [1, 1, 1, 1]
-      else:
-        rates = self._dilation_rate
-        assert len(rates) == 4
-        assert rates[0] == rates[-1] == 1
-      patches = array_ops.extract_image_patches(
-          inputs,
-          ksizes=[1] + list(self._filter_shape[0:-2]) + [1],
-          strides=self._strides,
-          rates=rates,
-          padding=self._padding)
-
-    elif self._extract_patches_fn == "extract_pointwise_conv2d_patches":
-      assert self._strides in [None, [1, 1, 1, 1], (1, 1, 1, 1)]
-      assert self._filter_shape[0] == self._filter_shape[1] == 1
-      patches = utils.extract_pointwise_conv2d_patches(
-          inputs, self._filter_shape, data_format=None)
-
-    else:
-      raise NotImplementedError(self._extract_patches_fn)
-
-    flatten_size = np.prod(self._filter_shape[0:-1])
-    # patches_flat below is the matrix [[A_l]] from the KFC paper (tilde
-    # omitted over A for clarity). It has shape M|T| x J|Delta| (eq. 14),
-    # where M = minibatch size, |T| = number of spatial locations,
-    # |Delta| = number of spatial offsets, and J = number of input maps
-    # for convolutional layer l.
-    patches_flat = array_ops.reshape(patches, [-1, flatten_size])
-
-    # We append a homogenous coordinate to patches_flat if the layer has
-    # bias parameters. This gives us [[A_l]]_H from the paper.
-    if self._sub_sample_patches:
-      patches_flat = _subsample_for_cov_computation(patches_flat)
-
-    if self._has_bias:
-      patches_flat = append_homog(patches_flat)
-    # We call compute_cov without passing in a normalizer. compute_cov uses
-    # the first dimension of patches_flat i.e. M|T| as the normalizer by
-    # default. Hence we end up computing 1/M|T| * [[A_l]]^T [[A_l]], with
-    # shape J|Delta| x J|Delta|. This is related to hat{Omega}_l from
-    # the paper but has a different scale here for consistency with
-    # ConvOutputKroneckerFactor.
-    # (Tilde omitted over A for clarity.)
-    return compute_cov(patches_flat)
-
-  def _get_data_device(self, tower):
-    return self._inputs[tower].device
-
-
-class ConvOutputKroneckerFactor(DenseSquareMatrixFactor):
-  r"""Kronecker factor for the output side of a convolutional layer.
-
-  Estimates E[ ds ds^T ] where s is the preactivations of a convolutional layer
-  given example x and ds = (d / d s) log(p(y|x, w)). Expectation is taken over
-  all examples and locations.
-
-  Equivalent to Gamma in https://arxiv.org/abs/1602.01407 for details. See
-  Section 3.1 Estimating the factors.
-  """
-
-  def __init__(self, outputs_grads, data_format=None):
-    """Initializes ConvOutputKroneckerFactor.
-
-    Args:
-      outputs_grads: List of list of Tensors. Each Tensor is of shape
-          [batch_size, ..spatial_input_size.., out_channels].  First list index
-          is source, the second is tower.
-      data_format: None or str. Format of outputs_grads.
-
-    Raises:
-      ValueError: If channels are not final dimension.
-    """
-    if not utils.is_data_format_channel_last(data_format):
-      raise ValueError("Channel must be last.")
-    self._out_channels = outputs_grads[0][0].shape.as_list()[-1]
-    self._outputs_grads = outputs_grads
-    super(ConvOutputKroneckerFactor, self).__init__()
-
-  @property
-  def _var_scope(self):
-    return "ff_convoutkron_" + scope_string_from_params(
-        nest.flatten(self._outputs_grads))
-
-  @property
-  def _cov_shape(self):
-    size = self._out_channels
-    return [size, size]
-
-  @property
-  def _num_sources(self):
-    return len(self._outputs_grads)
-
-  @property
-  def _num_towers(self):
-    return len(self._outputs_grads[0])
-
-  @property
-  def _dtype(self):
-    return self._outputs_grads[0][0].dtype
-
-  def _compute_new_cov(self, source, tower):
-    outputs_grad = self._outputs_grads[source][tower]
-
-    # reshaped_tensor below is the matrix DS_l defined in the KFC paper
-    # (tilde omitted over S for clarity). It has shape M|T| x I, where
-    # M = minibatch size, |T| = number of spatial locations, and
-    # I = number of output maps for convolutional layer l.
-    reshaped_tensor = array_ops.reshape(outputs_grad, [-1, self._out_channels])
-    # Following the reasoning in ConvInputKroneckerFactor._compute_new_cov,
-    # compute_cov here returns 1/M|T| * DS_l^T DS_l = hat{Gamma}_l
-    # as defined in the paper, with shape I x I.
-    # (Tilde omitted over S for clarity.)
-    return compute_cov(reshaped_tensor)
-
-  def _get_data_device(self, tower):
-    return self._outputs_grads[0][tower].device
-
-
-class FullyConnectedMultiKF(FullyConnectedKroneckerFactor):
-  """Kronecker factor for a fully connected layer used multiple times."""
-
-  def __init__(self,
-               tensors,
-               num_uses=None,
-               has_bias=False):
-    """Constructs a new `FullyConnectedMultiKF`.
-
-    Args:
-      tensors: List of list of Tensors of shape, each of shape
-        [num_uses * batch_size, n], and is a reshape version of a Tensor of
-        shape [num_uses, batch_size, n]. Each of these tensors is usually a
-        layer's inputs or its output's gradients. The first list index is
-        sources, the second is towers.
-      num_uses: int. The number of time-steps / uses.
-      has_bias: bool. If True, '1' is appended to each row.
-    """
-
-    self._num_uses = num_uses
-
-    self._cov_dt1 = None
-    self._make_cov_dt1 = False
-    self._option1quants_by_damping = {}
-    self._option2quants_by_damping = {}
-    self._option1quants_registrations = set()
-    self._option2quants_registrations = set()
-
-    super(FullyConnectedMultiKF, self).__init__(tensors=tensors,
-                                                has_bias=has_bias)
-
-  @property
-  def _num_timesteps(self):
-    return self._num_uses
-
-  @property
-  def _var_scope(self):
-    return "ff_fc_multi_" + scope_string_from_params(
-        tuple(nest.flatten(self._tensors))
-        + (self._num_timesteps, self._has_bias,))
-
-  def make_covariance_update_op(self, ema_decay):
-
-    op = super(FullyConnectedMultiKF, self).make_covariance_update_op(ema_decay)
-
-    if self._cov_dt1 is not None:
-      new_cov_dt1_contribs = []
-      for source in range(self._num_sources):
-        for tower in range(self._num_towers):
-          with place_on_device(self._get_data_device(tower)):
-            new_cov_dt1_contribs.append(self._compute_new_cov_dt1(source,
-                                                                  tower))
-
-      new_cov_dt1 = (math_ops.add_n(new_cov_dt1_contribs)
-                     / float(self._num_towers))
-
-      # See comments in FisherFactor.make_covariance_update_op() for details.
-      if utils.on_tpu():
-        new_cov_dt1 = utils.cross_replica_mean(new_cov_dt1)
-
-      op2 = moving_averages.assign_moving_average(
-          self._cov_dt1, new_cov_dt1, ema_decay, zero_debias=ZERO_DEBIAS)
-
-      # TODO(b/69112164):
-      # It's important that _cov and _cov_dt1 remain consistent with each
-      # other while the inverse ops are happening. How can we ensure this?
-      # We will need to add explicit synchronization for this to
-      # work with asynchronous training.
-      op = control_flow_ops.group(op, op2)
-
-    return op
-
-  def _compute_new_cov_dt1(self, source, tower):  # pylint: disable=missing-docstring
-    tensor = self._tensors[source][tower]
-    if self._has_bias:
-      # This appending is technically done twice (the other time is for
-      # _compute_new_cov())
-      tensor = append_homog(tensor)
-
-    total_len = array_ops.shape(tensor)[0]
-    batch_size = total_len // self._num_timesteps
-
-    tensor_present = tensor[:-batch_size, :]
-    tensor_future = tensor[batch_size:, :]
-
-    # We specify a normalizer for this computation to ensure a PSD Fisher
-    # block estimate.  This is equivalent to padding with zeros, as was done
-    # in Section B.2 of the appendix.
-    return compute_cov(
-        tensor_future, tensor_right=tensor_present, normalizer=total_len)
-
-  def _get_data_device(self, tower):
-    return self._tensors[0][tower].device
-
-  @property
-  def _vec_shape(self):
-    size = self._tensors[0][0].shape[1] + self._has_bias
-    return [size]
-
-  def get_option1quants(self, damping_func):
-    damping_id = graph_func_to_id(damping_func)
-    return self._option1quants_by_damping[damping_id]
-
-  def get_option2quants(self, damping_func):
-    damping_id = graph_func_to_id(damping_func)
-    return self._option2quants_by_damping[damping_id]
-
-  def get_cov_dt1(self):
-    assert self._cov_dt1 is not None
-    return self._cov_dt1
-
-  def register_cov_dt1(self):
-    self._make_cov_dt1 = True
-
-  def instantiate_cov_variables(self):
-    super(FullyConnectedMultiKF, self).instantiate_cov_variables()
-    assert self._cov_dt1 is None
-    if self._make_cov_dt1:
-      with variable_scope.variable_scope(self._var_scope):
-        self._cov_dt1 = variable_scope.get_variable(
-            "cov_dt1",
-            initializer=init_ops.zeros_initializer,
-            shape=self._cov_shape,
-            trainable=False,
-            dtype=self._dtype)
-
-  def register_option1quants(self, damping_func):
-    damping_id = self._register_damping(damping_func)
-    if damping_id not in self._option1quants_registrations:
-      self._option1quants_registrations.add(damping_id)
-
-  def register_option2quants(self, damping_func):
-    damping_id = self._register_damping(damping_func)
-    if damping_id not in self._option2quants_registrations:
-      self._option2quants_registrations.add(damping_id)
-
-  def instantiate_inv_variables(self):
-    super(FullyConnectedMultiKF, self).instantiate_inv_variables()
-
-    for damping_id in self._option1quants_registrations:
-      damping_func = self._damping_funcs_by_id[damping_id]
-      damping_string = graph_func_to_string(damping_func)
-      # It's questionable as to whether we should initialize with stuff like
-      # this at all.  Ideally these values should never be used until they are
-      # updated at least once.
-      with variable_scope.variable_scope(self._var_scope):
-        Lmat = variable_scope.get_variable(  # pylint: disable=invalid-name
-            "Lmat_damp{}".format(damping_string),
-            initializer=inverse_initializer,
-            shape=self._cov_shape,
-            trainable=False,
-            dtype=self._dtype)
-        psi = variable_scope.get_variable(
-            "psi_damp{}".format(damping_string),
-            initializer=init_ops.ones_initializer,
-            shape=self._vec_shape,
-            trainable=False,
-            dtype=self._dtype)
-
-      assert damping_id not in self._option1quants_by_damping
-      self._option1quants_by_damping[damping_id] = (Lmat, psi)
-
-    for damping_id in self._option2quants_registrations:
-      damping_func = self._damping_funcs_by_id[damping_id]
-      damping_string = graph_func_to_string(damping_func)
-      # It's questionable as to whether we should initialize with stuff like
-      # this at all.  Ideally these values should never be used until they are
-      # updated at least once.
-      with variable_scope.variable_scope(self._var_scope):
-        Pmat = variable_scope.get_variable(  # pylint: disable=invalid-name
-            "Lmat_damp{}".format(damping_string),
-            initializer=inverse_initializer,
-            shape=self._cov_shape,
-            trainable=False,
-            dtype=self._dtype)
-        Kmat = variable_scope.get_variable(  # pylint: disable=invalid-name
-            "Kmat_damp{}".format(damping_string),
-            initializer=inverse_initializer,
-            shape=self._cov_shape,
-            trainable=False,
-            dtype=self._dtype)
-        mu = variable_scope.get_variable(
-            "mu_damp{}".format(damping_string),
-            initializer=init_ops.ones_initializer,
-            shape=self._vec_shape,
-            trainable=False,
-            dtype=self._dtype)
-
-      assert damping_id not in self._option2quants_by_damping
-      self._option2quants_by_damping[damping_id] = (Pmat, Kmat, mu)
-
-  def make_inverse_update_ops(self):
-    """Create and return update ops corresponding to registered computations."""
-    # TODO(b/69918258): Add correctness tests for this method.
-    # pylint: disable=invalid-name
-
-    ops = []
-
-    if (len(self._option1quants_by_damping) +
-        len(self._option2quants_by_damping)):
-
-      # Note that C0 and C1 are stand-ins for A0 and A1, or G0 and G1, from
-      # the pseudo-code in the original paper.  Because the computations for
-      # the A and G case are essentially the same they can both be performed by
-      # the same class (this one).
-
-      C1 = self.get_cov_dt1()
-
-      # Get the eigendecomposition of C0  (= self.get_cov())
-      eigen_e, eigen_V = self.get_eigendecomp()
-
-      # TODO(b/69678661): Note, there is an implicit assumption here that C1
-      # and C0 (as represented here by its eigen-decomp) are consistent.  This
-      # could fail to be the case if self._cov and self._cov_dt1 are not updated
-      # consistently, or are somehow read between or during the cov updates.
-      # Can this possibly happen?  Is there a way to prevent it?
-
-      for damping_id, (Lmat_var,
-                       psi_var) in self._option1quants_by_damping.items():
-
-        damping = self._damping_funcs_by_id[damping_id]()
-        damping = math_ops.cast(damping, self._dtype)
-
-        invsqrtC0 = math_ops.matmul(
-            eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
-
-        # Might need to enforce symmetry lost due to numerical issues.
-        invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
-
-        # The following line imposses the symmetry assumed by "Option 1" on C1.
-        # Stangely the code can work okay with this line commented out,
-        # depending on how psd_eig is defined.  I'm not sure why.
-        C1 = (C1 + array_ops.transpose(C1)) / 2.0
-
-        # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means hat{Psi})
-        hPsi = math_ops.matmul(math_ops.matmul(invsqrtC0, C1), invsqrtC0)
-
-        # Compute the decomposition U*diag(psi)*U^T = hPsi
-        psi, U = utils.posdef_eig(hPsi)
-
-        # L = C0^(-1/2) * U
-        Lmat = math_ops.matmul(invsqrtC0, U)
-
-        ops.append(Lmat_var.assign(Lmat))
-        ops.append(psi_var.assign(psi))
-
-      for damping_id, (Pmat_var, Kmat_var,
-                       mu_var) in self._option2quants_by_damping.items():
-
-        damping = self._damping_funcs_by_id[damping_id]()
-        damping = math_ops.cast(damping, self._dtype)
-
-        # compute C0^(-1/2)
-        invsqrtC0 = math_ops.matmul(
-            eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
-
-        # Might need to enforce symmetry lost due to numerical issues.
-        invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
-
-        # Compute the product C0^(-1/2) * C1
-        invsqrtC0C1 = math_ops.matmul(invsqrtC0, C1)
-
-        # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means hat{Psi})
-        hPsi = math_ops.matmul(invsqrtC0C1, invsqrtC0)
-
-        # Compute the decomposition E*diag(mu)*E^T = hPsi^T * hPsi
-        # Note that we using the notation mu instead of "m" for the eigenvalues.
-        # Instead of computing the product hPsi^T * hPsi and then doing an
-        # eigen-decomposition of this we just compute the SVD of hPsi and then
-        # square the singular values to get the eigenvalues. For a justification
-        # of this approach, see:
-        # https://en.wikipedia.org/wiki/Singular-value_decomposition#Relation_to_eigenvalue_decomposition
-        sqrtmu, _, E = linalg_ops.svd(hPsi)
-        mu = math_ops.square(sqrtmu)
-
-        # Mathematically, the eigenvalues should not should not exceed 1.0, but
-        # due to numerical issues, or possible issues with inconsistent
-        # values of C1 and (the eigen-decomposition of) C0 they might. So
-        # we enforce this condition.
-        mu = math_ops.minimum(mu, 1.0)
-
-        # P = (C0^(-1/2) * C1)^T * C0^(-1/2) = C_1^T * C_0^(-1)
-        Pmat = math_ops.matmul(invsqrtC0C1, invsqrtC0, transpose_a=True)
-
-        # K = C_0^(-1/2) * E
-        Kmat = math_ops.matmul(invsqrtC0, E)
-
-        ops.append(Pmat_var.assign(Pmat))
-        ops.append(Kmat_var.assign(Kmat))
-        ops.append(mu_var.assign(mu))
-
-    ops += super(FullyConnectedMultiKF, self).make_inverse_update_ops()
-    return [control_flow_ops.group(*ops)]
-
-    # pylint: enable=invalid-name
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors_lib.py b/tensorflow/contrib/kfac/python/ops/fisher_factors_lib.py
deleted file mode 100644
index 2d8e378a93..0000000000
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors_lib.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""FisherFactor definitions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.fisher_factors import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    "inverse_initializer", "covariance_initializer",
-    "diagonal_covariance_initializer", "scope_string_from_params",
-    "scope_string_from_name", "scalar_or_tensor_to_string", "FisherFactor",
-    "InverseProvidingFactor", "FullFactor", "DiagonalFactor",
-    "NaiveDiagonalFactor", "EmbeddingInputKroneckerFactor",
-    "FullyConnectedDiagonalFactor", "FullyConnectedKroneckerFactor",
-    "ConvInputKroneckerFactor", "ConvOutputKroneckerFactor",
-    "ConvDiagonalFactor", "set_global_constants", "maybe_colocate_with",
-    "compute_cov", "append_homog"
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
deleted file mode 100644
index cbbfe7212c..0000000000
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ /dev/null
@@ -1,1269 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Registry for layers and their parameters/variables.
-
-This represents the collection of all layers in the approximate Fisher
-information matrix to which a particular FisherBlock may belong. That is, we
-might have several layer collections for one TF graph (if we have multiple K-FAC
-optimizers being used, for example.)
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import defaultdict
-from collections import OrderedDict
-from contextlib import contextmanager
-from functools import partial
-import warnings
-
-import math
-import six
-
-from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb
-from tensorflow.contrib.kfac.python.ops import loss_functions as lf
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.util import nest
-
-# Names for various approximations that can be requested for Fisher blocks.
-APPROX_KRONECKER_NAME = "kron"
-APPROX_DIAGONAL_NAME = "diagonal"
-APPROX_FULL_NAME = "full"
-
-_GENERIC_APPROX_TO_BLOCK_TYPES = {
-    APPROX_FULL_NAME: fb.FullFB,
-    APPROX_DIAGONAL_NAME: fb.NaiveDiagonalFB,
-}
-
-_FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES = {
-    APPROX_KRONECKER_NAME: fb.FullyConnectedKFACBasicFB,
-    APPROX_DIAGONAL_NAME: fb.FullyConnectedDiagonalFB,
-}
-
-_CONV2D_APPROX_TO_BLOCK_TYPES = {
-    APPROX_KRONECKER_NAME: fb.ConvKFCBasicFB,
-    APPROX_DIAGONAL_NAME: fb.ConvDiagonalFB,
-}
-
-_EMBEDDING_APPROX_TO_BLOCK_TYPES = {
-    APPROX_KRONECKER_NAME: fb.EmbeddingKFACFB
-}
-
-APPROX_KRONECKER_INDEP_NAME = "kron_indep"
-APPROX_KRONECKER_SERIES_1_NAME = "kron_series_1"
-APPROX_KRONECKER_SERIES_2_NAME = "kron_series_2"
-
-_FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES = {
-    APPROX_KRONECKER_INDEP_NAME: fb.FullyConnectedMultiIndepFB,
-    APPROX_KRONECKER_SERIES_1_NAME: partial(fb.FullyConnectedSeriesFB,
-                                            option=1),
-    APPROX_KRONECKER_SERIES_2_NAME: partial(fb.FullyConnectedSeriesFB,
-                                            option=2)
-}
-
-_CONV2D_MULTI_APPROX_TO_BLOCK_TYPES = {
-    APPROX_KRONECKER_INDEP_NAME: fb.ConvKFCBasicMultiIndepFB
-}
-
-_EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES = {
-    APPROX_KRONECKER_INDEP_NAME: fb.EmbeddingKFACMultiIndepFB
-}
-
-# Possible value for `reuse` keyword argument. Sets `reuse` to
-# tf.get_variable_scope().reuse.
-VARIABLE_SCOPE = "VARIABLE_SCOPE"
-
-_DEFAULT_LAYER_COLLECTION = None
-
-
-def get_default_layer_collection():
-  """Get default LayerCollection."""
-  if _DEFAULT_LAYER_COLLECTION is None:
-    raise ValueError(
-        "Attempted to retrieve default LayerCollection when none is set. Use "
-        "LayerCollection.as_default().")
-
-  return _DEFAULT_LAYER_COLLECTION
-
-
-def set_default_layer_collection(layer_collection):
-  global _DEFAULT_LAYER_COLLECTION
-
-  if _DEFAULT_LAYER_COLLECTION is not None and layer_collection is not None:
-    raise ValueError("Default LayerCollection is already set.")
-
-  _DEFAULT_LAYER_COLLECTION = layer_collection
-
-
-class LayerParametersDict(OrderedDict):
-  """An OrderedDict where keys are Tensors or tuples of Tensors.
-
-  Ensures that no Tensor is associated with two different keys.
-  """
-
-  def __init__(self, *args, **kwargs):
-    self._tensors = set()
-    super(LayerParametersDict, self).__init__(*args, **kwargs)
-
-  def __setitem__(self, key, value):
-    key = self._canonicalize_key(key)
-    tensors = key if isinstance(key, (tuple, list)) else (key,)
-    key_collisions = self._tensors.intersection(tensors)
-    if key_collisions:
-      raise ValueError("Key(s) already present: {}".format(key_collisions))
-    self._tensors.update(tensors)
-    super(LayerParametersDict, self).__setitem__(key, value)
-
-  def __delitem__(self, key):
-    key = self._canonicalize_key(key)
-    self._tensors.remove(key)
-    super(LayerParametersDict, self).__delitem__(key)
-
-  def __getitem__(self, key):
-    key = self._canonicalize_key(key)
-    return super(LayerParametersDict, self).__getitem__(key)
-
-  def __contains__(self, key):
-    key = self._canonicalize_key(key)
-    return super(LayerParametersDict, self).__contains__(key)
-
-  def _canonicalize_key(self, key):
-    if isinstance(key, (list, tuple)):
-      return tuple(key)
-    return key
-
-
-# TODO(b/68034464): add capability for LayerCollection to be "finalized"
-# and do this when it gets used by FisherEstimator / KfacOptimizer.
-
-
-class LayerCollection(object):
-  """Registry of information about layers and losses.
-
-  Note that you need to create a new one of these for each MatrixEstimator or
-  KfacOptimizer.
-
-  Attributes:
-    fisher_blocks: a LayersParamsDict (subclass of OrderedDict) mapping layer
-        parameters (Tensors or tuples of Tensors) to FisherBlock instances.
-    fisher_factors: an OrderedDict mapping tuples to FisherFactor instances.
-    losses: a list of LossFunction objects. The loss to be optimized is their
-        sum.
-    loss_colocation_ops: ops to colocate loss function evaluations with.  These
-        will typically be the inputs to the losses.
-  """
-
-  def __init__(self,
-               graph=None,
-               name="LayerCollection"):
-    warnings.warn(
-        "tf.contrib.kfac is deprecated and will be removed by 2018-11-01. "
-        "Use https://pypi.python.org/pypi/kfac instead.")
-    self.fisher_blocks = LayerParametersDict()
-    self.fisher_factors = OrderedDict()
-    self._linked_parameters = dict(
-    )  # dict mapping sets of variables to optionally specified approximations.
-    self._graph = graph or ops.get_default_graph()
-    self._loss_dict = {}  # {str: LossFunction}
-    self._subgraph = None
-    self._default_generic_approximation = APPROX_DIAGONAL_NAME
-    self._default_embedding_approximation = APPROX_KRONECKER_NAME
-    self._default_fully_connected_approximation = APPROX_KRONECKER_NAME
-    self._default_conv2d_approximation = APPROX_KRONECKER_NAME
-    self._default_fully_connected_multi_approximation = (
-        APPROX_KRONECKER_INDEP_NAME)
-    self._default_conv2d_multi_approximation = (
-        APPROX_KRONECKER_INDEP_NAME)
-    self._default_embedding_multi_approximation = APPROX_KRONECKER_INDEP_NAME
-    self.loss_colocation_ops = {}
-    self._vars_to_uses = defaultdict(lambda: 0)
-
-    with variable_scope.variable_scope(None, default_name=name) as scope:
-      self._var_scope = scope.name
-
-  @property
-  def losses(self):
-    """Tuple of LossFunction objects registered with this LayerCollection."""
-    return nest.flatten(self.towers_by_loss)
-
-  @property
-  def towers_by_loss(self):
-    """Tuple across losses of LossFunction objects registered to each tower."""
-    return tuple(tuple(lst) for lst in self._loss_dict.values())
-
-  @property
-  def registered_variables(self):
-    """A tuple of all of the variables currently registered."""
-    tuple_of_tuples = (utils.ensure_sequence(key) for key, block
-                       in six.iteritems(self.fisher_blocks))
-    flat_tuple = tuple(item for tuple_ in tuple_of_tuples for item in tuple_)
-    return flat_tuple
-
-  @property
-  def linked_parameters(self):
-    """Groups of parameters with an optionally specified approximation.
-
-    Linked parameters can be added using `define_linked_parameters`.
-    If an approximation is specified, then this approximation will be used
-    when registering a layer with exactly these parameters, unless an
-    approximation is specified when calling the registration function.
-
-    Returns:
-      A `dict` mapping tuples of parameters to an optional string.
-    """
-    return self._linked_parameters
-
-  @property
-  def default_embedding_approximation(self):
-    return self._default_embedding_approximation
-
-  def set_default_embedding_approximation(self, value):
-    if value != APPROX_KRONECKER_NAME:
-      raise ValueError(
-          "{} is not a valid approximation for embedding variables.".format(
-              value))
-    self._default_embedding_approximation = value
-
-  @property
-  def default_generic_approximation(self):
-    return self._default_generic_approximation
-
-  def set_default_generic_approximation(self, value):
-    if value not in _GENERIC_APPROX_TO_BLOCK_TYPES:
-      raise ValueError(
-          "{} is not a valid approximation for generic variables.".format(
-              value))
-    self._default_generic_approximation = value
-
-  @property
-  def default_fully_connected_approximation(self):
-    return self._default_fully_connected_approximation
-
-  def set_default_fully_connected_approximation(self, value):
-    if value not in _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES:
-      raise ValueError(
-          "{} is not a valid approximation for fully connected layers.".format(
-              value))
-    self._default_fully_connected_approximation = value
-
-  @property
-  def default_conv2d_approximation(self):
-    return self._default_conv2d_approximation
-
-  def set_default_conv2d_approximation(self, value):
-    if value not in _CONV2D_APPROX_TO_BLOCK_TYPES:
-      raise ValueError(
-          "{} is not a valid approximation for 2d convolutional layers.".format(
-              value))
-    self._default_conv2d_approximation = value
-
-  @property
-  def default_fully_connected_multi_approximation(self):
-    return self._default_fully_connected_multi_approximation
-
-  def set_default_fully_connected_multi_approximation(self, value):
-    if value not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
-      raise ValueError("{} is not a valid approximation for a fully-connected "
-                       "multi layer.".format(value))
-    self._default_fully_connected_multi_approximation = value
-
-  @property
-  def default_conv2d_multi_approximation(self):
-    return self._default_conv2d_multi_approximation
-
-  @property
-  def default_embedding_multi_approximation(self):
-    return self._default_embedding_multi_approximation
-
-  def register_block(self, layer_key, fisher_block, reuse=VARIABLE_SCOPE):
-    """Validates and registers the layer_key associated with the fisher_block.
-
-    Args:
-      layer_key: A variable or tuple of variables. The key to check for in
-          existing registrations and to register if valid.
-      fisher_block: The associated `FisherBlock`.
-      reuse: Method to use for inserting new `FisherBlock's. One of True, False,
-        or `VARIABLE_SCOPE`.
-
-    Raises:
-      ValueError: If `layer_key` was already registered and reuse is `False`,
-        if `layer_key` was registered with a different block type, or if
-        `layer_key` shares any variables with but is not equal to a previously
-        registered key.
-      KeyError: If `reuse` is `True` but `layer_key` was not previously
-        registered.
-
-    Returns:
-      The `FisherBlock` registered under `layer_key`. If `layer_key` was already
-      registered, this will be the previously registered `FisherBlock`.
-    """
-    if reuse is VARIABLE_SCOPE:
-      reuse = variable_scope.get_variable_scope().reuse
-
-    if reuse is True or (reuse is variable_scope.AUTO_REUSE and
-                         layer_key in self.fisher_blocks):
-      result = self.fisher_blocks[layer_key]
-      if type(result) != type(fisher_block):  # pylint: disable=unidiomatic-typecheck
-        raise ValueError(
-            "Attempted to register FisherBlock of type %s when existing "
-            "FisherBlock has type %s." % (type(fisher_block), type(result)))
-      return result
-    if reuse is False and layer_key in self.fisher_blocks:
-      raise ValueError("FisherBlock for %s is already in LayerCollection." %
-                       (layer_key,))
-
-    # Insert fisher_block into self.fisher_blocks.
-    if layer_key in self.fisher_blocks:
-      raise ValueError("Duplicate registration: {}".format(layer_key))
-    # Raise an error if any variable in layer_key has been registered in any
-    # other blocks.
-    variable_to_block = {
-        var: (params, block)
-        for (params, block) in self.fisher_blocks.items()
-        for var in utils.ensure_sequence(params)
-    }
-    for variable in utils.ensure_sequence(layer_key):
-      if variable in variable_to_block:
-        prev_key, prev_block = variable_to_block[variable]
-        raise ValueError(
-            "Attempted to register layer_key {} with block {}, but variable {}"
-            " was already registered in key {} with block {}.".format(
-                layer_key, fisher_block, variable, prev_key, prev_block))
-    self.fisher_blocks[layer_key] = fisher_block
-    return fisher_block
-
-  def register_loss_function(self,
-                             loss,
-                             colocation_op,
-                             base_name,
-                             name=None,
-                             reuse=VARIABLE_SCOPE):
-    """Registers a LossFunction object.
-
-    Args:
-      loss: The LossFunction object.
-      colocation_op: The op to colocate the loss function's computations with.
-      base_name: The name to derive a new unique name from is the name argument
-        is None.
-      name: (OPTIONAL) str or None. Unique name for this loss function. If None,
-        a new name is generated. (Default: None)
-      reuse: (OPTIONAL) bool or str.  If True, adds `loss` as an additional
-        tower for the existing loss function.
-
-    Raises:
-      ValueError: If reuse == True and name == None.
-      ValueError: If reuse == True and seed != None.
-      KeyError: If reuse == True and no existing LossFunction with `name` found.
-      KeyError: If reuse == False and existing LossFunction with `name` found.
-    """
-
-    name = name or self._graph.unique_name(base_name)
-
-    if reuse == VARIABLE_SCOPE:
-      reuse = variable_scope.get_variable_scope().reuse
-
-    if reuse:
-      if name is None:
-        raise ValueError(
-            "If reuse is enabled, loss function's name must be set.")
-
-      loss_list = self._loss_dict.get(name, None)
-
-      if loss_list is None:
-        raise KeyError(
-            "Unable to find loss function named {}. Register a new loss "
-            "function with reuse=False.".format(name))
-    else:
-      if name in self._loss_dict:
-        raise KeyError(
-            "Loss function named {} already exists. Set reuse=True to append "
-            "another tower.".format(name))
-
-      loss_list = []
-      self._loss_dict[name] = loss_list
-
-    loss_list.append(loss)
-    self.loss_colocation_ops[loss] = colocation_op
-
-  def _get_use_count_map(self):
-    """Returns a dict mapping variables to their number of registrations."""
-    return self._vars_to_uses
-
-  def _add_uses(self, params, uses):
-    """Register additional uses by params in the graph.
-
-    Args:
-      params: Variable or tuple of Variables. Parameters for a layer.
-      uses: int or float. Number of additional uses for these parameters.
-    """
-    params = params if isinstance(params, (tuple, list)) else (params,)
-    for var in params:
-      self._vars_to_uses[var] += uses
-
-  def check_registration(self, variables):
-    """Checks that all variable uses have been registered properly.
-
-    Args:
-      variables: List of variables.
-
-    Raises:
-      ValueError: If any registered variables are not included in the list.
-      ValueError: If any variable in the list is not registered.
-      ValueError: If any variable in the list is registered with the wrong
-          number of "uses" in the subgraph recorded (vs the number of times that
-          variable is actually used in the subgraph).
-    """
-    # Note that overlapping parameters (i.e. those that share variables) will
-    # be caught by layer_collection.LayerParametersDict during registration.
-
-    reg_use_map = self._get_use_count_map()
-
-    error_messages = []
-
-    for var in variables:
-      total_uses = self.subgraph.variable_uses(var)
-      reg_uses = reg_use_map[var]
-
-      if reg_uses == 0:
-        error_messages.append("Variable {} not registered.".format(var))
-      elif (not math.isinf(reg_uses)) and reg_uses != total_uses:
-        error_messages.append(
-            "Variable {} registered with wrong number of uses ({} "
-            "registrations vs {} uses).".format(var, reg_uses, total_uses))
-
-    num_get_vars = len(reg_use_map)
-
-    if num_get_vars > len(variables):
-      error_messages.append("{} registered variables were not included in list."
-                            .format(num_get_vars - len(variables)))
-
-    if error_messages:
-      error_messages = [
-          "Found the following errors with variable registration:"
-      ] + error_messages
-      raise ValueError("\n\t".join(error_messages))
-
-  def get_blocks(self):
-    return self.fisher_blocks.values()
-
-  def get_factors(self):
-    return self.fisher_factors.values()
-
-  @property
-  def graph(self):
-    return self._graph
-
-  @property
-  def subgraph(self):
-    return self._subgraph
-
-  def define_linked_parameters(self, params, approximation=None):
-    """Identify a set of parameters that should be grouped together.
-
-    During automatic graph scanning, any matches containing variables that have
-    been identified as part of a linked group will be filtered out unless
-    the match parameters are exactly equal to the ones specified in the linked
-    group.
-
-    Args:
-      params: A variable, or a tuple or list of variables. The variables
-        to be linked.
-      approximation: Optional string specifying the type of approximation to use
-        for these variables. If unspecified, this layer collection's default
-        approximation for the layer type will be used.
-
-    Raises:
-      ValueError: If the parameters were already registered in a layer or
-        identified as part of an incompatible group.
-    """
-    params = frozenset(utils.ensure_sequence(params))
-
-    # Check if any of the variables in `params` is already in
-    # 'self.fisher_blocks.keys()`.
-    for registered_params, fisher_block in self.fisher_blocks.items():
-      registered_params_set = set(utils.ensure_sequence(registered_params))
-      for variable in params:
-        if (variable in registered_params_set and
-            params != registered_params_set):
-          raise ValueError(
-              "Can`t link parameters {}, variable {} was already registered in "
-              "group {} with layer {}".format(params, variable,
-                                              registered_params, fisher_block))
-
-    # Check if any of the variables in `params` is already in
-    # 'self.linked_parameters`.
-    for variable in params:
-      for other_linked_params in self.linked_parameters:
-        if variable in other_linked_params:
-          raise ValueError("Can`t link parameters {}, variable {} was already "
-                           "linked in group {}.".format(params, variable,
-                                                        other_linked_params))
-    self._linked_parameters[params] = approximation
-
-  def create_subgraph(self):
-    if not self.losses:
-      raise ValueError("Must have at least one registered loss.")
-    inputs_to_losses = nest.flatten(tuple(loss.inputs for loss in self.losses))
-    self._subgraph = utils.SubGraph(inputs_to_losses)
-
-  def eval_losses(self):
-    """Return evaluated losses (colocated with inputs to losses)."""
-    evals = []
-    for loss in self.losses:
-      with ops.colocate_with(self.loss_colocation_ops[loss]):
-        evals.append(loss.evaluate())
-    return evals
-
-  def eval_losses_on_samples(self):
-    """Return losses evaluated on samples (colocated with inputs to losses)."""
-    evals = []
-    for loss in self.losses:
-      with ops.colocate_with(self.loss_colocation_ops[loss]):
-        evals.append(loss.evaluate_on_sample())
-    return evals
-
-  def total_loss(self):
-    return math_ops.add_n(self.eval_losses())
-
-  def total_sampled_loss(self):
-    return math_ops.add_n(self.eval_losses_on_samples())
-
-  def _get_linked_approx(self, params):
-    """If params were linked, return their specified approximation."""
-    params_set = frozenset(utils.ensure_sequence(params))
-    if params_set in self.linked_parameters:
-      return self.linked_parameters[params_set]
-    else:
-      return None
-
-  def _get_block_type(self, params, approx, default, approx_to_type):
-    if approx is None:
-      approx = self._get_linked_approx(params)
-      if approx is None:
-        approx = default
-
-    if approx not in approx_to_type:
-      raise ValueError("Bad value {} for approx.".format(approx))
-
-    return approx_to_type[approx], approx
-
-  def register_embedding(self,
-                         params,
-                         inputs,
-                         outputs,
-                         approx=None,
-                         reuse=VARIABLE_SCOPE):
-    """Registers an embedding layer.
-
-    Args:
-      params: Embedding matrix of shape [vocab_size, embedding_size].
-      inputs: Tensor of shape [batch_size, input_size] and dtype int32. Indices
-        into embedding matrix.
-      outputs: Tensor of shape [batch_size, embedding_size]. Outputs
-        produced by layer.
-      approx: str or None. If not None must be "kron".  The Fisher
-        approximation to use. If None the default value is used. (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-    block_type, approx = self._get_block_type(
-        params, approx, self.default_embedding_approximation,
-        _EMBEDDING_APPROX_TO_BLOCK_TYPES)
-
-    if isinstance(params, (tuple, list)):
-      raise ValueError("Bias not supported.")
-    vocab_size = int(params.shape[0])
-    block = self.register_block(
-        params, block_type(self, vocab_size), reuse=reuse)
-    block.register_additional_tower(inputs, outputs)
-
-    self._add_uses(params, 1)
-
-  def register_fully_connected(self,
-                               params,
-                               inputs,
-                               outputs,
-                               approx=None,
-                               reuse=VARIABLE_SCOPE):
-    """Registers a fully connnected layer.
-
-    Args:
-      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
-        this layer. Weight matrix should have shape [input_size, output_size].
-        Bias should have shape [output_size].
-      inputs: Tensor of shape [batch_size, input_size]. Inputs to layer.
-      outputs: Tensor of shape [batch_size, output_size]. Outputs
-        produced by layer.
-      approx: str or None. If not None must be one of "kron" or "diagonal".
-        The Fisher approximation to use. If None the default value is used.
-        (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-
-    block_type, approx = self._get_block_type(
-        params, approx, self.default_fully_connected_approximation,
-        _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES)
-
-    has_bias = isinstance(params, (tuple, list))
-    block = self.register_block(params, block_type(self, has_bias=has_bias),
-                                reuse=reuse)
-    block.register_additional_tower(inputs, outputs)
-
-    self._add_uses(params, 1)
-
-  def register_conv2d(self,
-                      params,
-                      strides,
-                      padding,
-                      inputs,
-                      outputs,
-                      data_format=None,
-                      dilations=None,
-                      approx=None,
-                      reuse=VARIABLE_SCOPE):
-    """Registers a call to tf.nn.conv2d().
-
-    Args:
-      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
-        this layer. Weight matrix should have shape [kernel_height,
-        kernel_width, in_channels, out_channels].  Bias should have shape
-        [out_channels].
-      strides: List of 4 ints. Strides for convolution kernel.
-      padding: string. see tf.nn.conv2d for valid values.
-      inputs: Tensor of shape [batch_size, height, width, in_channels]. Inputs
-        to layer.
-      outputs: Tensor of shape [batch_size, height, width, out_channels].
-        Output produced by layer.
-      data_format: str or None. Format of data.
-      dilations: List of 4 ints. Dilations along each dimension.
-      approx: str or None. If not None must be one of "kron" or "diagonal".
-        The Fisher approximation to use. If None the default value is used.
-        (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-
-    block_type, approx = self._get_block_type(
-        params, approx, self.default_conv2d_approximation,
-        _CONV2D_APPROX_TO_BLOCK_TYPES)
-
-    # It feels bad to pass in configuration that has to do with the internal
-    # implementation.  And then we can`t use the same constructor for both
-    # anymore and are thus forced to use this ugly if-statement.
-    # TODO(b/74793309): Clean this up?
-    if approx == APPROX_KRONECKER_NAME:
-      block = self.register_block(
-          params,
-          block_type(
-              layer_collection=self,
-              params=params,
-              padding=padding,
-              strides=strides,
-              data_format=data_format,
-              dilation_rate=dilations,
-              extract_patches_fn="extract_image_patches"),
-          reuse=reuse)
-    elif approx == APPROX_DIAGONAL_NAME:
-      assert strides[0] == strides[-1] == 1
-      block = self.register_block(
-          params,
-          block_type(
-              layer_collection=self,
-              params=params,
-              padding=padding,
-              strides=strides,
-              dilations=dilations,
-              data_format=data_format),
-          reuse=reuse)
-    else:
-      raise NotImplementedError(approx)
-
-    block.register_additional_tower(inputs, outputs)
-
-    self._add_uses(params, 1)
-
-  def register_convolution(self,
-                           params,
-                           inputs,
-                           outputs,
-                           padding,
-                           strides=None,
-                           dilation_rate=None,
-                           data_format=None,
-                           approx=None,
-                           reuse=VARIABLE_SCOPE):
-    """Register a call to tf.nn.convolution().
-
-    Args:
-      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
-        this layer. Weight matrix should have shape [..filter_spatial_size..,
-        in_channels, out_channels].  Bias should have shape [out_channels].
-      inputs: Tensor of shape [batch_size, ..input_spatial_size.., in_channels].
-        Inputs to layer.
-      outputs: Tensor of shape [batch_size, ..output_spatial_size..,
-        out_channels].  Output produced by layer.
-      padding: string. see tf.nn.conv2d for valid values.
-      strides: List of ints of length len(..input_spatial_size..). Strides for
-        convolution kernel in spatial dimensions.
-      dilation_rate: List of ints of length len(..input_spatial_size..).
-        Dilations along spatial dimension.
-      data_format: str or None. Format of data.
-      approx: str or None. If not None must be one of "kron" or "diagonal".
-        The Fisher approximation to use. If None the default value is used.
-        (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-    # TODO(b/74793309): Have this use _get_block_type like the other
-    # registration functions?
-    assert approx is None or approx == APPROX_KRONECKER_NAME
-
-    block = self.register_block(
-        params,
-        fb.ConvKFCBasicFB(
-            layer_collection=self,
-            params=params,
-            padding=padding,
-            strides=strides,
-            dilation_rate=dilation_rate,
-            data_format=data_format),
-        reuse=reuse)
-    block.register_additional_tower(inputs, outputs)
-
-    self._add_uses(params, 1)
-
-  def register_depthwise_conv2d(self,
-                                params,
-                                inputs,
-                                outputs,
-                                strides,
-                                padding,
-                                rate=None,
-                                data_format=None,
-                                approx=None,
-                                reuse=VARIABLE_SCOPE):
-    """Register a call to tf.nn.depthwise_conv2d().
-
-    Args:
-      params: 4-D Tensor of shape [filter_height, filter_width,
-        in_channels, channel_multiplier].  Convolutional filter.
-      inputs: Tensor of shape [batch_size, input_height, input_width,
-        in_channels].  Inputs to layer.
-      outputs: Tensor of shape [batch_size, output_height, output_width,
-        in_channels * channel_multiplier].  Output produced by depthwise conv2d.
-      strides: List of ints of length 4. Strides along all dimensions.
-      padding: string. see tf.nn.conv2d for valid values.
-      rate: None or List of ints of length 2. Dilation rates in spatial
-        dimensions.
-      data_format: str or None. Format of data.
-      approx: str or None. If not None must "diagonal".  The Fisher
-        approximation to use. If None the default value is used. (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-    # TODO(b/74793309): Have this use _get_block_type like the other
-    # registration functions?
-    assert approx is None or approx == APPROX_DIAGONAL_NAME
-    assert data_format in [None, "NHWC"]
-
-    block = self.register_block(
-        params,
-        fb.DepthwiseConvDiagonalFB(
-            layer_collection=self,
-            params=params,
-            strides=strides,
-            padding=padding,
-            rate=rate,
-            data_format=data_format),
-        reuse=reuse)
-    block.register_additional_tower(inputs, outputs)
-
-    self._add_uses(params, 1)
-
-  def register_separable_conv2d(self,
-                                depthwise_params,
-                                pointwise_params,
-                                inputs,
-                                depthwise_outputs,
-                                pointwise_outputs,
-                                strides,
-                                padding,
-                                rate=None,
-                                data_format=None,
-                                approx=None,
-                                reuse=VARIABLE_SCOPE):
-    """Register a call to tf.nn.separable_conv2d().
-
-    Note: This requires access to intermediate outputs between depthwise and
-    pointwise convolutions.
-
-    Args:
-      depthwise_params: 4-D Tensor of shape [filter_height, filter_width,
-        in_channels, channel_multiplier].  Filter for depthwise conv2d.
-      pointwise_params: 4-D Tensor of shape [1, 1, in_channels *
-        channel_multiplier, out_channels].  Filter for pointwise conv2d.
-      inputs: Tensor of shape [batch_size, input_height, input_width,
-        in_channels].  Inputs to layer.
-      depthwise_outputs: Tensor of shape [batch_size, output_height,
-        output_width, in_channels * channel_multiplier].  Output produced by
-        depthwise conv2d.
-      pointwise_outputs: Tensor of shape [batch_size, output_height,
-        output_width, out_channels].  Output produced by pointwise conv2d.
-      strides: List of ints of length 4. Strides for depthwise conv2d kernel in
-        all dimensions.
-      padding: string. see tf.nn.conv2d for valid values.
-      rate: None or List of ints of length 2. Dilation rate of depthwise conv2d
-        kernel in spatial dimensions.
-      data_format: str or None. Format of data.
-      approx: str or None. If not None must be one of "kron" or "diagonal".
-        The Fisher approximation to use. If None the default value is used.
-        (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-    self.register_depthwise_conv2d(
-        params=depthwise_params,
-        inputs=inputs,
-        outputs=depthwise_outputs,
-        strides=strides,
-        padding=padding,
-        rate=rate,
-        data_format=data_format,
-        approx=APPROX_DIAGONAL_NAME,
-        reuse=reuse)
-
-    self.register_conv2d(
-        params=pointwise_params,
-        inputs=depthwise_outputs,
-        outputs=pointwise_outputs,
-        strides=[1, 1, 1, 1],
-        padding="VALID",
-        data_format=data_format,
-        approx=approx,
-        reuse=reuse)
-
-  def register_generic(self,
-                       params,
-                       batch_size,
-                       approx=None,
-                       reuse=VARIABLE_SCOPE):
-    """Registers a generic layer.
-
-    Args:
-      params: Tensor or tuple of Tensors corresponding to the parameters.
-      batch_size: 0-D Tensor. Size of the minibatch (for this tower).
-      approx: str or None. It not None, must be one of "full" or "diagonal".
-        The Fisher approximation to use. If None the default value is used.
-        (Default: None)
-      reuse: bool or str. If True, this adds `batch_size` to the total
-        mini-batch size use when estimating the Fisher block for this layer
-        (which must have already been registered). If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-    block_type, approx = self._get_block_type(
-        params, approx, self.default_generic_approximation,
-        _GENERIC_APPROX_TO_BLOCK_TYPES)
-
-    block = self.register_block(params, block_type(self, params), reuse=reuse)
-    block.register_additional_tower(batch_size)
-
-    self._add_uses(params, float("inf"))
-
-  def register_fully_connected_multi(self, params, inputs, outputs,
-                                     num_uses=None, approx=None,
-                                     reuse=VARIABLE_SCOPE):
-    """Register fully connected layers with shared parameters.
-
-    This can handle general fully-connected layers with shared parameters, but
-    has specialized approximations to deal with the case where there is a
-    meaningful linear order to the share instances (such as in an RNN).
-
-    Args:
-      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
-        this layer. Weight matrix should have shape [input_size, output_size].
-        Bias should have shape [output_size].
-      inputs: A list of Tensors, each of shape [batch_size, input_size]. Inputs
-        to layer. The list indexes each use in the graph (which might
-        correspond to a "time-step" in an RNN). OR, can be single Tensor, of
-        shape [num_uses * batch_size , input_size], which is a reshaped version
-        of a Tensor of shape [num_uses, batch_size, input_size].
-      outputs: A list of Tensors, the same length as `inputs`, each of shape
-        [batch_size, output_size]. Outputs produced by layer. The list indexes
-        each use in the graph (which might correspond to a "time-step" in an
-        RNN). Needs to correspond with the order used in `inputs`.  OR, can be
-        a single Tensor of shape [num_uses * batch_size, output_size], which is
-        a reshaped version of a Tensor of shape [num_uses, batch_size,
-        output_size].
-      num_uses: int or None. The number uses/time-steps in the graph where the
-        layer appears. Only needed if both inputs and outputs are given in the
-        single Tensor format. (Default: None)
-      approx: str or None. If not None, must be of "kron_indep", "kron_series_1"
-        or "kron_series_2". The Fisher approximation to use. If None the default
-        value is used. (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word `use` here has a completely different meaning to "use in the graph"
-        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-    """
-    block_type, approx = self._get_block_type(
-        params, approx, self.default_fully_connected_multi_approximation,
-        _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES)
-
-    # TODO(b/70283649): something along the lines of find_canonical_output
-    # should be added back in here (and for the other block types, arguably).
-
-    has_bias = isinstance(params, (tuple, list))
-    block = self.register_block(params, block_type(self, has_bias=has_bias,
-                                                   num_uses=num_uses),
-                                reuse=reuse)
-    block.register_additional_tower(inputs, outputs)
-    if isinstance(inputs, (tuple, list)):
-      assert len(inputs) == len(outputs)
-      self._add_uses(params, len(inputs))
-    else:
-      self._add_uses(params, 1)
-
-  def register_conv2d_multi(self,
-                            params,
-                            strides,
-                            padding,
-                            inputs,
-                            outputs,
-                            num_uses=None,
-                            data_format=None,
-                            dilations=None,
-                            approx=None,
-                            reuse=VARIABLE_SCOPE):
-    """Registers convolutional layers with shared parameters.
-
-    Args:
-      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
-        this layer. Weight matrix should have shape [kernel_height,
-        kernel_width, in_channels, out_channels].  Bias should have shape
-        [out_channels].
-      strides: 1-D Tensor of length 4. Strides for convolution kernel.
-      padding: string. see tf.nn.conv2d for valid values.
-      inputs: A list of Tensors, each of shape [batch_size, height, width,
-        in_channels]. Inputs to layer. The list indexes each use in the graph
-        (which might correspond to a "time-step" in an RNN). OR, can be single
-        Tensor, of shape [num_uses * batch_size, height, width, in_channels],
-        which is a reshaped version of a Tensor of shape [num_uses, batch_size,
-        height, width, in_channels].
-      outputs: A list of Tensors, each of shape [batch_size, height, width,
-        out_channels]. Output produced by layer. The list indexes each use
-        in the graph (which might correspond to a "time-step" in an RNN).
-        Needs to correspond with the order used in `inputs`.  OR, can be a
-        single Tensor, of shape [num_uses * batch_size, height, width,
-        out_channels], which is a reshaped version of a Tensor of shape
-        [num_uses, batch_size, height, width, out_channels].
-      num_uses: int or None. The number uses/time-steps in the graph where the
-        layer appears. Only needed if both inputs and outputs are given in the
-        single Tensor format. (Default: None)
-      data_format: str or None. Format of data.
-      dilations: List of 4 ints. Dilations along each dimension.
-      approx: str or None. If not None must by "kron_indep". The Fisher
-        approximation to use. If None the default value is used.
-        (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word `use` here has a completely different meaning to "use in the graph"
-        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-    block_type, approx = self._get_block_type(
-        params, approx, self.default_conv2d_multi_approximation,
-        _CONV2D_MULTI_APPROX_TO_BLOCK_TYPES)
-
-    block = self.register_block(
-        params,
-        block_type(
-            layer_collection=self,
-            params=params,
-            padding=padding,
-            strides=strides,
-            data_format=data_format,
-            dilation_rate=dilations,
-            extract_patches_fn="extract_image_patches",
-            num_uses=num_uses),
-        reuse=reuse)
-
-    block.register_additional_tower(inputs, outputs)
-    if isinstance(inputs, (tuple, list)):
-      assert len(inputs) == len(outputs)
-      self._add_uses(params, len(inputs))
-    else:
-      self._add_uses(params, 1)
-
-  # TODO(b/74108452): change the loss registration functions names to refer
-  # to "loss functions" instead of distributions.  Following naming convention
-  # of the loss function classes themselves.
-
-  def register_embedding_multi(self,
-                               params,
-                               inputs,
-                               outputs,
-                               num_uses=None,
-                               approx=None,
-                               reuse=VARIABLE_SCOPE):
-    """Registers embedding layers with shared parameters.
-
-    Args:
-      params: Embedding matrix of shape [vocab_size, embedding_size].
-      inputs: A list of Tensors, each of shape [batch_size, input_size] and
-        dtype int32. Indices into embedding matrix. The list indexes each use
-        in the graph (which might correspond to a "time-step" in an RNN).
-        OR, can be single Tensor, of shape [num_uses*batch_size, input_size],
-        which is a reshaped version of a Tensor of shape [num_uses, batch_size,
-        input_size].
-      outputs: A list of Tensors, each of shape [batch_size, embedding_size].
-        Outputs produced by layer. The list indexes each use in the graph
-        (which might correspond to a "time-step" in an RNN). Needs to
-        correspond with the order used in `inputs`. OR, can be a
-        single Tensor, of shape [num_uses * batch_size, embedding_size], which
-        is a reshaped version of a Tensor of shape [num_uses, batch_size,
-        embedding_size].
-      num_uses: int or None. The number uses/time-steps in the graph where the
-        layer appears. Only needed if both inputs and outputs are given in the
-        single Tensor format. (Default: None)
-      approx: str or None. If not None must by "kron_indep". The Fisher
-        approximation to use. If None the default value is used.
-        (Default: None)
-      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
-        additional mini-batch/tower of data to use when estimating the Fisher
-        block for this layer (which must have already been registered). If
-        "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word `use` here has a completely different meaning to "use in the graph"
-        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
-        (Default: "VARIABLE_SCOPE")
-
-    Raises:
-      ValueError: For improper value to `approx`.
-      KeyError: If reuse == True but no FisherBlock found for `params`.
-      ValueError: If reuse == True and FisherBlock found but of the wrong type.
-    """
-    block_type, approx = self._get_block_type(
-        params, approx, self.default_embedding_multi_approximation,
-        _EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES)
-
-    if isinstance(params, (tuple, list)):
-      raise ValueError("Bias not supported.")
-    vocab_size = int(params.shape[0])
-
-    block = self.register_block(
-        params, block_type(self, vocab_size, num_uses=num_uses), reuse=reuse)
-    block.register_additional_tower(inputs, outputs)
-
-    if isinstance(inputs, (tuple, list)):
-      self._add_uses(params, len(inputs))
-    else:
-      self._add_uses(params, 1)
-
-  def register_categorical_predictive_distribution(self,
-                                                   logits,
-                                                   seed=None,
-                                                   targets=None,
-                                                   name=None,
-                                                   reuse=VARIABLE_SCOPE):
-    """Registers a categorical predictive distribution.
-
-    Args:
-      logits: The logits of the distribution (i.e. its parameters).
-      seed: The seed for the RNG (for debugging) (Default: None)
-      targets: (OPTIONAL) The targets for the loss function.  Only required if
-        one wants to call total_loss() instead of total_sampled_loss().
-        total_loss() is required, for example, to estimate the
-        "empirical Fisher" (instead of the true Fisher).
-        (Default: None)
-      name: (OPTIONAL) str or None. Unique name for this loss function. If None,
-        a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds `logits` as an additional
-        mini-batch/tower of inputs to the loss-function/predictive distribution
-        (which must have already been registered). If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
-    """
-    loss = lf.CategoricalLogitsNegativeLogProbLoss(logits, targets=targets,
-                                                   seed=seed)
-    self.register_loss_function(loss, logits,
-                                "categorical_predictive_distribution",
-                                name=name, reuse=reuse)
-
-  def register_normal_predictive_distribution(self,
-                                              mean,
-                                              var=0.5,
-                                              seed=None,
-                                              targets=None,
-                                              name=None,
-                                              reuse=VARIABLE_SCOPE):
-    """Registers a normal predictive distribution.
-
-    Args:
-      mean: The mean vector defining the distribution.
-      var: The variance (must be a scalar).  Note that the default value of
-        0.5 corresponds to a standard squared error loss (target -
-        prediction)**2. If your squared error loss is of the form
-        0.5*(target - prediction)**2 you should use var=1.0. (Default: 0.5)
-      seed: The seed for the RNG (for debugging) (Default: None)
-      targets: (OPTIONAL) The targets for the loss function.  Only required if
-        one wants to call total_loss() instead of total_sampled_loss().
-        total_loss() is required, for example, to estimate the
-        "empirical Fisher" (instead of the true Fisher).
-        (Default: None)
-      name: (OPTIONAL) str or None. Unique name for this loss function. If None,
-        a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds `mean` and `var` as an additional
-        mini-batch/tower of inputs to the loss-function/predictive distribution
-        (which must have already been registered). If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
-    """
-    loss = lf.NormalMeanNegativeLogProbLoss(mean, var, targets=targets,
-                                            seed=seed)
-    self.register_loss_function(loss, mean,
-                                "normal_predictive_distribution",
-                                name=name, reuse=reuse)
-
-  def register_multi_bernoulli_predictive_distribution(self,
-                                                       logits,
-                                                       seed=None,
-                                                       targets=None,
-                                                       name=None,
-                                                       reuse=VARIABLE_SCOPE):
-    """Registers a multi-Bernoulli predictive distribution.
-
-    Args:
-      logits: The logits of the distribution (i.e. its parameters).
-      seed: The seed for the RNG (for debugging) (Default: None)
-      targets: (OPTIONAL) The targets for the loss function.  Only required if
-        one wants to call total_loss() instead of total_sampled_loss().
-        total_loss() is required, for example, to estimate the
-        "empirical Fisher" (instead of the true Fisher).
-        (Default: None)
-      name: (OPTIONAL) str or None. Unique name for this loss function. If None,
-        a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds `logits` as an additional
-        mini-batch/tower of inputs to the loss-function/predictive distribution
-        (which must have already been registered). If "VARIABLE_SCOPE", use
-        tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
-    """
-    loss = lf.MultiBernoulliNegativeLogProbLoss(logits, targets=targets,
-                                                seed=seed)
-    self.register_loss_function(loss, logits,
-                                "multi_bernoulli_predictive_distribution",
-                                name=name, reuse=reuse)
-
-  def make_or_get_factor(self, cls, args):
-    """Insert `cls(args)` into 'self.fisher_factors` if not already present.
-
-    Wraps constructor in `tf.variable_scope()` to ensure variables constructed
-    in `cls.__init__` are placed under this LayerCollection's scope.
-
-    Args:
-      cls: Class that implements FisherFactor.
-      args: Tuple of arguments to pass into `cls's constructor. Must be
-        hashable.
-
-    Returns:
-      Instance of `cls` found in self.fisher_factors.
-    """
-    try:
-      hash(args)
-    except TypeError:
-      raise TypeError(
-          ("Unable to use (cls, args) = ({}, {}) as a key in "
-           "LayerCollection.fisher_factors. The pair cannot be hashed.").format(
-               cls, args))
-
-    key = cls, args
-    if key not in self.fisher_factors:
-      with variable_scope.variable_scope(self._var_scope):
-        self.fisher_factors[key] = cls(*args)
-    return self.fisher_factors[key]
-
-  @contextmanager
-  def as_default(self):
-    """Sets this LayerCollection as the default."""
-    set_default_layer_collection(self)
-    yield
-    set_default_layer_collection(None)
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py b/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py
deleted file mode 100644
index 9f46853807..0000000000
--- a/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Registry for layers and their parameters/variables.
-
-This represents the collection of all layers in the approximate Fisher
-information matrix to which a particular FisherBlock may belong. That is, we
-might have several layer collections for one TF graph (if we have multiple K-FAC
-optimizers being used, for example.)
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.layer_collection import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    "get_default_layer_collection",
-    "set_default_layer_collection",
-    "LayerParametersDict",
-    "LayerCollection",
-    "APPROX_KRONECKER_NAME",
-    "APPROX_DIAGONAL_NAME",
-    "APPROX_FULL_NAME",
-    "VARIABLE_SCOPE",
-    "APPROX_KRONECKER_INDEP_NAME",
-    "APPROX_KRONECKER_SERIES_1_NAME",
-    "APPROX_KRONECKER_SERIES_2_NAME"
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/linear_operator.py b/tensorflow/contrib/kfac/python/ops/linear_operator.py
deleted file mode 100644
index 61cb955ae8..0000000000
--- a/tensorflow/contrib/kfac/python/ops/linear_operator.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SmartMatrices definitions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.kfac.python.ops import utils
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.linalg import linalg
-from tensorflow.python.ops.linalg import linalg_impl
-from tensorflow.python.ops.linalg import linear_operator_util as lou
-
-
-class LinearOperatorExtras(object):  # pylint: disable=missing-docstring
-
-  def matmul(self, x, adjoint=False, adjoint_arg=False, name="matmul"):
-
-    with self._name_scope(name, values=[x]):
-      if isinstance(x, ops.IndexedSlices):
-        return self._matmul_sparse(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
-
-      x = ops.convert_to_tensor(x, name="x")
-      self._check_input_dtype(x)
-
-      self_dim = -2 if adjoint else -1
-      arg_dim = -1 if adjoint_arg else -2
-      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim])
-
-      return self._matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
-
-  def matmul_right(self, x, adjoint=False, adjoint_arg=False, name="matmul"):
-
-    with self._name_scope(name, values=[x]):
-
-      if isinstance(x, ops.IndexedSlices):
-        return self._matmul_right_sparse(
-            x, adjoint=adjoint, adjoint_arg=adjoint_arg)
-
-      x = ops.convert_to_tensor(x, name="x")
-      self._check_input_dtype(x)
-
-      self_dim = -1 if adjoint else -2
-      arg_dim = -2 if adjoint_arg else -1
-      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim])
-
-      return self._matmul_right(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
-
-
-class LinearOperatorFullMatrix(LinearOperatorExtras,
-                               linalg.LinearOperatorFullMatrix):
-
-  # TODO(b/78117889) Remove this definition once core LinearOperator
-  # has _matmul_right.
-  def _matmul_right(self, x, adjoint=False, adjoint_arg=False):
-    return lou.matmul_with_broadcast(
-        x, self._matrix, adjoint_a=adjoint_arg, adjoint_b=adjoint)
-
-  def _matmul_sparse(self, x, adjoint=False, adjoint_arg=False):
-    raise NotImplementedError
-
-  def _matmul_right_sparse(self, x, adjoint=False, adjoint_arg=False):
-    assert not adjoint and not adjoint_arg
-    return utils.matmul_sparse_dense(x, self._matrix)
-
-
-class LinearOperatorDiag(LinearOperatorExtras,  # pylint: disable=missing-docstring
-                         linalg.LinearOperatorDiag):
-
-  def _matmul_right(self, x, adjoint=False, adjoint_arg=False):
-    diag_mat = math_ops.conj(self._diag) if adjoint else self._diag
-    x = linalg_impl.adjoint(x) if adjoint_arg else x
-    return diag_mat * x
-
-  def _matmul_sparse(self, x, adjoint=False, adjoint_arg=False):
-    diag_mat = math_ops.conj(self._diag) if adjoint else self._diag
-    assert not adjoint_arg
-    return utils.matmul_diag_sparse(diag_mat, x)
-
-  def _matmul_right_sparse(self, x, adjoint=False, adjoint_arg=False):
-    raise NotImplementedError
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
deleted file mode 100644
index 42d525c2c2..0000000000
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ /dev/null
@@ -1,754 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Loss functions to be used by LayerCollection."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-
-import six
-
-from tensorflow.contrib.distributions.python.ops import onehot_categorical
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bernoulli
-from tensorflow.python.ops.distributions import categorical
-from tensorflow.python.ops.distributions import normal
-
-
-@six.add_metaclass(abc.ABCMeta)
-class LossFunction(object):
-  """Abstract base class for loss functions.
-
-  Note that unlike typical loss functions used in neural networks these are
-  summed and not averaged across cases in the batch, since this is what the
-  users of this class (FisherEstimator and MatrixVectorProductComputer) will
-  be expecting. The implication of this is that you will may want to
-  normalize things like Fisher-vector products by the batch size when you
-  use this class.  It depends on the use case.
-  """
-
-  @abc.abstractproperty
-  def targets(self):
-    """The targets being predicted by the model.
-
-    Returns:
-      None or Tensor of appropriate shape for calling self._evaluate() on.
-    """
-    pass
-
-  @abc.abstractproperty
-  def inputs(self):
-    """The inputs to the loss function (excluding the targets)."""
-    pass
-
-  def evaluate(self):
-    """Evaluate the loss function on the targets."""
-    if self.targets is not None:
-      # We treat the targets as "constant".  It's only the inputs that get
-      # "back-propped" through.
-      return self._evaluate(array_ops.stop_gradient(self.targets))
-    else:
-      raise Exception("Cannot evaluate losses with unspecified targets.")
-
-  @abc.abstractmethod
-  def _evaluate(self, targets):
-    """Evaluates the negative log probability of the targets.
-
-    Args:
-      targets: Tensor that distribution can calculate log_prob() of.
-
-    Returns:
-      negative log probability of each target, summed across all targets.
-    """
-    pass
-
-  @abc.abstractmethod
-  def multiply_hessian(self, vector):
-    """Right-multiply a vector by the Hessian.
-
-    Here the 'Hessian' is the Hessian matrix (i.e. matrix of 2nd-derivatives)
-    of the loss function with respect to its inputs.
-
-    Args:
-      vector: The vector to multiply.  Must be the same shape(s) as the
-        'inputs' property.
-
-    Returns:
-      The vector right-multiplied by the Hessian.  Will be of the same shape(s)
-      as the 'inputs' property.
-    """
-    pass
-
-  @abc.abstractmethod
-  def multiply_hessian_factor(self, vector):
-    """Right-multiply a vector by a factor B of the Hessian.
-
-    Here the 'Hessian' is the Hessian matrix (i.e. matrix of 2nd-derivatives)
-    of the loss function with respect to its inputs.  Typically this will be
-    block-diagonal across different cases in the batch, since the loss function
-    is typically summed across cases.
-
-    Note that B can be any matrix satisfying B * B^T = H where H is the Hessian,
-    but will agree with the one used in the other methods of this class.
-
-    Args:
-      vector: The vector to multiply.  Must be of the shape given by the
-        'hessian_factor_inner_shape' property.
-
-    Returns:
-      The vector right-multiplied by B.  Will be of the same shape(s) as the
-      'inputs' property.
-    """
-    pass
-
-  @abc.abstractmethod
-  def multiply_hessian_factor_transpose(self, vector):
-    """Right-multiply a vector by the transpose of a factor B of the Hessian.
-
-    Here the 'Hessian' is the Hessian matrix (i.e. matrix of 2nd-derivatives)
-    of the loss function with respect to its inputs.  Typically this will be
-    block-diagonal across different cases in the batch, since the loss function
-    is typically summed across cases.
-
-    Note that B can be any matrix satisfying B * B^T = H where H is the Hessian,
-    but will agree with the one used in the other methods of this class.
-
-    Args:
-      vector: The vector to multiply.  Must be the same shape(s) as the
-        'inputs' property.
-
-    Returns:
-      The vector right-multiplied by B^T.  Will be of the shape given by the
-      'hessian_factor_inner_shape' property.
-    """
-    pass
-
-  @abc.abstractmethod
-  def multiply_hessian_factor_replicated_one_hot(self, index):
-    """Right-multiply a replicated-one-hot vector by a factor B of the Hessian.
-
-    Here the 'Hessian' is the Hessian matrix (i.e. matrix of 2nd-derivatives)
-    of the loss function with respect to its inputs.  Typically this will be
-    block-diagonal across different cases in the batch, since the loss function
-    is typically summed across cases.
-
-    A 'replicated-one-hot' vector means a tensor which, for each slice along the
-    batch dimension (assumed to be dimension 0), is 1.0 in the entry
-    corresponding to the given index and 0 elsewhere.
-
-    Note that B can be any matrix satisfying B * B^T = H where H is the Hessian,
-    but will agree with the one used in the other methods of this class.
-
-    Args:
-      index: A tuple representing in the index of the entry in each slice that
-        is 1.0. Note that len(index) must be equal to the number of elements
-        of the 'hessian_factor_inner_shape' tensor minus one.
-
-    Returns:
-      The vector right-multiplied by B^T. Will be of the same shape(s) as the
-      'inputs' property.
-    """
-    pass
-
-  @abc.abstractproperty
-  def hessian_factor_inner_shape(self):
-    """The shape of the tensor returned by multiply_hessian_factor."""
-    pass
-
-  @abc.abstractproperty
-  def hessian_factor_inner_static_shape(self):
-    """Static version of hessian_factor_inner_shape."""
-    pass
-
-
-@six.add_metaclass(abc.ABCMeta)
-class NegativeLogProbLoss(LossFunction):
-  """Abstract base class for loss functions that are negative log probs."""
-
-  def __init__(self, seed=None):
-    self._default_seed = seed
-    super(NegativeLogProbLoss, self).__init__()
-
-  @property
-  def inputs(self):
-    return self.params
-
-  @abc.abstractproperty
-  def params(self):
-    """Parameters to the underlying distribution."""
-    pass
-
-  @abc.abstractmethod
-  def multiply_fisher(self, vector):
-    """Right-multiply a vector by the Fisher.
-
-    Args:
-      vector: The vector to multiply.  Must be the same shape(s) as the
-        'inputs' property.
-
-    Returns:
-      The vector right-multiplied by the Fisher.  Will be of the same shape(s)
-      as the 'inputs' property.
-    """
-    pass
-
-  @abc.abstractmethod
-  def multiply_fisher_factor(self, vector):
-    """Right-multiply a vector by a factor B of the Fisher.
-
-    Here the 'Fisher' is the Fisher information matrix (i.e. expected outer-
-    product of gradients) with respect to the parameters of the underlying
-    probability distribtion (whose log-prob defines the loss). Typically this
-    will be block-diagonal across different cases in the batch, since the
-    distribution is usually (but not always) conditionally iid across different
-    cases.
-
-    Note that B can be any matrix satisfying B * B^T = F where F is the Fisher,
-    but will agree with the one used in the other methods of this class.
-
-    Args:
-      vector: The vector to multiply.  Must be of the shape given by the
-        'fisher_factor_inner_shape' property.
-
-    Returns:
-      The vector right-multiplied by B. Will be of the same shape(s) as the
-      'inputs' property.
-    """
-    pass
-
-  @abc.abstractmethod
-  def multiply_fisher_factor_transpose(self, vector):
-    """Right-multiply a vector by the transpose of a factor B of the Fisher.
-
-    Here the 'Fisher' is the Fisher information matrix (i.e. expected outer-
-    product of gradients) with respect to the parameters of the underlying
-    probability distribtion (whose log-prob defines the loss). Typically this
-    will be block-diagonal across different cases in the batch, since the
-    distribution is usually (but not always) conditionally iid across different
-    cases.
-
-    Note that B can be any matrix satisfying B * B^T = F where F is the Fisher,
-    but will agree with the one used in the other methods of this class.
-
-    Args:
-      vector: The vector to multiply.  Must be the same shape(s) as the
-        'inputs' property.
-
-    Returns:
-      The vector right-multiplied by B^T.  Will be of the shape given by the
-      'fisher_factor_inner_shape' property.
-    """
-    pass
-
-  @abc.abstractmethod
-  def multiply_fisher_factor_replicated_one_hot(self, index):
-    """Right-multiply a replicated-one-hot vector by a factor B of the Fisher.
-
-    Here the 'Fisher' is the Fisher information matrix (i.e. expected outer-
-    product of gradients) with respect to the parameters of the underlying
-    probability distribtion (whose log-prob defines the loss). Typically this
-    will be block-diagonal across different cases in the batch, since the
-    distribution is usually (but not always) conditionally iid across different
-    cases.
-
-    A 'replicated-one-hot' vector means a tensor which, for each slice along the
-    batch dimension (assumed to be dimension 0), is 1.0 in the entry
-    corresponding to the given index and 0 elsewhere.
-
-    Note that B can be any matrix satisfying B * B^T = H where H is the Fisher,
-    but will agree with the one used in the other methods of this class.
-
-    Args:
-      index: A tuple representing in the index of the entry in each slice that
-        is 1.0. Note that len(index) must be equal to the number of elements
-        of the 'fisher_factor_inner_shape' tensor minus one.
-
-    Returns:
-      The vector right-multiplied by B. Will be of the same shape(s) as the
-      'inputs' property.
-    """
-    pass
-
-  @abc.abstractproperty
-  def fisher_factor_inner_shape(self):
-    """The shape of the tensor returned by multiply_fisher_factor."""
-    pass
-
-  @abc.abstractproperty
-  def fisher_factor_inner_static_shape(self):
-    """Static version of fisher_factor_inner_shape."""
-    pass
-
-  @abc.abstractmethod
-  def sample(self, seed):
-    """Sample 'targets' from the underlying distribution."""
-    pass
-
-  def evaluate_on_sample(self, seed=None):
-    """Evaluates the log probability on a random sample.
-
-    Args:
-      seed: int or None. Random seed for this draw from the distribution.
-
-    Returns:
-      Log probability of sampled targets, summed across examples.
-    """
-    if seed is None:
-      seed = self._default_seed
-    # We treat the targets as "constant".  It's only the inputs that get
-    # "back-propped" through.
-    return self._evaluate(array_ops.stop_gradient(self.sample(seed)))
-
-
-# TODO(jamesmartens): should this just inherit from object to avoid "diamond"
-# inheritance, or is there a better way?
-class NaturalParamsNegativeLogProbLoss(NegativeLogProbLoss):
-  """Base class for neg log prob losses whose inputs are 'natural' parameters.
-
-  Note that the Hessian and Fisher for natural parameters of exponential-
-  family models are the same, hence the purpose of this class.
-  See here: https://arxiv.org/abs/1412.1193
-
-  'Natural parameters' are defined for exponential-family models. See for
-  example: https://en.wikipedia.org/wiki/Exponential_family
-  """
-
-  def multiply_hessian(self, vector):
-    return self.multiply_fisher(vector)
-
-  def multiply_hessian_factor(self, vector):
-    return self.multiply_fisher_factor(vector)
-
-  def multiply_hessian_factor_transpose(self, vector):
-    return self.multiply_fisher_factor_transpose(vector)
-
-  def multiply_hessian_factor_replicated_one_hot(self, index):
-    return self.multiply_fisher_factor_replicated_one_hot(index)
-
-  @property
-  def hessian_factor_inner_shape(self):
-    return self.fisher_factor_inner_shape
-
-  @property
-  def hessian_factor_inner_static_shape(self):
-    return self.fisher_factor_inner_shape
-
-
-class DistributionNegativeLogProbLoss(NegativeLogProbLoss):
-  """Base class for neg log prob losses that use the TF Distribution classes."""
-
-  def __init__(self, seed=None):
-    super(DistributionNegativeLogProbLoss, self).__init__(seed=seed)
-
-  @abc.abstractproperty
-  def dist(self):
-    """The underlying tf.distributions.Distribution."""
-    pass
-
-  def _evaluate(self, targets):
-    return -math_ops.reduce_sum(self.dist.log_prob(targets))
-
-  def sample(self, seed):
-    return self.dist.sample(seed=seed)
-
-
-class NormalMeanNegativeLogProbLoss(DistributionNegativeLogProbLoss,
-                                    NaturalParamsNegativeLogProbLoss):
-  """Neg log prob loss for a normal distribution parameterized by a mean vector.
-
-
-  Note that the covariance is treated as a constant 'var' times the identity.
-  Also note that the Fisher for such a normal distribution with respect the mean
-  parameter is given by:
-
-     F = (1/var) * I
-
-  See for example https://www.ii.pwr.edu.pl/~tomczak/PDF/[JMT]Fisher_inf.pdf.
-  """
-
-  def __init__(self, mean, var=0.5, targets=None, seed=None):
-    self._mean = mean
-    self._var = var
-    self._targets = targets
-    super(NormalMeanNegativeLogProbLoss, self).__init__(seed=seed)
-
-  @property
-  def targets(self):
-    return self._targets
-
-  @property
-  def dist(self):
-    return normal.Normal(loc=self._mean, scale=math_ops.sqrt(self._var))
-
-  @property
-  def params(self):
-    return self._mean
-
-  def multiply_fisher(self, vector):
-    return (1. / self._var) * vector
-
-  def multiply_fisher_factor(self, vector):
-    return self._var**-0.5 * vector
-
-  def multiply_fisher_factor_transpose(self, vector):
-    return self.multiply_fisher_factor(vector)  # it's symmetric in this case
-
-  def multiply_fisher_factor_replicated_one_hot(self, index):
-    assert len(index) == 1, "Length of index was {}".format(len(index))
-    ones_slice = array_ops.expand_dims(
-        array_ops.ones(array_ops.shape(self._mean)[:1], dtype=self._mean.dtype),
-        axis=-1)
-    output_slice = self._var**-0.5 * ones_slice
-    return insert_slice_in_zeros(output_slice, 1, int(self._mean.shape[1]),
-                                 index[0])
-
-  @property
-  def fisher_factor_inner_shape(self):
-    return array_ops.shape(self._mean)
-
-  @property
-  def fisher_factor_inner_static_shape(self):
-    return self._mean.shape
-
-
-class NormalMeanVarianceNegativeLogProbLoss(DistributionNegativeLogProbLoss):
-  """Negative log prob loss for a normal distribution with mean and variance.
-
-  This class parameterizes a multivariate normal distribution with n independent
-  dimensions. Unlike `NormalMeanNegativeLogProbLoss`, this class does not
-  assume the variance is held constant. The Fisher Information for n = 1
-  is given by,
-
-  F = [[1 / variance,                0],
-       [           0, 0.5 / variance^2]]
-
-  where the parameters of the distribution are concatenated into a single
-  vector as [mean, variance]. For n > 1, the mean parameter vector is
-  concatenated with the variance parameter vector.
-
-  See https://www.ii.pwr.edu.pl/~tomczak/PDF/[JMT]Fisher_inf.pdf for derivation.
-  """
-
-  def __init__(self, mean, variance, targets=None, seed=None):
-    assert len(mean.shape) == 2, "Expect 2D mean tensor."
-    assert len(variance.shape) == 2, "Expect 2D variance tensor."
-    self._mean = mean
-    self._variance = variance
-    self._targets = targets
-    super(NormalMeanVarianceNegativeLogProbLoss, self).__init__(seed=seed)
-
-  @property
-  def targets(self):
-    return self._targets
-
-  @property
-  def dist(self):
-    return normal.Normal(loc=self._mean, scale=math_ops.sqrt(self._variance))
-
-  @property
-  def params(self):
-    return self._mean, self._variance
-
-  def _concat(self, mean, variance):
-    return array_ops.concat([mean, variance], axis=-1)
-
-  def _split(self, params):
-    return array_ops.split(params, 2, axis=-1)
-
-  @property
-  def _fisher_mean(self):
-    return 1. / self._variance
-
-  @property
-  def _fisher_mean_factor(self):
-    return 1. / math_ops.sqrt(self._variance)
-
-  @property
-  def _fisher_var(self):
-    return 1. / (2 * math_ops.square(self._variance))
-
-  @property
-  def _fisher_var_factor(self):
-    return 1. / (math_ops.sqrt(2.) * self._variance)
-
-  def multiply_fisher(self, vecs):
-    mean_vec, var_vec = vecs
-    return (self._fisher_mean * mean_vec, self._fisher_var * var_vec)
-
-  def multiply_fisher_factor(self, vecs):
-    mean_vec, var_vec = self._split(vecs)
-    return (self._fisher_mean_factor * mean_vec,
-            self._fisher_var_factor * var_vec)
-
-  def multiply_fisher_factor_transpose(self, vecs):
-    mean_vec, var_vec = vecs
-    return self._concat(self._fisher_mean_factor * mean_vec,
-                        self._fisher_var_factor * var_vec)
-
-  def multiply_fisher_factor_replicated_one_hot(self, index):
-    assert len(index) == 1, "Length of index was {}".format(len(index))
-    index = index[0]
-
-    if index < int(self._mean.shape[-1]):
-      # Index corresponds to mean parameter.
-      mean_slice = self._fisher_mean_factor[:, index]
-      mean_slice = array_ops.expand_dims(mean_slice, axis=-1)
-      mean_output = insert_slice_in_zeros(mean_slice, 1, int(
-          self._mean.shape[1]), index)
-      var_output = array_ops.zeros_like(mean_output)
-    else:
-      index -= int(self._mean.shape[-1])
-      # Index corresponds to variance parameter.
-      var_slice = self._fisher_var_factor[:, index]
-      var_slice = array_ops.expand_dims(var_slice, axis=-1)
-      var_output = insert_slice_in_zeros(var_slice, 1,
-                                         int(self._variance.shape[1]), index)
-      mean_output = array_ops.zeros_like(var_output)
-
-    return mean_output, var_output
-
-  @property
-  def fisher_factor_inner_shape(self):
-    return array_ops.concat(
-        [
-            array_ops.shape(self._mean)[:-1],
-            2 * array_ops.shape(self._mean)[-1:]
-        ],
-        axis=0)
-
-  @property
-  def fisher_factor_inner_static_shape(self):
-    shape = self._mean.shape.as_list()
-    return tensor_shape.TensorShape(shape[-1:] + [2 * shape[-1]])
-
-  def multiply_hessian(self, vector):
-    raise NotImplementedError()
-
-  def multiply_hessian_factor(self, vector):
-    raise NotImplementedError()
-
-  def multiply_hessian_factor_transpose(self, vector):
-    raise NotImplementedError()
-
-  def multiply_hessian_factor_replicated_one_hot(self, index):
-    raise NotImplementedError()
-
-  @property
-  def hessian_factor_inner_shape(self):
-    raise NotImplementedError()
-
-  @property
-  def hessian_factor_inner_static_shape(self):
-    raise NotImplementedError()
-
-
-class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
-                                           NaturalParamsNegativeLogProbLoss):
-  """Neg log prob loss for a categorical distribution parameterized by logits.
-
-
-  Note that the Fisher (for a single case) of a categorical distribution, with
-  respect to the natural parameters (i.e. the logits), is given by:
-
-  F = diag(p) - p*p^T
-
-  where p = softmax(logits).  F can be factorized as F = B * B^T where
-
-  B = diag(q) - p*q^T
-
-  where q is the entry-wise square root of p. This is easy to verify using the
-  fact that q^T*q = 1.
-  """
-
-  def __init__(self, logits, targets=None, seed=None):
-    """Instantiates a CategoricalLogitsNegativeLogProbLoss.
-
-    Args:
-      logits: Tensor of shape [batch_size, output_size]. Parameters for
-        underlying distribution.
-      targets: None or Tensor of shape [output_size]. Each elements contains an
-        index in [0, output_size).
-      seed: int or None. Default random seed when sampling.
-    """
-    self._logits = logits
-    self._targets = targets
-    super(CategoricalLogitsNegativeLogProbLoss, self).__init__(seed=seed)
-
-  @property
-  def targets(self):
-    return self._targets
-
-  @property
-  def dist(self):
-    return categorical.Categorical(logits=self._logits)
-
-  @property
-  def _probs(self):
-    return self.dist.probs
-
-  @property
-  def _sqrt_probs(self):
-    return math_ops.sqrt(self._probs)
-
-  @property
-  def params(self):
-    return self._logits
-
-  def multiply_fisher(self, vector):
-    probs = self._probs
-    return vector * probs - probs * math_ops.reduce_sum(
-        vector * probs, axis=-1, keepdims=True)
-
-  def multiply_fisher_factor(self, vector):
-    probs = self._probs
-    sqrt_probs = self._sqrt_probs
-    return sqrt_probs * vector - probs * math_ops.reduce_sum(
-        sqrt_probs * vector, axis=-1, keepdims=True)
-
-  def multiply_fisher_factor_transpose(self, vector):
-    probs = self._probs
-    sqrt_probs = self._sqrt_probs
-    return sqrt_probs * vector - sqrt_probs * math_ops.reduce_sum(
-        probs * vector, axis=-1, keepdims=True)
-
-  def multiply_fisher_factor_replicated_one_hot(self, index):
-    assert len(index) == 1, "Length of index was {}".format(len(index))
-    probs = self._probs
-    sqrt_probs = self._sqrt_probs
-    sqrt_probs_slice = array_ops.expand_dims(sqrt_probs[:, index[0]], -1)
-    padded_slice = insert_slice_in_zeros(sqrt_probs_slice, 1,
-                                         int(sqrt_probs.shape[1]), index[0])
-    return padded_slice - probs * sqrt_probs_slice
-
-  @property
-  def fisher_factor_inner_shape(self):
-    return array_ops.shape(self._logits)
-
-  @property
-  def fisher_factor_inner_static_shape(self):
-    return self._logits.shape
-
-
-class MultiBernoulliNegativeLogProbLoss(DistributionNegativeLogProbLoss,
-                                        NaturalParamsNegativeLogProbLoss):
-  """Neg log prob loss for multiple Bernoulli distributions param'd by logits.
-
-  Represents N independent Bernoulli distributions where N = len(logits). Its
-  Fisher Information matrix is given by,
-
-  F = diag(p * (1-p))
-  p = sigmoid(logits)
-
-  As F is diagonal with positive entries, its factor B is,
-
-  B = diag(sqrt(p * (1-p)))
-  """
-
-  def __init__(self, logits, targets=None, seed=None):
-    self._logits = logits
-    self._targets = targets
-    super(MultiBernoulliNegativeLogProbLoss, self).__init__(seed=seed)
-
-  @property
-  def targets(self):
-    return self._targets
-
-  @property
-  def dist(self):
-    return bernoulli.Bernoulli(logits=self._logits)
-
-  @property
-  def _probs(self):
-    return self.dist.probs
-
-  @property
-  def params(self):
-    return self._logits
-
-  def multiply_fisher(self, vector):
-    return self._probs * (1 - self._probs) * vector
-
-  def multiply_fisher_factor(self, vector):
-    return math_ops.sqrt(self._probs * (1 - self._probs)) * vector
-
-  def multiply_fisher_factor_transpose(self, vector):
-    return self.multiply_fisher_factor(vector)  # it's symmetric in this case
-
-  def multiply_fisher_factor_replicated_one_hot(self, index):
-    assert len(index) == 1, "Length of index was {}".format(len(index))
-    probs_slice = array_ops.expand_dims(self._probs[:, index[0]], -1)
-    output_slice = math_ops.sqrt(probs_slice * (1 - probs_slice))
-    return insert_slice_in_zeros(output_slice, 1, int(self._logits.shape[1]),
-                                 index[0])
-
-  @property
-  def fisher_factor_inner_shape(self):
-    return array_ops.shape(self._logits)
-
-  @property
-  def fisher_factor_inner_static_shape(self):
-    return self._logits.shape
-
-
-def insert_slice_in_zeros(slice_to_insert, dim, dim_size, position):
-  """Inserts slice into a larger tensor of zeros.
-
-  Forms a new tensor which is the same shape as slice_to_insert, except that
-  the dimension given by 'dim' is expanded to the size given by 'dim_size'.
-  'position' determines the position (index) at which to insert the slice within
-  that dimension.
-
-  Assumes slice_to_insert.shape[dim] = 1.
-
-  Args:
-    slice_to_insert: The slice to insert.
-    dim: The dimension which to expand with zeros.
-    dim_size: The new size of the 'dim' dimension.
-    position: The position of 'slice_to_insert' in the new tensor.
-
-  Returns:
-    The new tensor.
-
-  Raises:
-    ValueError: If the slice's shape at the given dim is not 1.
-  """
-  slice_shape = slice_to_insert.shape
-  if slice_shape[dim] != 1:
-    raise ValueError("Expected slice_to_insert.shape to have {} dim of 1, but "
-                     "was {}".format(dim, slice_to_insert.shape[dim]))
-
-  before = [0] * int(len(slice_shape))
-  after = before[:]
-  before[dim] = position
-  after[dim] = dim_size - position - 1
-
-  return array_ops.pad(slice_to_insert, list(zip(before, after)))
-
-
-class OnehotCategoricalLogitsNegativeLogProbLoss(
-    CategoricalLogitsNegativeLogProbLoss):
-  """Neg log prob loss for a categorical distribution with onehot targets.
-
-  Identical to CategoricalLogitsNegativeLogProbLoss except that the underlying
-  distribution is OneHotCategorical as opposed to Categorical.
-  """
-
-  @property
-  def dist(self):
-    return onehot_categorical.OneHotCategorical(logits=self._logits)
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
deleted file mode 100644
index 4279cb2792..0000000000
--- a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Loss functions to be used by LayerCollection."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.loss_functions import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    "LossFunction",
-    "NegativeLogProbLoss",
-    "NaturalParamsNegativeLogProbLoss",
-    "DistributionNegativeLogProbLoss",
-    "NormalMeanNegativeLogProbLoss",
-    "NormalMeanVarianceNegativeLogProbLoss",
-    "CategoricalLogitsNegativeLogProbLoss",
-    "OnehotCategoricalLogitsNegativeLogProbLoss",
-    "MultiBernoulliNegativeLogProbLoss",
-    "insert_slice_in_zeros",
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/op_queue.py b/tensorflow/contrib/kfac/python/ops/op_queue.py
deleted file mode 100644
index b6d9d37a31..0000000000
--- a/tensorflow/contrib/kfac/python/ops/op_queue.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Helper for choosing which op to run next in a distributed setting."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops as tf_ops
-
-
-class OpQueue(object):
-  """Class for choosing which Op to run next.
-
-  Constructs an infinitely repeating sequence of Ops in shuffled order.
-
-  In K-FAC, this can be used to distribute inverse update operations among
-  workers.
-  """
-
-  def __init__(self, ops, seed=None):
-    """Initializes an OpQueue.
-
-    Args:
-      ops: list of TensorFlow Ops. Ops to be selected from. All workers must
-        initialize with the same set of ops.
-      seed: int or None. Random seed used when shuffling order of ops.
-    """
-    self._ops_by_name = {op.name: op for op in ops}
-
-    # Construct a (shuffled) Dataset with Op names.
-    op_names = tf_ops.convert_to_tensor(list(sorted(op.name for op in ops)))
-    op_names_dataset = (dataset_ops.Dataset.from_tensor_slices(op_names)
-                        .shuffle(len(ops), seed=seed).repeat())
-    self._next_op_name = op_names_dataset.make_one_shot_iterator().get_next()
-
-  @property
-  def ops(self):
-    """Ops this OpQueue can return in next_op()."""
-    return self._ops_by_name.values()
-
-  def next_op(self, sess):
-    """Chooses which op to run next.
-
-    Note: This call will make a call to sess.run().
-
-    Args:
-      sess: tf.Session.
-
-    Returns:
-      Next Op chosen from 'ops'.
-    """
-    # In Python 3, type(next_op_name) == bytes. Calling bytes.decode('ascii')
-    # returns a str.
-    next_op_name = sess.run(self._next_op_name).decode('ascii')
-    return self._ops_by_name[next_op_name]
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
deleted file mode 100644
index 03b9da7933..0000000000
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ /dev/null
@@ -1,727 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The KFAC optimizer."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import warnings
-
-# pylint disable=long-line
-from tensorflow.contrib.kfac.python.ops import curvature_matrix_vector_products as cmvp
-from tensorflow.contrib.kfac.python.ops import estimator as est
-# pylint enable=long-line
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.training import gradient_descent
-
-
-class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
-  """The KFAC Optimizer (https://arxiv.org/abs/1503.05671)."""
-
-  def __init__(self,
-               learning_rate,
-               cov_ema_decay,
-               damping,
-               layer_collection,
-               var_list=None,
-               momentum=0.9,
-               momentum_type="regular",
-               norm_constraint=None,
-               name="KFAC",
-               estimation_mode="gradients",
-               colocate_gradients_with_ops=True,
-               batch_size=None,
-               placement_strategy=None,
-               **kwargs):
-    """Initializes the KFAC optimizer with the given settings.
-
-    Args:
-      learning_rate: The base learning rate for the optimizer.  Should probably
-          be set to 1.0 when using momentum_type = 'qmodel', but can still be
-          set lowered if desired (effectively lowering the trust in the
-          quadratic model.)
-      cov_ema_decay: The decay factor used when calculating the covariance
-          estimate moving averages.
-      damping: The damping factor used to stabilize training due to errors in
-          the local approximation with the Fisher information matrix, and to
-          regularize the update direction by making it closer to the gradient.
-          If damping is adapted during training then this value is used for
-          initializing damping variable.
-          (Higher damping means the update looks more like a standard gradient
-          update - see Tikhonov regularization.)
-      layer_collection: The layer collection object, which holds the fisher
-          blocks, kronecker factors, and losses associated with the
-          graph.  The layer_collection cannot be modified after KfacOptimizer's
-          initialization.
-      var_list: Optional list or tuple of variables to train. Defaults to the
-          list of variables collected in the graph under the key
-          `GraphKeys.TRAINABLE_VARIABLES`.
-      momentum: The momentum decay constant to use. Only applies when
-          momentum_type is 'regular' or 'adam'. (Default: 0.9)
-      momentum_type: The type of momentum to use in this optimizer, one of
-          'regular', 'adam', or 'qmodel'. (Default: 'regular')
-      norm_constraint: float or Tensor. If specified, the update is scaled down
-          so that its approximate squared Fisher norm v^T F v is at most the
-          specified value. May only be used with momentum type 'regular'.
-          (Default: None)
-      name: The name for this optimizer. (Default: 'KFAC')
-      estimation_mode: The type of estimator to use for the Fishers.  Can be
-          'gradients', 'empirical', 'curvature_propagation', or 'exact'.
-          (Default: 'gradients'). See the doc-string for FisherEstimator for
-          more a more detailed description of these options.
-      colocate_gradients_with_ops: Whether we should request gradients we
-          compute in the estimator be colocated with their respective ops.
-          (Default: True)
-      batch_size: The size of the mini-batch. Only needed when momentum_type
-          == 'qmodel' or when automatic adjustment is used.  (Default: None)
-      placement_strategy: string, Device placement strategy used when creating
-        covariance variables, covariance ops, and inverse ops.
-        (Default: `None`)
-      **kwargs: Arguments to be passesd to specific placement
-        strategy mixin. Check `placement.RoundRobinPlacementMixin` for example.
-
-    Raises:
-      ValueError: If the momentum type is unsupported.
-      ValueError: If clipping is used with momentum type other than 'regular'.
-      ValueError: If no losses have been registered with layer_collection.
-      ValueError: If momentum is non-zero and momentum_type is not 'regular'
-          or 'adam'.
-    """
-    warnings.warn(
-        "third_party.tensorflow.contrib.kfac is deprecated."
-        "This will be removed on 15-07-2018. Check README for further details.",
-        DeprecationWarning)
-    # Parameters to be passed to the Fisher estimator:
-    self._variables = var_list or tf_variables.trainable_variables
-    self._cov_ema_decay = cov_ema_decay
-    self._layers = layer_collection
-    self._estimation_mode = estimation_mode
-    self._colocate_gradients_with_ops = colocate_gradients_with_ops
-
-    # The below parameters are required only if damping needs to be adapated.
-    # These parameters can be set by calling
-    # set_damping_adaptation_params() explicitly.
-    self._damping_adaptation_decay = 0.95
-    self._damping_adaptation_interval = 5
-    # Check section 6.5 KFAC paper. omega(1) = pow(damping decay, interval)
-    self._omega = (
-        self._damping_adaptation_decay**self._damping_adaptation_interval)
-    self._adapt_damping = False
-    self._min_damping = 1e-5
-    self._prev_train_batch = None
-    self._is_chief = False
-    self._loss_fn = None
-    self._damping_constant = damping
-    self._damping = None
-    self._rho = None
-    self._prev_loss = None
-    self._q_model_change = None
-    self._update_damping_op = None
-
-    momentum_type = momentum_type.lower()
-    legal_momentum_types = ["regular", "adam", "qmodel"]
-
-    if momentum_type not in legal_momentum_types:
-      raise ValueError("Unsupported momentum type {}. Must be one of {}."
-                       .format(momentum_type, legal_momentum_types))
-    if momentum_type != "regular" and norm_constraint is not None:
-      raise ValueError("Update clipping is only supported with momentum "
-                       "type 'regular'.")
-    if momentum_type not in ["regular", "adam"] and momentum != 0:
-      raise ValueError("Momentum must be unspecified if using a momentum_type "
-                       "other than 'regular' or 'adam'.")
-
-    # Extra parameters of the optimizer
-    self._momentum = momentum
-    self._momentum_type = momentum_type
-    self._norm_constraint = norm_constraint
-    self._batch_size = batch_size
-    self._placement_strategy = placement_strategy
-
-    with variable_scope.variable_scope(name):
-      self._fisher_est = est.make_fisher_estimator(
-          placement_strategy=placement_strategy,
-          variables=self._variables,
-          cov_ema_decay=self._cov_ema_decay,
-          damping=self.damping,
-          layer_collection=self._layers,
-          exps=(-1,),
-          estimation_mode=self._estimation_mode,
-          colocate_gradients_with_ops=self._colocate_gradients_with_ops,
-          **kwargs)
-
-    super(KfacOptimizer, self).__init__(learning_rate, name=name)
-
-  def set_damping_adaptation_params(self,
-                                    is_chief,
-                                    prev_train_batch,
-                                    loss_fn,
-                                    min_damping=1e-5,
-                                    damping_adaptation_decay=0.99,
-                                    damping_adaptation_interval=5):
-    """Sets parameters required to adapt damping during training.
-
-    When called, enables damping adaptation according to the Levenberg-Marquardt
-    style rule described in Section 6.5 of "Optimizing Neural Networks with
-    Kronecker-factored Approximate Curvature".
-
-    Note that this function creates Tensorflow variables which store a few
-    scalars and are accessed by the ops which update the damping (as part
-    of the training op returned by the minimize() method).
-
-    Args:
-      is_chief: `Boolean`, `True` if the worker is chief.
-      prev_train_batch: Training data used to minimize loss in the previous
-        step. This will be used to evaluate loss by calling
-        `loss_fn(prev_train_batch)`.
-      loss_fn: `function` that takes as input training data tensor and returns
-        a scalar loss.
-      min_damping: `float`(Optional), Minimum value the damping parameter
-        can take. Default value 1e-5.
-      damping_adaptation_decay: `float`(Optional), The `damping` parameter is
-        multiplied by the `damping_adaptation_decay` every
-        `damping_adaptation_interval` number of iterations. Default value 0.99.
-      damping_adaptation_interval: `int`(Optional), Number of steps in between
-        updating the `damping` parameter. Default value 5.
-
-    Raises:
-      ValueError: If `set_damping_adaptation_params` is already called and the
-        the `adapt_damping` is `True`.
-    """
-    if self._adapt_damping:
-      raise ValueError("Damping adaptation parameters already set.")
-
-    with variable_scope.variable_scope(self.get_name()):
-      self._adapt_damping = True
-      self._is_chief = is_chief
-      self._prev_train_batch = prev_train_batch
-      self._loss_fn = loss_fn
-      self._damping_adaptation_decay = damping_adaptation_decay
-      self._damping_adaptation_interval = damping_adaptation_interval
-      self._omega = (
-          self._damping_adaptation_decay**self._damping_adaptation_interval)
-      self._min_damping = min_damping
-
-      self._rho = variable_scope.get_variable(
-          "rho", shape=(), dtype=dtypes.float32, trainable=False)  # LM ratio.
-      self._prev_loss = variable_scope.get_variable(
-          "prev_loss", shape=(), dtype=dtypes.float32, trainable=False)
-      self._q_model_change = variable_scope.get_variable(
-          "q_model_change", shape=(), dtype=dtypes.float32, trainable=False)
-      self._damping = variable_scope.get_variable(
-          "damping", initializer=self._damping_constant, trainable=False)
-
-  @property
-  def variables(self):
-    return self._fisher_est.variables
-
-  @property
-  def damping(self):
-    if self._damping:
-      return self._damping
-    else:
-      return self._damping_constant
-
-  @property
-  def damping_adaptation_interval(self):
-    return self._damping_adaptation_interval
-
-  def make_vars_and_create_op_thunks(self):
-    """Make vars and create op thunks.
-
-    Returns:
-      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
-        the list of factors given by the "factors" property.
-      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
-        the list of factors given by the "factors" property.
-    """
-    scope = self.get_name() + "/" + self._fisher_est.name
-    return self._fisher_est.make_vars_and_create_op_thunks(scope=scope)
-
-  def create_ops_and_vars_thunks(self):
-    """Create thunks that make the ops and vars on demand.
-
-    This function returns 4 lists of thunks: cov_variable_thunks,
-    cov_update_thunks, inv_variable_thunks, and inv_update_thunks.
-
-    The length of each list is the number of factors and the i-th element of
-    each list corresponds to the i-th factor (given by the "factors" property).
-
-    Note that the execution of these thunks must happen in a certain
-    partial order.  The i-th element of cov_variable_thunks must execute
-    before the i-th element of cov_update_thunks (and also the i-th element
-    of inv_update_thunks).  Similarly, the i-th element of inv_variable_thunks
-    must execute before the i-th element of inv_update_thunks.
-
-    TL;DR (oversimplified): Execute the thunks according to the order that
-    they are returned.
-
-    Returns:
-      cov_variable_thunks: A list of thunks that make the cov variables.
-      cov_update_thunks: A list of thunks that make the cov update ops.
-      inv_variable_thunks: A list of thunks that make the inv variables.
-      inv_update_thunks: A list of thunks that make the inv update ops.
-    """
-    scope = self.get_name() + "/" + self._fisher_est.name
-    return self._fisher_est.create_ops_and_vars_thunks(scope=scope)
-
-  def minimize(self, *args, **kwargs):
-    # Should this variable scope encompass everything below?  Or will the super-
-    # class make another copy of the same name scope?
-    with variable_scope.variable_scope(self.get_name()):
-      kwargs["var_list"] = kwargs.get("var_list") or self.variables
-      if set(kwargs["var_list"]) != set(self.variables):
-        raise ValueError("var_list doesn't match with set of Fisher-estimating "
-                         "variables.")
-      if self._adapt_damping and self._is_chief:
-        global_step = kwargs.get("global_step", None)
-        if not global_step:
-          raise KeyError("global_step needs to be passed to optimizer.minimize "
-                         "if damping parameter is adapted.")
-        update_damping_op = self._update_damping(self._prev_train_batch,
-                                                 global_step)
-        with ops.control_dependencies([update_damping_op]):
-          loss = args[0]
-          loss_assign_op = state_ops.assign(self._prev_loss, loss)
-          train_op = super(KfacOptimizer, self).minimize(*args, **kwargs)
-          return control_flow_ops.group(loss_assign_op, train_op)
-      else:
-        return super(KfacOptimizer, self).minimize(*args, **kwargs)
-
-  def compute_gradients(self, *args, **kwargs):
-    # args[1] could be our var_list
-    if len(args) > 1:
-      var_list = args[1]
-    else:
-      kwargs["var_list"] = kwargs.get("var_list") or self.variables
-      var_list = kwargs["var_list"]
-
-    if set(var_list) != set(self.variables):
-      raise ValueError("var_list doesn't match with set of Fisher-estimating "
-                       "variables.")
-    return super(KfacOptimizer, self).compute_gradients(*args, **kwargs)
-
-  def apply_gradients(self, grads_and_vars, *args, **kwargs):
-    """Applies gradients to variables.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-      *args: Additional arguments for super.apply_gradients.
-      **kwargs: Additional keyword arguments for super.apply_gradients.
-
-    Returns:
-      An `Operation` that applies the specified gradients.
-    """
-    # In Python 3, grads_and_vars can be a zip() object which can only be
-    # iterated over once. By converting it to a list, we ensure that it can be
-    # iterated over more than once.
-    grads_and_vars = list(grads_and_vars)
-
-    # Compute step.
-    steps_and_vars = self._compute_update_steps(grads_and_vars)
-
-    # Update trainable variables with this step.
-    return super(KfacOptimizer, self).apply_gradients(steps_and_vars, *args,
-                                                      **kwargs)
-
-  def _squared_fisher_norm(self, grads_and_vars, precon_grads_and_vars):
-    """Computes the squared (approximate) Fisher norm of the updates.
-
-    This is defined as v^T F v, where F is the approximate Fisher matrix
-    as computed by the estimator, and v = F^{-1} g, where g is the gradient.
-    This is computed efficiently as v^T g.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-      precon_grads_and_vars: List of (preconditioned gradient, variable) pairs.
-        Must be the result of calling `self._fisher_est.multiply_inverse`
-        on `grads_and_vars`.
-
-    Returns:
-      Scalar representing the squared norm.
-
-    Raises:
-      ValueError: if the two list arguments do not contain the same variables,
-        in the same order.
-    """
-    for (_, gvar), (_, pgvar) in zip(grads_and_vars, precon_grads_and_vars):
-      if gvar is not pgvar:
-        raise ValueError("The variables referenced by the two arguments "
-                         "must match.")
-    terms = [
-        math_ops.reduce_sum(grad * pgrad)
-        for (grad, _), (pgrad, _) in zip(grads_and_vars, precon_grads_and_vars)
-    ]
-    return math_ops.reduce_sum(terms)
-
-  def _update_clip_coeff(self, grads_and_vars, precon_grads_and_vars):
-    """Computes the scale factor for the update to satisfy the norm constraint.
-
-    Defined as min(1, sqrt(c / r^T F r)), where c is the norm constraint,
-    F is the approximate Fisher matrix, and r is the update vector, i.e.
-    -alpha * v, where alpha is the learning rate, and v is the preconditioned
-    gradient.
-
-    This is based on Section 5 of Ba et al., Distributed Second-Order
-    Optimization using Kronecker-Factored Approximations. Note that they
-    absorb the learning rate alpha (which they denote eta_max) into the formula
-    for the coefficient, while in our implementation, the rescaling is done
-    before multiplying by alpha. Hence, our formula differs from theirs by a
-    factor of alpha.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-      precon_grads_and_vars: List of (preconditioned gradient, variable) pairs.
-        Must be the result of calling `self._fisher_est.multiply_inverse`
-        on `grads_and_vars`.
-
-    Returns:
-      Scalar representing the coefficient which should be applied to the
-      preconditioned gradients to satisfy the norm constraint.
-    """
-    sq_norm_grad = self._squared_fisher_norm(grads_and_vars,
-                                             precon_grads_and_vars)
-    sq_norm_up = sq_norm_grad * self._learning_rate**2
-    return math_ops.minimum(1.,
-                            math_ops.sqrt(self._norm_constraint / sq_norm_up))
-
-  def _clip_updates(self, grads_and_vars, precon_grads_and_vars):
-    """Rescales the preconditioned gradients to satisfy the norm constraint.
-
-    Rescales the preconditioned gradients such that the resulting update r
-    (after multiplying by the learning rate) will satisfy the norm constraint.
-    This constraint is that r^T F r <= C, where F is the approximate Fisher
-    matrix, and C is the norm_constraint attribute. See Section 5 of
-    Ba et al., Distributed Second-Order Optimization using Kronecker-Factored
-    Approximations.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-      precon_grads_and_vars: List of (preconditioned gradient, variable) pairs.
-        Must be the result of calling `self._fisher_est.multiply_inverse`
-        on `grads_and_vars`.
-
-    Returns:
-      List of (rescaled preconditioned gradient, variable) pairs.
-    """
-    coeff = self._update_clip_coeff(grads_and_vars, precon_grads_and_vars)
-    return [(pgrad * coeff, var) for pgrad, var in precon_grads_and_vars]
-
-  def _compute_prev_updates(self, variables):
-    """Computes previous updates as negative velocities scaled by learning rate.
-
-    Args:
-      variables: List of variables in the graph that the update will be
-          applied to.
-
-    Returns:
-      List of previous updates applied to the `variables`.
-    """
-    return list(
-        -1 * self._learning_rate * self._zeros_slot(var, "velocity", self._name)
-        for var in variables)
-
-  def _compute_qmodel_hyperparams(self, precon_grads, prev_updates, grads,
-                                  variables):
-    """Compute optimal update hyperparameters from the quadratic model.
-
-    More specifically, if L is the loss we minimize a quadratic approximation
-    of L(theta + d) which we denote by qmodel(d) with
-    d = alpha*precon_grad + mu*prev_update with respect to alpha and mu, where
-
-      qmodel(d) = (1/2) * d^T * B * d + grad^T*d + L(theta) .
-
-    Unlike in the KL clipping approach we use the non-approximated quadratic
-    model where the curvature matrix C is the true Fisher on the current
-    mini-batch (computed without any approximations beyond mini-batch sampling),
-    with the usual Tikhonov damping/regularization applied,
-
-      C = F + damping * I
-
-    See Section 7 of https://arxiv.org/abs/1503.05671 for a derivation of
-    the formula.  See Appendix C for a discussion of the trick of using
-    a factorized Fisher matrix to more efficiently compute the required
-    vector-matrix-vector products.
-
-    Note that the elements of all 4 lists passed to this function must
-    be in correspondence with each other.
-
-    Args:
-      precon_grads: List of preconditioned gradients.
-      prev_updates: List of updates computed at the previous iteration.
-      grads: List of gradients.
-      variables: List of variables in the graph that the update will be
-          applied to. (Note that this function doesn't actually apply the
-          update.)
-
-    Returns:
-      (alpha, mu, qmodel_change), where alpha and mu are chosen to optimize the
-      quadratic model, and
-      qmodel_change = qmodel(alpha*precon_grad + mu*prev_update) - qmodel(0)
-                    = qmodel(alpha*precon_grad + mu*prev_update) - L(theta).
-    """
-
-    cmvpc = cmvp.CurvatureMatrixVectorProductComputer(self._layers.losses,
-                                                      variables)
-
-    # compute the matrix-vector products with the transposed Fisher factor
-    fft_precon_grads = cmvpc.multiply_fisher_factor_transpose(precon_grads)
-    fft_prev_updates = cmvpc.multiply_fisher_factor_transpose(prev_updates)
-    batch_size = math_ops.cast(
-        self._batch_size, dtype=fft_precon_grads[0].dtype)
-
-    # compute the entries of the 2x2 matrix
-    m_11 = (
-        _inner_product_list(fft_precon_grads, fft_precon_grads) / batch_size +
-        self.damping * _inner_product_list(precon_grads, precon_grads))
-
-    m_21 = (
-        _inner_product_list(fft_prev_updates, fft_precon_grads) / batch_size +
-        self.damping * _inner_product_list(prev_updates, precon_grads))
-
-    m_22 = (
-        _inner_product_list(fft_prev_updates, fft_prev_updates) / batch_size +
-        self.damping * _inner_product_list(prev_updates, prev_updates))
-
-    def non_zero_prevupd_case():
-      r"""Computes optimal (alpha, mu) given non-zero previous update.
-
-      We solve the full 2x2 linear system. See Martens & Grosse (2015),
-      Section 7, definition of $\alpha^*$ and $\mu^*$.
-
-      Returns:
-        (alpha, mu, qmodel_change), where alpha and mu are chosen to optimize
-        the quadratic model, and
-        qmodel_change = qmodel(alpha*precon_grad + mu*prev_update) - qmodel(0).
-      """
-      m = ops.convert_to_tensor([[m_11, m_21], [m_21, m_22]])
-
-      c = ops.convert_to_tensor([[_inner_product_list(grads, precon_grads)],
-                                 [_inner_product_list(grads, prev_updates)]])
-
-      sol = -1. * _two_by_two_solve(m, c)
-      alpha = sol[0]
-      mu = sol[1]
-      qmodel_change = 0.5 * math_ops.reduce_sum(sol * c)
-
-      return alpha, mu, qmodel_change
-
-    def zero_prevupd_case():
-      r"""Computes optimal (alpha, mu) given all-zero previous update.
-
-      The linear system reduces to 1x1. See Martens & Grosse (2015),
-      Section 6.4, definition of $\alpha^*$.
-
-      Returns:
-        (alpha, 0.0, qmodel_change), where alpha is chosen to optimize the
-        quadratic model, and
-        qmodel_change = qmodel(alpha*precon_grad) - qmodel(0)
-      """
-      m = m_11
-      c = _inner_product_list(grads, precon_grads)
-
-      alpha = -c / m
-      mu = 0.0
-      qmodel_change = 0.5 * alpha * c
-
-      return alpha, mu, qmodel_change
-
-    return control_flow_ops.cond(
-        math_ops.equal(m_22, 0.0), zero_prevupd_case, non_zero_prevupd_case)
-
-  def _assign_q_model_change(self, q_model_change):
-    """Assigns `q_model_change` to `self._q_model_change` if damping is adapted.
-
-    Note only the chief worker does the assignment.
-
-    Args:
-      q_model_change: Scalar tensor of type `float32`.
-
-    Returns:
-      If `adapt_damping` is `True` then returns an assign op, Otherwise returns
-      a no_op().
-    """
-    if self._adapt_damping and self._is_chief:
-      q_model_assign_op = state_ops.assign(self._q_model_change, q_model_change)
-    else:
-      q_model_assign_op = control_flow_ops.no_op()
-    return q_model_assign_op
-
-  def _compute_qmodel_hyperparams_wrapper(self, grads_and_vars,
-                                          precon_grads_and_vars):
-    """Wrapper function for `self._compute_qmodel_hyperparams`.
-
-    Constructs a list of preconditioned gradients and variables. Also creates a
-    op to asssign the computed q model change to `self._q_model_change`.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-      precon_grads_and_vars: List of (preconditioned gradients, variable)
-        pairs.
-
-    Returns:
-      (alpha, mu, q_model_assign_op), where alpha and mu are chosen to optimize
-      the quadratic model, `q_model_assign_op` assigns the computed q model
-      change to `self._q_model_change`.
-    """
-    precon_grads = list(
-        precon_grad for (precon_grad, _) in precon_grads_and_vars)
-    grads = list(grad for (grad, _) in grads_and_vars)
-    variables = list(var for (_, var) in grads_and_vars)
-    prev_updates = self._compute_prev_updates(variables)
-    # Compute optimal velocity update parameters according to quadratic model
-    alpha, mu, q_model_change = self._compute_qmodel_hyperparams(
-        precon_grads, prev_updates, grads, variables)
-
-    return alpha, mu, self._assign_q_model_change(q_model_change)
-
-  def _compute_update_steps(self, grads_and_vars):
-    """Computes the update steps for the variables given the gradients.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs.
-
-    Returns:
-      A list of tuple (assign_op ,var) where `assign_op` assigns the update
-      steps to `var`.
-    """
-
-    if self._momentum_type == "regular":
-      # Compute "preconditioned" gradient.
-      precon_grads_and_vars = self._fisher_est.multiply_inverse(grads_and_vars)
-
-      # Apply "KL clipping" if asked for.
-      if self._norm_constraint is not None:
-        precon_grads_and_vars = self._clip_updates(grads_and_vars,
-                                                   precon_grads_and_vars)
-
-      # Update the velocity with this and return it as the step.
-      if self._adapt_damping and self._is_chief:
-        _, _, q_model_assign_op = self._compute_qmodel_hyperparams_wrapper(
-            grads_and_vars, precon_grads_and_vars)
-        with ops.control_dependencies([q_model_assign_op]):
-          return self._update_velocities(precon_grads_and_vars, self._momentum)
-      else:
-        return self._update_velocities(precon_grads_and_vars, self._momentum)
-    elif self._momentum_type == "adam":
-      # Update velocity.
-      velocities_and_vars = self._update_velocities(grads_and_vars,
-                                                    self._momentum)
-      # Return "preconditioned" velocity vector as the step.
-      return self._fisher_est.multiply_inverse(velocities_and_vars)
-
-    elif self._momentum_type == "qmodel":
-      # Compute "preconditioned" gradient.
-      precon_grads_and_vars = self._fisher_est.multiply_inverse(grads_and_vars)
-
-      # Compute optimal velocity update parameters according to quadratic model
-      alpha, mu, q_model_assign_op = self._compute_qmodel_hyperparams_wrapper(
-          grads_and_vars, precon_grads_and_vars)
-
-      with ops.control_dependencies([q_model_assign_op]):
-        return self._update_velocities(
-            precon_grads_and_vars, mu, vec_coeff=-alpha)
-
-  def _update_velocities(self, vecs_and_vars, decay, vec_coeff=1.0):
-    """Updates the velocities of the variables with the given vectors.
-
-    Args:
-      vecs_and_vars: List of (vector, variable) pairs.
-      decay: How much to decay the old velocity by.  This is often referred to
-        as the 'momentum constant'.
-      vec_coeff: Coefficient to apply to the vectors before adding them to the
-        velocity.
-
-    Returns:
-      A list of (velocity, var) indicating the new velocity for each var.
-    """
-
-    def _update_velocity(vec, var):
-      velocity = self._zeros_slot(var, "velocity", self._name)
-      with ops.colocate_with(velocity):
-        # NOTE(mattjj): read/modify/write race condition not suitable for async.
-
-        # Compute the new velocity for this variable.
-        new_velocity = decay * velocity + vec_coeff * vec
-
-        # Save the updated velocity.
-        return (array_ops.identity(velocity.assign(new_velocity)), var)
-
-    # Go through variable and update its associated part of the velocity vector.
-    return [_update_velocity(vec, var) for vec, var in vecs_and_vars]
-
-  def _update_damping(self, prev_batch, global_step):
-    """Adapts damping parameter. Check KFAC (Section 6.5) for the details.
-
-    The damping parameter is updated according to the Levenberg-Marquardt rule
-    every `self._damping_adaptation_interval` iterations.
-
-    Args:
-      prev_batch: Tensor or tuple of tensors which can be passed to
-        `self._loss_fn` to evaluate loss.
-      global_step: `Variable` which keeps track of number of times the training
-        variables have been updated.
-    Returns:
-      A `tf.cond` op which updates the damping parameter.
-    """
-    def compute_damping():
-      """"Adapts damping parameter based on "reduction ratio".
-
-      Reduction ratio captures how closely the quadratic approximation to the
-      loss function approximates the actual loss within a trust region. The
-      damping update tries to make the damping as small as possible while
-      maintaining the property that the quadratic model remains a good local
-      approximation to the loss function.
-
-      Returns:
-        An Op to assign newly computed damping value to `self._damping`.
-      """
-      prev_batch_loss = self._loss_fn(prev_batch)
-      with ops.control_dependencies([prev_batch_loss]):
-        rho_assign = self._rho.assign(
-            (prev_batch_loss - self._prev_loss) / self._q_model_change)
-        with ops.control_dependencies([rho_assign]):
-          new_damping = control_flow_ops.case(
-              [(self._rho < 0.25, lambda: self.damping / self._omega),
-               (self._rho > 0.75, lambda: self.damping * self._omega)],
-              lambda: self.damping)
-          with ops.control_dependencies([new_damping]):
-            new_damping_min = math_ops.maximum(new_damping, self._min_damping)
-            return control_flow_ops.group(self._damping.assign(new_damping_min))
-
-    return control_flow_ops.cond(
-        math_ops.equal(
-            math_ops.mod(global_step + 1, self._damping_adaptation_interval),
-            0), compute_damping, control_flow_ops.no_op)
-
-
-def _inner_product_list(list1, list2):
-  return math_ops.add_n(
-      [math_ops.reduce_sum(elt1 * elt2) for elt1, elt2 in zip(list1, list2)])
-
-
-def _two_by_two_solve(m, c):
-  # it might be better just to crank out the exact formula for 2x2 inverses
-  return math_ops.matmul(linalg_ops.matrix_inverse(m), c)
diff --git a/tensorflow/contrib/kfac/python/ops/placement.py b/tensorflow/contrib/kfac/python/ops/placement.py
deleted file mode 100644
index c4454325ae..0000000000
--- a/tensorflow/contrib/kfac/python/ops/placement.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implements placement strategies for cov and inv ops, cov variables."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-from tensorflow.python.framework import ops as tf_ops
-
-
-def _make_thunk_on_device(func, device):
-  def thunk():
-    with tf_ops.device(device):
-      return func()
-  return thunk
-
-
-class RoundRobinPlacementMixin(object):
-  """Implements round robin placement strategy for ops and variables."""
-
-  def __init__(self, cov_devices=None, inv_devices=None, **kwargs):
-    """Initializes the RoundRobinPlacementMixin class.
-
-    Args:
-      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
-      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
-        computations will be placed on these devices in a round-robin fashion.
-        Can be None, which means that no devices are specified.
-      **kwargs: Need something here?
-
-    """
-    super(RoundRobinPlacementMixin, self).__init__(**kwargs)
-    self._cov_devices = cov_devices
-    self._inv_devices = inv_devices
-
-  def make_vars_and_create_op_thunks(self, scope=None):
-    """Make vars and create op thunks w/ a round-robin device placement start.
-
-    For each factor, all of that factor's cov variables and their associated
-    update ops will be placed on a particular device.  A new device is chosen
-    for each factor by cycling through list of devices in the
-    `self._cov_devices` attribute. If `self._cov_devices` is `Non`e then no
-    explicit device placement occurs.
-
-    An analogous strategy is followed for inverse update ops, with the list of
-    devices being given by the `self._inv_devices` attribute.
-
-    Inverse variables on the other hand are not placed on any specific device
-    (they will just use the current the device placement context, whatever
-    that happens to be).  The idea is that the inverse variable belong where
-    they will be accessed most often, which is the device that actually applies
-    the preconditioner to the gradient. The user will be responsible for setting
-    the device context for this.
-
-    Args:
-      scope: A string or None.  If None it will be set to the name of this
-        estimator (given by the name property). All variables will be created,
-        and all thunks will execute, inside of a variable scope of the given
-        name. (Default: None)
-
-    Returns:
-      cov_update_thunks: List of cov update thunks. Corresponds one-to-one with
-        the list of factors given by the "factors" property.
-      inv_update_thunks: List of inv update thunks. Corresponds one-to-one with
-        the list of factors given by the "factors" property.
-    """
-    # Note: `create_ops_and_vars_thunks` is implemented in `FisherEstimator`.
-    (cov_variable_thunks_raw, cov_update_thunks_raw, inv_variable_thunks_raw,
-     inv_update_thunks_raw) = self.create_ops_and_vars_thunks(scope=scope)
-
-    if self._cov_devices:
-      cov_update_thunks = []
-      for cov_variable_thunk, cov_update_thunk, device in zip(
-          cov_variable_thunks_raw, cov_update_thunks_raw,
-          itertools.cycle(self._cov_devices)):
-        with tf_ops.device(device):
-          cov_variable_thunk()
-        cov_update_thunks.append(_make_thunk_on_device(cov_update_thunk,
-                                                       device))
-    else:
-      for cov_variable_thunk in cov_variable_thunks_raw:
-        cov_variable_thunk()
-      cov_update_thunks = cov_update_thunks_raw
-
-    for inv_variable_thunk in inv_variable_thunks_raw:
-      inv_variable_thunk()
-
-    if self._inv_devices:
-      inv_update_thunks = []
-      for inv_update_thunk, device in zip(inv_update_thunks_raw,
-                                          itertools.cycle(self._inv_devices)):
-        inv_update_thunks.append(_make_thunk_on_device(inv_update_thunk,
-                                                       device))
-    else:
-      inv_update_thunks = inv_update_thunks_raw
-
-    return cov_update_thunks, inv_update_thunks
diff --git a/tensorflow/contrib/kfac/python/ops/utils.py b/tensorflow/contrib/kfac/python/ops/utils.py
deleted file mode 100644
index 144295f4c7..0000000000
--- a/tensorflow/contrib/kfac/python/ops/utils.py
+++ /dev/null
@@ -1,709 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility functions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variables
-
-# Method used for inverting matrices.
-POSDEF_INV_METHOD = "cholesky"
-POSDEF_EIG_METHOD = "self_adjoint"
-
-
-def set_global_constants(posdef_inv_method=None):
-  """Sets various global constants used by the classes in this module."""
-  global POSDEF_INV_METHOD
-
-  if posdef_inv_method is not None:
-    POSDEF_INV_METHOD = posdef_inv_method
-
-
-class SequenceDict(object):
-  """A dict convenience wrapper that allows getting/setting with sequences."""
-
-  def __init__(self, iterable=None):
-    self._dict = dict(iterable or [])
-
-  def __getitem__(self, key_or_keys):
-    if isinstance(key_or_keys, (tuple, list)):
-      return list(map(self.__getitem__, key_or_keys))
-    else:
-      return self._dict[key_or_keys]
-
-  def __setitem__(self, key_or_keys, val_or_vals):
-    if isinstance(key_or_keys, (tuple, list)):
-      for key, value in zip(key_or_keys, val_or_vals):
-        self[key] = value
-    else:
-      self._dict[key_or_keys] = val_or_vals
-
-  def items(self):
-    return list(self._dict.items())
-
-
-def tensors_to_column(tensors):
-  """Converts a tensor or list of tensors to a column vector.
-
-  Args:
-    tensors: A tensor or list of tensors.
-
-  Returns:
-    The tensors reshaped into vectors and stacked on top of each other.
-  """
-  if isinstance(tensors, (tuple, list)):
-    return array_ops.concat(
-        tuple(array_ops.reshape(tensor, [-1, 1]) for tensor in tensors), axis=0)
-  else:
-    return array_ops.reshape(tensors, [-1, 1])
-
-
-def column_to_tensors(tensors_template, colvec):
-  """Converts a column vector back to the shape of the given template.
-
-  Args:
-    tensors_template: A tensor or list of tensors.
-    colvec: A 2d column vector with the same shape as the value of
-        tensors_to_column(tensors_template).
-
-  Returns:
-    X, where X is tensor or list of tensors with the properties:
-     1) tensors_to_column(X) = colvec
-     2) X (or its elements) have the same shape as tensors_template (or its
-        elements)
-  """
-  if isinstance(tensors_template, (tuple, list)):
-    offset = 0
-    tensors = []
-    for tensor_template in tensors_template:
-      sz = np.prod(tensor_template.shape.as_list(), dtype=np.int32)
-      tensor = array_ops.reshape(colvec[offset:(offset + sz)],
-                                 tensor_template.shape)
-      tensors.append(tensor)
-      offset += sz
-
-    tensors = tuple(tensors)
-  else:
-    tensors = array_ops.reshape(colvec, tensors_template.shape)
-
-  return tensors
-
-
-def kronecker_product(mat1, mat2):
-  """Computes the Kronecker product two matrices."""
-  m1, n1 = mat1.get_shape().as_list()
-  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
-  m2, n2 = mat2.get_shape().as_list()
-  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
-  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
-
-
-def layer_params_to_mat2d(vector):
-  """Converts a vector shaped like layer parameters to a 2D matrix.
-
-  In particular, we reshape the weights/filter component of the vector to be
-  2D, flattening all leading (input) dimensions. If there is a bias component,
-  we concatenate it to the reshaped weights/filter component.
-
-  Args:
-    vector: A Tensor or pair of Tensors shaped like layer parameters.
-
-  Returns:
-    A 2D Tensor with the same coefficients and the same output dimension.
-  """
-  if isinstance(vector, (tuple, list)):
-    w_part, b_part = vector
-    w_part_reshaped = array_ops.reshape(w_part,
-                                        [-1, w_part.shape.as_list()[-1]])
-    return array_ops.concat(
-        (w_part_reshaped, array_ops.reshape(b_part, [1, -1])), axis=0)
-  elif isinstance(vector, ops.IndexedSlices):
-    return vector
-  else:  # Tensor or Tensor-like.
-    return array_ops.reshape(vector, [-1, vector.shape.as_list()[-1]])
-
-
-def mat2d_to_layer_params(vector_template, mat2d):
-  """Converts a canonical 2D matrix representation back to a vector.
-
-  Args:
-    vector_template: A Tensor or pair of Tensors shaped like layer parameters.
-    mat2d: A 2D Tensor with the same shape as the value of
-        layer_params_to_mat2d(vector_template).
-
-  Returns:
-    A Tensor or pair of Tensors with the same coefficients as mat2d and the same
-        shape as vector_template.
-  """
-  if isinstance(vector_template, (tuple, list)):
-    w_part, b_part = mat2d[:-1], mat2d[-1]
-    return array_ops.reshape(w_part, vector_template[0].shape), b_part
-  elif isinstance(vector_template, ops.IndexedSlices):
-    if not isinstance(mat2d, ops.IndexedSlices):
-      raise TypeError(
-          "If vector_template is an IndexedSlices, so should mat2d.")
-    return mat2d
-  else:
-    return array_ops.reshape(mat2d, vector_template.shape)
-
-
-def posdef_inv(tensor, damping):
-  """Computes the inverse of tensor + damping * identity."""
-  identity = linalg_ops.eye(tensor.shape.as_list()[0], dtype=tensor.dtype)
-  damping = math_ops.cast(damping, dtype=tensor.dtype)
-  return posdef_inv_functions[POSDEF_INV_METHOD](tensor, identity, damping)
-
-
-def posdef_inv_matrix_inverse(tensor, identity, damping):
-  """Computes inverse(tensor + damping * identity) directly."""
-  return linalg_ops.matrix_inverse(tensor + damping * identity)
-
-
-def posdef_inv_cholesky(tensor, identity, damping):
-  """Computes inverse(tensor + damping * identity) with Cholesky."""
-  chol = linalg_ops.cholesky(tensor + damping * identity)
-  return linalg_ops.cholesky_solve(chol, identity)
-
-
-def posdef_inv_eig(tensor, identity, damping):
-  """Computes inverse(tensor + damping * identity) with eigendecomposition."""
-  eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(
-      tensor + damping * identity)
-  return math_ops.matmul(
-      eigenvectors / eigenvalues, eigenvectors, transpose_b=True)
-
-
-posdef_inv_functions = {
-    "matrix_inverse": posdef_inv_matrix_inverse,
-    "cholesky": posdef_inv_cholesky,
-    "eig": posdef_inv_eig,
-}
-
-
-def posdef_eig(mat):
-  """Computes the eigendecomposition of a positive semidefinite matrix."""
-  return posdef_eig_functions[POSDEF_EIG_METHOD](mat)
-
-
-def posdef_eig_svd(mat):
-  """Computes the singular values and left singular vectors of a matrix."""
-  evals, evecs, _ = linalg_ops.svd(mat)
-
-  return evals, evecs
-
-
-def posdef_eig_self_adjoint(mat):
-  """Computes eigendecomposition using self_adjoint_eig."""
-  evals, evecs = linalg_ops.self_adjoint_eig(mat)
-  evals = math_ops.abs(evals)  # Should be equivalent to svd approach.
-
-  return evals, evecs
-
-
-posdef_eig_functions = {
-    "self_adjoint": posdef_eig_self_adjoint,
-    "svd": posdef_eig_svd,
-}
-
-
-def cholesky(tensor, damping):
-  """Computes the inverse of tensor + damping * identity."""
-  identity = linalg_ops.eye(tensor.shape.as_list()[0], dtype=tensor.dtype)
-  damping = math_ops.cast(damping, dtype=tensor.dtype)
-  return linalg_ops.cholesky(tensor + damping * identity)
-
-
-class SubGraph(object):
-  """Defines a subgraph given by all the dependencies of a given set of outputs.
-  """
-
-  def __init__(self, outputs):
-    # Set of all ancestor Tensors, Ops to 'outputs'.
-    self._members = set()
-
-    self._iter_add(outputs)
-
-  def _iter_add(self, root):
-    """Iteratively adds all of nodes' ancestors using depth first search."""
-    stack = [root]
-    while stack:
-      nodes = stack.pop()
-      for node in nodes:
-        if node in self._members:
-          continue
-        self._members.add(node)
-
-        if isinstance(node, ops.Tensor):
-          stack.append((node.op,))
-        elif isinstance(node, ops.Operation):
-          stack.append(node.inputs)
-
-  def is_member(self, node):
-    """Check if 'node' is in this subgraph."""
-    return node in self._members
-
-  def variable_uses(self, var):
-    """Computes number of times a variable is used.
-
-    Args:
-      var: Variable or ResourceVariable instance.
-
-    Returns:
-      Number of times a variable is used within this subgraph.
-
-    Raises:
-      ValueError: If 'var' is not a variable type.
-    """
-    if isinstance(var, resource_variable_ops.ResourceVariable):
-      var = var.handle
-    elif isinstance(var, variables.Variable):
-      var = var.value()
-    else:
-      raise ValueError("%s does not appear to be a variable." % str(var))
-
-    return len(self._members.intersection(set(var.consumers())))
-
-  def filter_list(self, node_list):
-    """Filters 'node_list' to nodes in this subgraph."""
-    filtered_list = []
-    for node in node_list:
-      if self.is_member(node):
-        filtered_list.append(node)
-    return filtered_list
-
-
-def generate_random_signs(shape, dtype=dtypes.float32):
-  """Generate a random tensor with {-1, +1} entries."""
-  ints = random_ops.random_uniform(shape, maxval=2, dtype=dtypes.int32)
-  return 2 * math_ops.cast(ints, dtype=dtype) - 1
-
-
-def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
-  """Compute forward-mode gradients."""
-  # See b/37888268.
-
-  # This version of forward-mode autodiff is based on code by Tim Cooijmans
-  # and handles list arguments and certain special cases such as when the
-  # ys doesn't depend on one or more of the xs, and when ops.IndexedSlices are
-  # generated by the first gradients_impl.gradients call.
-
-  us = [array_ops.zeros_like(y) + float("nan") for y in ys]
-  dydxs = gradients_impl.gradients(
-      ys, xs, grad_ys=us, stop_gradients=stop_gradients)
-
-  # Deal with strange types that gradients_impl.gradients returns but can't
-  # deal with.
-  dydxs = [
-      ops.convert_to_tensor(dydx)
-      if isinstance(dydx, ops.IndexedSlices) else dydx for dydx in dydxs
-  ]
-  dydxs = [
-      array_ops.zeros_like(x) if dydx is None else dydx
-      for x, dydx in zip(xs, dydxs)
-  ]
-
-  dysdx = gradients_impl.gradients(dydxs, us, grad_ys=grad_xs)
-
-  return dysdx
-
-
-def on_tpu():
-  """Returns True when building a TPU computation."""
-  return tpu_function.get_tpu_context().number_of_shards is not None
-
-
-def cross_replica_mean(tensor, name=None):
-  """Takes mean value of a Tensor across all TPU cores.
-
-  Args:
-    tensor: Tensor to be synchronized.
-    name: None or string. Name of Op.
-
-  Returns:
-    Average of Tensor across all TPU cores.
-
-  Raises:
-    ValueError: If called outside of TPU context.
-  """
-  with ops.name_scope(name, "cross_replica_mean", [tensor]):
-    num_shards = tpu_function.get_tpu_context().number_of_shards
-    if num_shards is None:
-      raise ValueError(
-          "Cannot take cross_replica_mean() outside of TPU Context.")
-    if num_shards == 1:
-      return tensor
-    return tpu_ops.cross_replica_sum(tensor / num_shards)
-
-
-def ensure_sequence(obj):
-  """If `obj` isn't a tuple or list, return a tuple containing `obj`."""
-  if isinstance(obj, (tuple, list)):
-    return obj
-  else:
-    return (obj,)
-
-
-def batch_execute(global_step, thunks, batch_size, name=None):
-  """Executes a subset of ops per global step.
-
-  Given a list of thunks, each of which produces a single stateful op,
-  ensures that exactly 'batch_size' ops are run per global step. Ops are
-  scheduled in a round-robin fashion. For example, with 3 ops
-
-    global_step | op0 | op1 | op2
-    ------------+-----+-----+-----
-        0       |  x  |  x  |
-    ------------+-----+-----+-----
-        1       |  x  |     |  x
-    ------------+-----+-----+-----
-        2       |     |  x  |  x
-    ------------+-----+-----+-----
-        3       |  x  |  x  |
-    ------------+-----+-----+-----
-        4       |  x  |     |  x
-
-  Does not guarantee order of op execution within a single global step.
-
-  Args:
-    global_step: Tensor indicating time. Determines which ops run.
-    thunks: List of thunks. Each thunk encapsulates one op. Return values are
-      ignored.
-    batch_size: int. Number of ops to execute per global_step.
-    name: string or None. Name scope for newly added ops.
-
-  Returns:
-    List of ops. Exactly 'batch_size' ops are guaranteed to have an effect
-    every global step.
-  """
-
-  def true_fn(thunk):
-    """Ensures thunk is executed and returns an Op (not a Tensor)."""
-
-    def result():
-      with ops.control_dependencies([thunk()]):
-        return control_flow_ops.no_op()
-
-    return result
-
-  def false_fn(_):
-    """Executes a no-op."""
-
-    def result():
-      return control_flow_ops.no_op()
-
-    return result
-
-  with ops.name_scope(name, "batch_execute"):
-    true_fns = [true_fn(thunk) for thunk in thunks]
-    false_fns = [false_fn(thunk) for thunk in thunks]
-    num_thunks = len(thunks)
-    conditions = [
-        math_ops.less(
-            math_ops.mod(batch_size - 1 + global_step * batch_size - j,
-                         num_thunks), batch_size) for j in range(num_thunks)
-    ]
-    result = [
-        control_flow_ops.cond(condition, true_fn, false_fn)
-        for (condition, true_fn,
-             false_fn) in zip(conditions, true_fns, false_fns)
-    ]
-    return result
-
-
-def extract_convolution_patches(inputs,
-                                filter_shape,
-                                padding,
-                                strides=None,
-                                dilation_rate=None,
-                                name=None,
-                                data_format=None):
-  """Extracts inputs to each output coordinate in tf.nn.convolution.
-
-  This is a generalization of tf.extract_image_patches() to tf.nn.convolution(),
-  where the number of spatial dimensions may be something other than 2.
-
-  Assumes,
-  - First dimension of inputs is batch_size
-  - Convolution filter is applied to all input channels.
-
-  Args:
-    inputs: Tensor of shape [batch_size, ..spatial_image_shape..,
-      ..spatial_filter_shape.., in_channels]. Inputs to tf.nn.convolution().
-    filter_shape: List of ints. Shape of filter passed to tf.nn.convolution().
-    padding: string. Padding method. One of "VALID", "SAME".
-    strides: None or list of ints. Strides along spatial dimensions.
-    dilation_rate: None or list of ints. Dilation along spatial dimensions.
-    name: None or str. Name of Op.
-    data_format: None or str. Format of data.
-
-  Returns:
-    Tensor of shape [batch_size, ..spatial_image_shape..,
-      ..spatial_filter_shape.., in_channels]
-
-  Raises:
-    ValueError: If data_format does not put channel last.
-    ValueError: If inputs and filter disagree on in_channels.
-  """
-  if not is_data_format_channel_last(data_format):
-    raise ValueError("Channel must be last dimension.")
-  with ops.name_scope(name, "extract_convolution_patches",
-                      [inputs, filter_shape, padding, strides, dilation_rate]):
-    batch_size = inputs.shape.as_list()[0]
-    in_channels = inputs.shape.as_list()[-1]
-
-    # filter_shape = spatial_filter_shape + [in_channels, out_channels]
-    spatial_filter_shape = filter_shape[:-2]
-    if in_channels != filter_shape[-2]:
-      raise ValueError("inputs and filter_shape must agree on in_channels.")
-
-    # Map each input feature to a location in the output.
-    out_channels = np.prod(spatial_filter_shape) * in_channels
-    filters = linalg_ops.eye(out_channels)
-    filters = array_ops.reshape(
-        filters,
-        list(spatial_filter_shape) + [in_channels, out_channels])
-
-    result = nn_ops.convolution(
-        inputs,
-        filters,
-        padding=padding,
-        strides=strides,
-        dilation_rate=dilation_rate)
-    spatial_output_shape = result.shape.as_list()[1:-1]
-    result = array_ops.reshape(result,
-                               [batch_size or -1] + spatial_output_shape +
-                               list(spatial_filter_shape) + [in_channels])
-
-    return result
-
-
-def extract_pointwise_conv2d_patches(inputs,
-                                     filter_shape,
-                                     name=None,
-                                     data_format=None):
-  """Extract patches for a 1x1 conv2d.
-
-  Args:
-    inputs: 4-D Tensor of shape [batch_size, height, width, in_channels].
-    filter_shape: List of 4 ints. Shape of filter to apply with conv2d()
-    name: None or str. Name for Op.
-    data_format: None or str. Format for data. See 'data_format' in
-      tf.nn.conv2d() for details.
-
-  Returns:
-    Tensor of shape [batch_size, ..spatial_input_shape..,
-    ..spatial_filter_shape.., in_channels]
-
-  Raises:
-    ValueError: if inputs is not 4-D.
-    ValueError: if filter_shape is not [1, 1, ?, ?]
-    ValueError: if data_format is not channels-last.
-  """
-  if inputs.shape.ndims != 4:
-    raise ValueError("inputs must have 4 dims.")
-  if len(filter_shape) != 4:
-    raise ValueError("filter_shape must have 4 dims.")
-  if filter_shape[0] != 1 or filter_shape[1] != 1:
-    raise ValueError("filter_shape must have shape 1 along spatial dimensions.")
-  if not is_data_format_channel_last(data_format):
-    raise ValueError("data_format must be channels last.")
-  with ops.name_scope(name, "extract_pointwise_conv2d_patches",
-                      [inputs, filter_shape]):
-    ksizes = [1, 1, 1, 1]  # Spatial shape is 1x1.
-    strides = [1, 1, 1, 1]  # Operate on all pixels.
-    rates = [1, 1, 1, 1]  # Dilation has no meaning with spatial shape = 1.
-    padding = "VALID"  # Doesn't matter.
-    result = array_ops.extract_image_patches(inputs, ksizes, strides, rates,
-                                             padding)
-
-    batch_size, input_height, input_width, in_channels = inputs.shape.as_list()
-    filter_height, filter_width, in_channels, _ = filter_shape
-    return array_ops.reshape(result, [
-        batch_size, input_height, input_width, filter_height, filter_width,
-        in_channels
-    ])
-
-
-def is_data_format_channel_last(data_format):
-  """True if data_format puts channel last."""
-  if data_format is None:
-    return True
-  return data_format.endswith("C")
-
-
-def matmul_sparse_dense(A, B, name=None, transpose_a=False, transpose_b=False):  # pylint: disable=invalid-name
-  """Computes matmul(A, B) where A is sparse, B is dense.
-
-  Args:
-    A: tf.IndexedSlices with dense shape [m, n].
-    B: tf.Tensor with shape [n, k].
-    name: str. Name of op.
-    transpose_a: Bool. If true we transpose A before multiplying it by B.
-      (Default: False)
-    transpose_b: Bool. If true we transpose B before multiplying it by A.
-      (Default: False)
-
-  Returns:
-    tf.IndexedSlices resulting from matmul(A, B).
-
-  Raises:
-    ValueError: If A doesn't represent a matrix.
-    ValueError: If B is not rank-2.
-  """
-  with ops.name_scope(name, "matmul_sparse_dense", [A, B]):
-    if A.indices.shape.ndims != 1 or A.values.shape.ndims != 2:
-      raise ValueError("A must represent a matrix. Found: %s." % A)
-    if B.shape.ndims != 2:
-      raise ValueError("B must be a matrix.")
-    new_values = math_ops.matmul(
-        A.values, B, transpose_a=transpose_a, transpose_b=transpose_b)
-    return ops.IndexedSlices(
-        new_values,
-        A.indices,
-        dense_shape=array_ops.stack([A.dense_shape[0], new_values.shape[1]]))
-
-
-def matmul_diag_sparse(A_diag, B, name=None):  # pylint: disable=invalid-name
-  """Computes matmul(A, B) where A is a diagonal matrix, B is sparse.
-
-  Args:
-    A_diag: diagonal entries of matrix A of shape [m, m].
-    B: tf.IndexedSlices. Represents matrix of shape [m, n].
-    name: str. Name of op.
-
-  Returns:
-    tf.IndexedSlices resulting from matmul(A, B).
-
-  Raises:
-    ValueError: If A_diag is not rank-1.
-    ValueError: If B doesn't represent a matrix.
-  """
-  with ops.name_scope(name, "matmul_diag_sparse", [A_diag, B]):
-    A_diag = ops.convert_to_tensor(A_diag)
-    if A_diag.shape.ndims != 1:
-      raise ValueError("A_diag must be a rank-1 Tensor.")
-    if B.indices.shape.ndims != 1 or B.values.shape.ndims != 2:
-      raise ValueError("B must represent a matrix. Found: %s." % B)
-    a = array_ops.gather(A_diag, B.indices)
-    a = array_ops.reshape(a, list(a.shape) + [1] * (B.values.shape.ndims - 1))
-    return ops.IndexedSlices(a * B.values, B.indices, dense_shape=B.dense_shape)
-
-
-class PartitionedTensor(object):
-  """A Tensor partitioned across its 0-th dimension."""
-
-  def __init__(self, tensors):
-    """Initializes PartitionedTensor.
-
-    Args:
-      tensors: List of Tensors. All Tensors must agree on shape (excepting
-        batch dimension) and dtype.
-
-    Raises:
-      ValueError: If 'tensors' has length zero.
-      ValueError: if contents of 'tensors' don't agree on shape or dtype.
-    """
-    if not tensors:
-      raise ValueError("tensors must be a list of 1+ Tensors.")
-
-    dtype = tensors[0].dtype
-    if not all(tensor.dtype == dtype for tensor in tensors):
-      raise ValueError("all tensors must have dtype = %s." % dtype)
-
-    shape = tensors[0].shape[1:]
-    if not all(tensor.shape[1:] == shape for tensor in tensors):
-      raise ValueError("All tensors must have shape = %s (excluding batch "
-                       "dimension)." % shape)
-
-    self.tensors = tensors
-    self._concats = {}  # {device: Tensor}
-
-  @property
-  def shape(self):
-    feature_shape = self.tensors[0].shape[1:]
-    batch_size = sum([tensor.shape[0] for tensor in self.tensors],
-                     tensor_shape.Dimension(0))
-    return tensor_shape.TensorShape([batch_size]).concatenate(feature_shape)
-
-  def get_shape(self):
-    return self.shape
-
-  @property
-  def dtype(self):
-    return self.tensors[0].dtype
-
-  def __str__(self):
-    return "PartitionedTensor([%s, ...], dtype=%s, shape=%s)" % (
-        self.tensors[0].name, self.dtype.name, tuple(self.shape.as_list()))
-
-  def __hash__(self):
-    return hash(tuple(self.tensors))
-
-  def __eq__(self, other):
-    if not isinstance(other, PartitionedTensor):
-      return False
-    return self.tensors == other.tensors
-
-  def __ne__(self, other):
-    return not self == other  # pylint: disable=g-comparison-negation
-
-  def __getitem__(self, key):
-    return self.as_tensor()[key]
-
-  def as_tensor(self, dtype=None, name=None, as_ref=False):
-    with ops.name_scope(name, "PartitionedTensor.as_tensor", self.tensors):
-      assert not as_ref
-      assert dtype in [None, self.dtype]
-      result = array_ops.concat(self.tensors, axis=0)
-
-      # Cache 'result' if we haven't already cached a value for this device.
-      if result.device not in self._concats:
-        self._concats[result.device] = result
-      return self._concats[result.device]
-
-  @property
-  def device(self):
-    # PartitionedTensors in general do not live on a single device.  If the
-    # device cannot be determined unambiguously this property will return None.
-    device = self.tensors[0].device
-    if all(tensor.device == device for tensor in self.tensors):
-      return device
-    return None
-
-
-ops.register_tensor_conversion_function(
-    PartitionedTensor,
-    lambda val, dtype, name, as_ref: val.as_tensor(dtype, name, as_ref))
-
-
-# TODO(b/69623235): Add a function for finding tensors that share gradients
-# to eliminate redundant fisher factor computations.
diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py
deleted file mode 100644
index 330d222dbf..0000000000
--- a/tensorflow/contrib/kfac/python/ops/utils_lib.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility functions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.utils import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
-
-_allowed_symbols = [
-    "set_global_constants",
-    "SequenceDict",
-    "tensors_to_column",
-    "column_to_tensors",
-    "kronecker_product",
-    "layer_params_to_mat2d",
-    "mat2d_to_layer_params",
-    "posdef_inv",
-    "posdef_inv_matrix_inverse",
-    "posdef_inv_cholesky",
-    "posdef_inv_funcs",
-    "SubGraph",
-    "generate_random_signs",
-    "fwd_gradients",
-    "ensure_sequence",
-    "batch_execute",
-    "extract_convolution_patches",
-    "extract_pointwise_conv2d_patches",
-    "is_data_format_channel_last",
-    "matmul_sparse_dense",
-    "matmul_diag_sparse",
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
index 39e9d65407..9a402d888c 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
@@ -270,7 +270,7 @@ class ReshapeTest(Base):
         array_ops.placeholder(dtypes.float32, [None]), ['x'])
     reshape_lt = ops.reshape(orig_lt, ['x'], ['y', ('z', 1)])
     self.assertEqual(reshape_lt.axes, core.Axes([('y', None), ('z', 1)]))
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       result = sess.run(reshape_lt, feed_dict={orig_lt.tensor: [1, 2]})
       np.testing.assert_array_equal(result, [[1], [2]])
 
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/test_util.py b/tensorflow/contrib/labeled_tensor/python/ops/test_util.py
index 8f0416030f..900c9217c3 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/test_util.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/test_util.py
@@ -27,7 +27,7 @@ class Base(test.TestCase):
   """A class with some useful methods for testing."""
 
   def eval(self, tensors):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
 
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 7355a403ae..b4fe8cac74 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -185,7 +185,7 @@ py_test(
 
 py_test(
     name = "normalization_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/layers/normalization_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_windows"],  # TODO: needs investigation on Windows
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index a7b41b714f..af8e673f59 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """Ops for building neural network layers, regularizers, summaries, etc.
 
-See the @{$python/contrib.layers} guide.
+See the
+[Contrib Layers](https://tensorflow.org/api_guides/python/contrib.layers)
+guide.
 
 @@avg_pool2d
 @@avg_pool3d
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 3ae07cedab..28d19a0445 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -997,9 +997,14 @@ class _OneHotColumn(
       # Remove (?, -1) index
       weighted_column = sparse_ops.sparse_slice(
           weighted_column,
-          [0, 0],
+          array_ops.zeros_like(weighted_column.dense_shape),
           weighted_column.dense_shape)
-      return sparse_ops.sparse_tensor_to_dense(weighted_column)
+      dense_tensor = sparse_ops.sparse_tensor_to_dense(weighted_column)
+      batch_shape = array_ops.shape(dense_tensor)[:-1]
+      dense_tensor_shape = array_ops.concat(
+          [batch_shape, [self.length]], axis=0)
+      dense_tensor = array_ops.reshape(dense_tensor, dense_tensor_shape)
+      return dense_tensor
 
     dense_id_tensor = sparse_ops.sparse_tensor_to_dense(sparse_id_column,
                                                         default_value=-1)
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index 1de9ab7056..eaaf9f8d5f 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -57,6 +57,29 @@ def _sparse_id_tensor(shape, vocab_size, seed=112123):
       indices=indices, values=values, dense_shape=shape)
 
 
+def _sparse_id_tensor_with_weights(shape, vocab_size, seed=112123):
+  # Returns a arbitrary `SparseTensor` with given shape and vocab size.
+  assert vocab_size >= shape[-1]
+  np.random.seed(seed)
+  indices = np.array(list(itertools.product(*[range(s) for s in shape])))
+
+  # Values must be distinct from the vocab
+  values = np.ndarray.flatten(np.array([
+      np.random.choice(vocab_size, size=shape[-1], replace=False)
+      for _ in range(np.prod(shape[:-1]))]))
+  weights = np.sort(np.random.rand(*shape), axis=len(shape)-1)
+
+  # Remove entries if weight < 0.5 for sparsity.
+  keep = np.ndarray.flatten(weights < 0.5)  # Remove half of them
+  indices = indices[keep]
+  values = values[keep]
+  weights = np.ndarray.flatten(weights)[keep]
+  return (sparse_tensor_lib.SparseTensor(
+      indices=indices, values=values, dense_shape=shape),
+          sparse_tensor_lib.SparseTensor(
+              indices=indices, values=weights, dense_shape=shape))
+
+
 class FeatureColumnTest(test.TestCase):
 
   def testImmutability(self):
@@ -329,6 +352,34 @@ class FeatureColumnTest(test.TestCase):
     self.assertEqual(one_hot.sparse_id_column.name, "ids_weighted_by_weights")
     self.assertEqual(one_hot.length, 3)
 
+  def testIntegerizedOneHotColumnForWeightedSparseColumn(self):
+    vocab_size = 5
+    ids = fc.sparse_column_with_integerized_feature("ids", vocab_size)
+    weighted_ids = fc.weighted_sparse_column(ids, "weights")
+    one_hot = fc.one_hot_column(weighted_ids)
+    self.assertEqual(one_hot.sparse_id_column.name, "ids_weighted_by_weights")
+    self.assertEqual(one_hot.length, vocab_size)
+
+  def testIntegerizedOneHotWeightedSparseColumnShape(self):
+    vocab_size = 5
+    for id_tensor_shape in [[4, 3], [2, 4], [3, 3, 3]]:
+      output_rank = len(id_tensor_shape)
+      a = fc.sparse_column_with_integerized_feature("a", vocab_size)
+      weighted = fc.weighted_sparse_column(a, "weights")
+      one_hot = fc.one_hot_column(weighted)
+      id_tensor, weight_tensor = _sparse_id_tensor_with_weights(
+          id_tensor_shape, vocab_size)
+
+      one_hot_output = one_hot._to_dnn_input_layer(
+          (id_tensor, weight_tensor),
+          output_rank=output_rank)
+      one_hot_output_shape = one_hot_output.get_shape().as_list()
+      expected_shape = id_tensor_shape[:-1] + [vocab_size]
+      self.assertEquals(expected_shape, one_hot_output_shape)
+      with self.test_session() as sess:
+        one_hot_value = sess.run(one_hot_output)
+        self.assertEquals(expected_shape, list(one_hot_value.shape))
+
   def testOneHotColumnWithSparseColumnWithHashKeys(self):
     input_values = ["marlo", "unknown", "omar"]
     inputs = constant_op.constant(input_values)
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index 1192198ec2..655f038b18 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -111,7 +111,7 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False,
   if not dtype.is_floating:
     raise TypeError('Cannot create initializer for non-floating point type.')
   if mode not in ['FAN_IN', 'FAN_OUT', 'FAN_AVG']:
-    raise TypeError('Unknow mode %s [FAN_IN, FAN_OUT, FAN_AVG]', mode)
+    raise TypeError('Unknown mode %s [FAN_IN, FAN_OUT, FAN_AVG]', mode)
 
   # pylint: disable=unused-argument
   def _initializer(shape, dtype=dtype, partition_info=None):
diff --git a/tensorflow/contrib/layers/python/layers/initializers_test.py b/tensorflow/contrib/layers/python/layers/initializers_test.py
index b7fe878893..bd3692b258 100644
--- a/tensorflow/contrib/layers/python/layers/initializers_test.py
+++ b/tensorflow/contrib/layers/python/layers/initializers_test.py
@@ -85,7 +85,7 @@ class VarianceScalingInitializerTest(test.TestCase):
 
   def _test_variance(self, initializer, shape, variance, factor, mode, uniform):
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         var = variable_scope.get_variable(
             name='test',
             shape=shape,
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 51c7abb105..eee90864b4 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1067,7 +1067,7 @@ class Convolution2dTransposeTests(test.TestCase):
         conv = layers_lib.conv2d(
             transpose, num_filters, filter_size, stride=stride, padding='VALID')
 
-        with self.test_session(graph=graph) as sess:
+        with self.session(graph=graph) as sess:
           sess.run(variables_lib.global_variables_initializer())
           self.assertListEqual(list(conv.eval().shape), input_size)
 
@@ -1460,14 +1460,14 @@ class DropoutTest(test.TestCase):
 class FlattenTest(test.TestCase):
 
   def testInvalidRank(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       inputs.set_shape(tensor_shape.TensorShape((5,)))
       with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'):
         _layers.flatten(inputs)
 
   def testUnknownLastDim(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       inputs.set_shape(tensor_shape.TensorShape((5, None)))
       output = _layers.flatten(inputs)
@@ -1629,7 +1629,7 @@ class FCTest(test.TestCase):
   def testCreateFC(self):
     height, width = 3, 3
     for layer_fn in (_layers.fully_connected, layers_lib.relu):
-      with ops.Graph().as_default() as g, self.test_session(g):
+      with ops.Graph().as_default() as g, self.session(g):
         inputs = np.random.uniform(size=(5, height * width * 3))
         output = layer_fn(inputs, 32)
         self.assertEqual(output.op.name, 'fully_connected/Relu')
@@ -1814,27 +1814,27 @@ class BatchNormTest(test.TestCase):
         a, center=False, data_format='NCHW', zero_debias_moving_mean=True)
 
   def testUnknownShape(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       with self.assertRaisesRegexp(ValueError, 'undefined rank'):
         _layers.batch_norm(inputs)
 
   def testInvalidDataFormat(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       with self.assertRaisesRegexp(
           ValueError, 'data_format has to be either NCHW or NHWC.'):
         _layers.batch_norm(inputs, data_format='CHWN')
 
   def testUnknownChannelsDimNHWC(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       inputs.set_shape(tensor_shape.TensorShape((5, 3, 3, None)))
       with self.assertRaisesRegexp(ValueError, 'undefined'):
         _layers.batch_norm(inputs, data_format='NHWC')
 
   def testUnknownChannelsDimNCHW(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       inputs.set_shape(tensor_shape.TensorShape((5, None, 3, 3)))
       with self.assertRaisesRegexp(ValueError, 'undefined'):
@@ -2810,13 +2810,13 @@ class BatchNormTest(test.TestCase):
 class LayerNormTest(test.TestCase):
 
   def testUnknownShape(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       with self.assertRaisesRegexp(ValueError, 'undefined rank'):
         _layers.layer_norm(inputs)
 
   def testParamsDimsNotFullyDefined(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       inputs.set_shape(tensor_shape.TensorShape((5, 3, 3, None)))
       with self.assertRaisesRegexp(ValueError, 'is not fully defined'):
@@ -2876,7 +2876,7 @@ class LayerNormTest(test.TestCase):
       for sigma in [1.0, 0.1]:
         input_values = np.random.randn(*input_shape) * sigma + mu
         with ops.Graph().as_default() as g:
-          with self.test_session(graph=g) as sess:
+          with self.session(graph=g) as sess:
             inputs = constant_op.constant(
                 input_values, shape=input_shape, dtype=dtype)
             output_t = _layers.layer_norm(
diff --git a/tensorflow/contrib/layers/python/layers/normalization.py b/tensorflow/contrib/layers/python/layers/normalization.py
index c807ab0f2e..11033a2e9c 100644
--- a/tensorflow/contrib/layers/python/layers/normalization.py
+++ b/tensorflow/contrib/layers/python/layers/normalization.py
@@ -176,7 +176,8 @@ def group_norm(inputs,
                variables_collections=None,
                outputs_collections=None,
                trainable=True,
-               scope=None):
+               scope=None,
+               mean_close_to_zero=False):
   """Functional interface for the group normalization layer.
 
   Reference: https://arxiv.org/abs/1803.08494.
@@ -222,6 +223,19 @@ def group_norm(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
     scope: Optional scope for `variable_scope`.
+    mean_close_to_zero: The mean of `input` before ReLU will be close to zero
+      when batch size >= 4k for Resnet-50 on TPU. If `True`, use
+      `nn.sufficient_statistics` and `nn.normalize_moments` to calculate the
+      variance. This is the same behavior as `fused` equals `True` in batch
+      normalization. If `False`, use `nn.moments` to calculate the variance.
+      When `mean` is close to zero, like 1e-4, use `mean` to calculate the
+      variance may have poor result due to repeated roundoff error and
+      denormalization in `mean`.  When `mean` is large, like 1e2,
+      sum(`input`^2) is so large that only the high-order digits of the elements
+      are being accumulated. Thus, use sum(`input` - `mean`)^2/n to calculate
+      the variance has better accuracy compared to (sum(`input`^2)/n - `mean`^2)
+      when `mean` is large.
+
 
   Returns:
     A `Tensor` representing the output of the operation.
@@ -333,7 +347,14 @@ def group_norm(inputs,
       gamma = array_ops.reshape(gamma, params_shape_broadcast)
 
     # Calculate the moments.
-    mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
+    if mean_close_to_zero:
+      # One pass algorithm returns better result when mean is close to zero.
+      counts, means_ss, variance_ss, _ = nn.sufficient_statistics(
+          inputs, moments_axes, keep_dims=True)
+      mean, variance = nn.normalize_moments(
+          counts, means_ss, variance_ss, shift=None)
+    else:
+      mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
 
     # Compute normalization.
     # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
diff --git a/tensorflow/contrib/layers/python/layers/normalization_test.py b/tensorflow/contrib/layers/python/layers/normalization_test.py
index b6e96350db..55272e5fd1 100644
--- a/tensorflow/contrib/layers/python/layers/normalization_test.py
+++ b/tensorflow/contrib/layers/python/layers/normalization_test.py
@@ -293,8 +293,13 @@ class GroupNormTest(test.TestCase):
       train_np, eval_np = sess.run([output_train, output_eval])
       self.assertAllClose(train_np, eval_np)
 
-  def doOutputTest(self, input_shape, channels_axis=None, reduction_axes=None,
-                   groups=2, tol=1e-2):
+  def doOutputTest(self,
+                   input_shape,
+                   channels_axis=None,
+                   reduction_axes=None,
+                   mean_close_to_zero=False,
+                   groups=2,
+                   tol=1e-2):
     # Select the axis for the channel and the dimensions along which statistics
     # are accumulated.
     if channels_axis < 0:
@@ -322,17 +327,28 @@ class GroupNormTest(test.TestCase):
       if i not in reduced_axes:
         reduced_shape.append(a)
 
-    for mu in (0.0, 1e2):
-      for sigma in (1.0, 0.1):
+    if mean_close_to_zero:
+      mu_tuple = (1e-4, 1e-2, 1.0)
+      sigma_tuple = (1e-2, 0.1, 1.0)
+    else:
+      mu_tuple = (1.0, 1e2)
+      sigma_tuple = (1.0, 0.1)
+
+    for mu in mu_tuple:
+      for sigma in sigma_tuple:
         # Determine shape of Tensor after normalization.
         expected_mean = np.zeros(reduced_shape)
         expected_var = np.ones(reduced_shape)
 
-        inputs = random_ops.random_uniform(input_shape, seed=0) * sigma + mu
+        inputs = random_ops.random_normal(input_shape, seed=0) * sigma + mu
         output_op = normalization.group_norm(
-            inputs, groups=groups, center=False, scale=False,
+            inputs,
+            groups=groups,
+            center=False,
+            scale=False,
             channels_axis=channels_axis,
-            reduction_axes=reduction_axes)
+            reduction_axes=reduction_axes,
+            mean_close_to_zero=mean_close_to_zero)
         with self.test_session() as sess:
           sess.run(variables.global_variables_initializer())
           outputs = sess.run(output_op)
@@ -347,12 +363,32 @@ class GroupNormTest(test.TestCase):
           self.assertAllClose(expected_mean, mean, rtol=tol, atol=tol)
           self.assertAllClose(expected_var, var, rtol=tol, atol=tol)
 
+  def doOutputTestForMeanCloseToZero(self,
+                                     input_shape,
+                                     channels_axis=None,
+                                     reduction_axes=None,
+                                     groups=2,
+                                     tol=5e-2):
+    self.doOutputTest(
+        input_shape,
+        channels_axis=channels_axis,
+        reduction_axes=reduction_axes,
+        groups=groups,
+        tol=tol,
+        mean_close_to_zero=True)
+
   def testOutputSmallInput4D_NHWC(self):
     input_shape = [10, 10, 10, 30]
     # Specify axes with positive values.
     self.doOutputTest(input_shape, channels_axis=3, reduction_axes=[1, 2])
     # Specify axes with negative values.
     self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+    # Specify axes with positive values.
+    self.doOutputTestForMeanCloseToZero(
+        input_shape, channels_axis=3, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTestForMeanCloseToZero(
+        input_shape, channels_axis=-1, reduction_axes=[-3, -2])
 
   def testOutputSmallInput3D_NHWC(self):
     input_shape = [10, 10, 30]
@@ -360,6 +396,12 @@ class GroupNormTest(test.TestCase):
     self.doOutputTest(input_shape, channels_axis=2, reduction_axes=[0, 1])
     # Specify axes with negative values.
     self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+    # Specify axes with positive values.
+    self.doOutputTestForMeanCloseToZero(
+        input_shape, channels_axis=2, reduction_axes=[0, 1])
+    # Specify axes with negative values.
+    self.doOutputTestForMeanCloseToZero(
+        input_shape, channels_axis=-1, reduction_axes=[-3, -2])
 
   def testOutputSmallInput4D_NCHW(self):
     input_shape = [10, 10, 10, 30]
@@ -367,6 +409,12 @@ class GroupNormTest(test.TestCase):
     self.doOutputTest(input_shape, channels_axis=1, reduction_axes=[2, 3])
     # Specify axes with negative values.
     self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+    # Specify axes with positive values.
+    self.doOutputTestForMeanCloseToZero(
+        input_shape, channels_axis=1, reduction_axes=[2, 3])
+    # Specify axes with negative values.
+    self.doOutputTestForMeanCloseToZero(
+        input_shape, channels_axis=-3, reduction_axes=[-2, -1])
 
   def testOutputSmallInput3D_NCHW(self):
     input_shape = [10, 10, 30]
@@ -374,23 +422,43 @@ class GroupNormTest(test.TestCase):
     self.doOutputTest(input_shape, channels_axis=0, reduction_axes=[1, 2])
     # Specify axes with negative values.
     self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+    # Specify axes with positive values.
+    self.doOutputTestForMeanCloseToZero(
+        input_shape, channels_axis=0, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTestForMeanCloseToZero(
+        input_shape, channels_axis=-3, reduction_axes=[-2, -1])
 
   def testOutputBigInput4D_NHWC(self):
-    self.doOutputTest([5, 100, 100, 1], channels_axis=3, reduction_axes=[1, 2],
-                      groups=1)
+    self.doOutputTest(
+        [5, 100, 100, 1], channels_axis=3, reduction_axes=[1, 2], groups=1)
+    self.doOutputTestForMeanCloseToZero(
+        [5, 100, 100, 1], channels_axis=3, reduction_axes=[1, 2], groups=1)
 
   def testOutputBigInput4D_NCHW(self):
-    self.doOutputTest([1, 100, 100, 4], channels_axis=1, reduction_axes=[2, 3],
-                      groups=4)
+    self.doOutputTest(
+        [1, 100, 100, 4], channels_axis=1, reduction_axes=[2, 3], groups=4)
+    self.doOutputTestForMeanCloseToZero(
+        [1, 100, 100, 4], channels_axis=1, reduction_axes=[2, 3], groups=4)
 
   def testOutputSmallInput2D_NC(self):
-    self.doOutputTest([10, 7*100], channels_axis=1, reduction_axes=[], groups=7)
+    self.doOutputTest(
+        [10, 7 * 100], channels_axis=1, reduction_axes=[], groups=7)
+    self.doOutputTestForMeanCloseToZero(
+        [10, 7 * 100], channels_axis=1, reduction_axes=[], groups=7)
 
   def testOutputSmallInput5D_NCXXX(self):
-    self.doOutputTest([10, 10, 20, 40, 5],
-                      channels_axis=1,
-                      reduction_axes=[2, 3, 4],
-                      groups=5)
+    self.doOutputTest(
+        [10, 10, 20, 40, 5],
+        channels_axis=1,
+        reduction_axes=[2, 3, 4],
+        groups=5)
+    self.doOutputTestForMeanCloseToZero(
+        [10, 10, 20, 40, 5],
+        channels_axis=1,
+        reduction_axes=[2, 3, 4],
+        groups=5)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/layers/python/layers/optimizers_test.py b/tensorflow/contrib/layers/python/layers/optimizers_test.py
index a4461a20e5..0f037e24ad 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers_test.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers_test.py
@@ -66,7 +66,7 @@ class OptimizersTest(test.TestCase):
     ]
     for optimizer in optimizers:
       with ops.Graph().as_default() as g:
-        with self.test_session(graph=g) as session:
+        with self.session(graph=g) as session:
           x, var, loss, global_step = _setup_model()
           train = optimizers_lib.optimize_loss(
               loss, global_step, learning_rate=0.1, optimizer=optimizer)
@@ -82,7 +82,7 @@ class OptimizersTest(test.TestCase):
       return gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
 
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         x, var, loss, global_step = _setup_model()
         train = optimizers_lib.optimize_loss(
             loss, global_step, learning_rate=None, optimizer=optimizer_fn)
@@ -96,14 +96,14 @@ class OptimizersTest(test.TestCase):
     optimizers = ["blah", variables.Variable, object(), lambda x: None]
     for optimizer in optimizers:
       with ops.Graph().as_default() as g:
-        with self.test_session(graph=g):
+        with self.session(graph=g):
           _, _, loss, global_step = _setup_model()
           with self.assertRaises(ValueError):
             optimizers_lib.optimize_loss(
                 loss, global_step, learning_rate=0.1, optimizer=optimizer)
 
   def testBadSummaries(self):
-    with ops.Graph().as_default() as g, self.test_session(graph=g):
+    with ops.Graph().as_default() as g, self.session(graph=g):
       _, _, loss, global_step = _setup_model()
       with self.assertRaises(ValueError):
         optimizers_lib.optimize_loss(
@@ -111,7 +111,7 @@ class OptimizersTest(test.TestCase):
             summaries=["loss", "bad_summary"])
 
   def testInvalidLoss(self):
-    with ops.Graph().as_default() as g, self.test_session(graph=g):
+    with ops.Graph().as_default() as g, self.session(graph=g):
       _, _, _, global_step = _setup_model()
       with self.assertRaises(ValueError):
         optimizers_lib.optimize_loss(
@@ -121,7 +121,7 @@ class OptimizersTest(test.TestCase):
             [[1.0]], global_step, learning_rate=0.1, optimizer="SGD")
 
   def testInvalidGlobalStep(self):
-    with ops.Graph().as_default() as g, self.test_session(graph=g):
+    with ops.Graph().as_default() as g, self.session(graph=g):
       x = array_ops.placeholder(dtypes.float32, [])
       var = variable_scope.get_variable(
           "test", [], initializer=init_ops.constant_initializer(10))
@@ -157,7 +157,7 @@ class OptimizersTest(test.TestCase):
             optimizer="SGD")
 
   def testInvalidLearningRate(self):
-    with ops.Graph().as_default() as g, self.test_session(graph=g):
+    with ops.Graph().as_default() as g, self.session(graph=g):
       _, _, loss, global_step = _setup_model()
       with self.assertRaises(ValueError):
         optimizers_lib.optimize_loss(
@@ -270,7 +270,7 @@ class OptimizersTest(test.TestCase):
         gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
     ]
     for optimizer in optimizers:
-      with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      with ops.Graph().as_default() as g, self.session(graph=g) as session:
         x = array_ops.placeholder(dtypes.float32, [])
         var = variable_scope.get_variable(
             "test", [], initializer=init_ops.constant_initializer(10))
@@ -295,7 +295,7 @@ class OptimizersTest(test.TestCase):
         gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
     ]
     for optimizer in optimizers:
-      with ops.Graph().as_default() as g, self.test_session(graph=g):
+      with ops.Graph().as_default() as g, self.session(graph=g):
         x = array_ops.placeholder(dtypes.float32, [])
         var = variable_scope.get_variable(
             "test", [], initializer=init_ops.constant_initializer(10))
@@ -319,7 +319,7 @@ class OptimizersTest(test.TestCase):
         gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
     ]
     for optimizer in optimizers:
-      with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      with ops.Graph().as_default() as g, self.session(graph=g) as session:
         x, var, loss, global_step = _setup_model()
         update_var = variable_scope.get_variable(
             "update", [], initializer=init_ops.constant_initializer(10))
@@ -342,7 +342,7 @@ class OptimizersTest(test.TestCase):
         gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
     ]
     for optimizer in optimizers:
-      with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      with ops.Graph().as_default() as g, self.session(graph=g) as session:
         x, var, loss, global_step = _setup_model()
         update_var = variable_scope.get_variable(
             "update", [], initializer=init_ops.constant_initializer(10))
@@ -365,7 +365,7 @@ class OptimizersTest(test.TestCase):
         gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
     ]
     for optimizer in optimizers:
-      with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      with ops.Graph().as_default() as g, self.session(graph=g) as session:
         x, var, loss, global_step = _setup_model()
         update_var = variable_scope.get_variable(
             "update", [], initializer=init_ops.constant_initializer(10))
@@ -389,7 +389,7 @@ class OptimizersTest(test.TestCase):
         gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
     ]
     for optimizer in optimizers:
-      with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      with ops.Graph().as_default() as g, self.session(graph=g) as session:
         x, var, loss, global_step = _setup_model()
         update_var = variable_scope.get_variable(
             "update", [], initializer=init_ops.constant_initializer(10))
@@ -413,7 +413,7 @@ class OptimizersTest(test.TestCase):
         gradient_descent.GradientDescentOptimizer(learning_rate=0.1)
     ]
     for optimizer in optimizers:
-      with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      with ops.Graph().as_default() as g, self.session(graph=g) as session:
         x, var, loss, global_step = _setup_model()
         update_var = variable_scope.get_variable(
             "update", [], initializer=init_ops.constant_initializer(10))
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index dad3da3748..b25f11b5a6 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -151,9 +151,19 @@ def _rev_block_forward(x1,
   return y1, y2
 
 
+def _safe_wraps(fn):
+  if isinstance(fn, functools.partial):
+    # functools.partial objects cannot be wrapped as they are missing the
+    # necessary properties (__name__, __module__, __doc__).
+    def passthrough(f):
+      return f
+    return passthrough
+  return functools.wraps(fn)
+
+
 def _scope_wrap(fn, scope):
 
-  @functools.wraps(fn)
+  @_safe_wraps(fn)
   def wrap(*args, **kwargs):
     with variable_scope.variable_scope(scope, use_resource=True):
       return fn(*args, **kwargs)
@@ -430,7 +440,7 @@ def rev_block(x1,
 def enable_with_args(dec):
   """A decorator for decorators to enable their usage with or without args."""
 
-  @functools.wraps(dec)
+  @_safe_wraps(dec)
   def new_dec(*args, **kwargs):
     if len(args) == 1 and not kwargs and callable(args[0]):
       # Used as decorator without args
@@ -477,7 +487,7 @@ def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False):
     tf.gradients).
   """
 
-  @functools.wraps(fn)
+  @_safe_wraps(fn)
   def wrapped(*args):
     return _recompute_grad(
         fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads)
diff --git a/tensorflow/contrib/layers/python/layers/utils_test.py b/tensorflow/contrib/layers/python/layers/utils_test.py
index 645dc1291e..a9bd89532a 100644
--- a/tensorflow/contrib/layers/python/layers/utils_test.py
+++ b/tensorflow/contrib/layers/python/layers/utils_test.py
@@ -47,7 +47,7 @@ class ConstantValueTest(test.TestCase):
 
   def test_variable(self):
     for v in [True, False, 1, 0, 1.0]:
-      with ops.Graph().as_default() as g, self.test_session(g) as sess:
+      with ops.Graph().as_default() as g, self.session(g) as sess:
         x = variables.Variable(v)
         value = utils.constant_value(x)
         self.assertEqual(value, None)
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index d3aa3fa92c..418b0cf392 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -108,7 +108,6 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/data_feeder_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
@@ -164,7 +163,6 @@ tf_py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
@@ -591,7 +589,6 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/io_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/learn/python/learn/datasets",
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index 79bd73faaf..28a6f5aed9 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -19,7 +19,8 @@ This module and all its submodules are deprecated. See
 [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
 for migration instructions.
 
-See the @{$python/contrib.learn} guide.
+See the [Contrib Learn](https://tensorflow.org/api_guides/python/contrib.learn)
+guide.
 
 @@BaseEstimator
 @@Estimator
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
index c9a11f27f1..1d8a59281a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
@@ -155,7 +155,7 @@ class DynamicRnnEstimatorTest(test.TestCase):
     sequence_input = dynamic_rnn_estimator.build_sequence_input(
         self.GetColumnsToTensors(), self.sequence_feature_columns,
         self.context_feature_columns)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
       sequence_input_val = sess.run(sequence_input)
@@ -330,7 +330,7 @@ class DynamicRnnEstimatorTest(test.TestCase):
     actual_state = dynamic_rnn_estimator.dict_to_state_tuple(state_dict, cell)
     flattened_state = dynamic_rnn_estimator.state_tuple_to_dict(actual_state)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       (state_dict_val, actual_state_val, flattened_state_val) = sess.run(
           [state_dict, actual_state, flattened_state])
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/rnn_common_test.py b/tensorflow/contrib/learn/python/learn/estimators/rnn_common_test.py
index 82563141cc..ebf5f5617d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/rnn_common_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/rnn_common_test.py
@@ -44,7 +44,7 @@ class RnnCommonTest(test.TestCase):
          constant_op.constant(labels, dtype=dtypes.int32),
          constant_op.constant(sequence_length, dtype=dtypes.int32))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       activations_masked, labels_masked = sess.run(
           [activations_masked_t, labels_masked_t])
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/stability_test.py b/tensorflow/contrib/learn/python/learn/estimators/stability_test.py
index 6d04543819..81376c0e2a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/stability_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/stability_test.py
@@ -68,12 +68,12 @@ class StabilityTest(test.TestCase):
     minval = -0.3333
     maxval = 0.3333
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         g.seed = my_seed
         x = random_ops.random_uniform([10, 10], minval=minval, maxval=maxval)
         val1 = session.run(x)
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         g.seed = my_seed
         x = random_ops.random_uniform([10, 10], minval=minval, maxval=maxval)
         val2 = session.run(x)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
index 442247409d..06c61554fa 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
@@ -53,7 +53,7 @@ class PrepareInputsForRnnTest(test.TestCase):
                                                     sequence_feature_columns,
                                                     num_unroll)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
       features_val = sess.run(features_by_time)
@@ -314,7 +314,7 @@ class StateSavingRnnEstimatorTest(test.TestCase):
         else:
           self.assertAllEqual(v, got[k])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
       actual_sequence, actual_context = sess.run(
diff --git a/tensorflow/contrib/learn/python/learn/graph_actions_test.py b/tensorflow/contrib/learn/python/learn/graph_actions_test.py
index df156da3f4..d5c02124ac 100644
--- a/tensorflow/contrib/learn/python/learn/graph_actions_test.py
+++ b/tensorflow/contrib/learn/python/learn/graph_actions_test.py
@@ -175,7 +175,7 @@ class GraphActionsTest(test.TestCase):
     return in0, in1, out
 
   def test_infer(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._assert_ckpt(self._output_dir, False)
       in0, in1, out = self._build_inference_graph()
       self.assertEqual({
@@ -193,7 +193,7 @@ class GraphActionsTest(test.TestCase):
       side_effect=learn.graph_actions.coordinator.Coordinator.request_stop,
       autospec=True)
   def test_coordinator_request_stop_called(self, request_stop):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       in0, in1, out = self._build_inference_graph()
       learn.graph_actions.infer(None, {'a': in0, 'b': in1, 'c': out})
       self.assertTrue(request_stop.called)
@@ -204,7 +204,7 @@ class GraphActionsTest(test.TestCase):
       side_effect=learn.graph_actions.coordinator.Coordinator.request_stop,
       autospec=True)
   def test_run_feeds_iter_cleanup_with_exceptions(self, request_stop):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       in0, in1, out = self._build_inference_graph()
       try:
         for _ in learn.graph_actions.run_feeds_iter({
@@ -249,7 +249,7 @@ class GraphActionsTest(test.TestCase):
       self._assert_ckpt(self._output_dir, False)
 
   def test_infer_invalid_feed(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._assert_ckpt(self._output_dir, False)
       in0, _, _ = self._build_inference_graph()
       with self.assertRaisesRegexp(TypeError, 'Can not convert a NoneType'):
@@ -257,7 +257,7 @@ class GraphActionsTest(test.TestCase):
       self._assert_ckpt(self._output_dir, False)
 
   def test_infer_feed(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._assert_ckpt(self._output_dir, False)
       in0, _, out = self._build_inference_graph()
       self.assertEqual(
@@ -271,7 +271,7 @@ class GraphActionsTest(test.TestCase):
   # TODO(ptucker): Test eval for 1 epoch.
 
   def test_evaluate_invalid_args(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._assert_ckpt(self._output_dir, False)
       with self.assertRaisesRegexp(ValueError, 'utput directory'):
         learn.graph_actions.evaluate(
@@ -288,7 +288,7 @@ class GraphActionsTest(test.TestCase):
       self._assert_ckpt(self._output_dir, False)
 
   def test_evaluate(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       _, _, out = self._build_inference_graph()
       writer = learn.graph_actions.get_summary_writer(self._output_dir)
       self._assert_summaries(self._output_dir, writer, expected_session_logs=[])
@@ -310,7 +310,7 @@ class GraphActionsTest(test.TestCase):
       self._assert_ckpt(self._output_dir, False)
 
   def test_evaluate_ready_for_local_init(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       variables_lib.create_global_step()
       v = variables.Variable(1.0)
       variables.Variable(
@@ -327,7 +327,7 @@ class GraphActionsTest(test.TestCase):
           max_steps=1)
 
   def test_evaluate_feed_fn(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       in0, _, out = self._build_inference_graph()
       writer = learn.graph_actions.get_summary_writer(self._output_dir)
       self._assert_summaries(self._output_dir, writer, expected_session_logs=[])
@@ -352,7 +352,7 @@ class GraphActionsTest(test.TestCase):
       self._assert_ckpt(self._output_dir, False)
 
   def test_evaluate_feed_fn_with_exhaustion(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       in0, _, out = self._build_inference_graph()
       writer = learn.graph_actions.get_summary_writer(self._output_dir)
       self._assert_summaries(self._output_dir, writer, expected_session_logs=[])
@@ -375,7 +375,7 @@ class GraphActionsTest(test.TestCase):
           expected_session_logs=[])
 
   def test_evaluate_with_saver(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       _, _, out = self._build_inference_graph()
       ops.add_to_collection(ops.GraphKeys.SAVERS, saver_lib.Saver())
       writer = learn.graph_actions.get_summary_writer(self._output_dir)
@@ -469,7 +469,7 @@ class GraphActionsTrainTest(test.TestCase):
     return in0, in1, out
 
   def test_train_invalid_args(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       train_op = constant_op.constant(1.0)
       loss_op = constant_op.constant(2.0)
       with self.assertRaisesRegexp(ValueError, 'utput directory'):
@@ -503,7 +503,7 @@ class GraphActionsTrainTest(test.TestCase):
   # TODO(ptucker): Mock supervisor, and assert all interactions.
 
   def test_train(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with ops.control_dependencies(self._build_inference_graph()):
         train_op = state_ops.assign_add(variables_lib.get_global_step(), 1)
       self._assert_summaries(self._output_dir)
@@ -522,7 +522,7 @@ class GraphActionsTrainTest(test.TestCase):
       self._assert_ckpt(self._output_dir, True)
 
   def test_train_steps_is_incremental(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with ops.control_dependencies(self._build_inference_graph()):
         train_op = state_ops.assign_add(variables_lib.get_global_step(), 1)
       learn.graph_actions.train(
@@ -535,7 +535,7 @@ class GraphActionsTrainTest(test.TestCase):
           self._output_dir, variables_lib.get_global_step().name)
       self.assertEqual(10, step)
 
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with ops.control_dependencies(self._build_inference_graph()):
         train_op = state_ops.assign_add(variables_lib.get_global_step(), 1)
       learn.graph_actions.train(
@@ -549,7 +549,7 @@ class GraphActionsTrainTest(test.TestCase):
       self.assertEqual(25, step)
 
   def test_train_max_steps_is_not_incremental(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with ops.control_dependencies(self._build_inference_graph()):
         train_op = state_ops.assign_add(variables_lib.get_global_step(), 1)
       learn.graph_actions.train(
@@ -562,7 +562,7 @@ class GraphActionsTrainTest(test.TestCase):
           self._output_dir, variables_lib.get_global_step().name)
       self.assertEqual(10, step)
 
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with ops.control_dependencies(self._build_inference_graph()):
         train_op = state_ops.assign_add(variables_lib.get_global_step(), 1)
       learn.graph_actions.train(
@@ -576,7 +576,7 @@ class GraphActionsTrainTest(test.TestCase):
       self.assertEqual(15, step)
 
   def test_train_loss(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       variables_lib.create_global_step()
       loss_var = variables_lib.local_variable(10.0)
       train_op = control_flow_ops.group(
@@ -598,7 +598,7 @@ class GraphActionsTrainTest(test.TestCase):
       self._assert_ckpt(self._output_dir, True)
 
   def test_train_summaries(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with ops.control_dependencies(self._build_inference_graph()):
         train_op = state_ops.assign_add(variables_lib.get_global_step(), 1)
       loss_op = constant_op.constant(2.0)
@@ -624,7 +624,7 @@ class GraphActionsTrainTest(test.TestCase):
       self._assert_ckpt(self._output_dir, True)
 
   def test_train_chief_monitor(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with ops.control_dependencies(self._build_inference_graph()):
         train_op = state_ops.assign_add(variables_lib.get_global_step(), 1)
       loss_op = constant_op.constant(2.0)
@@ -663,7 +663,7 @@ class GraphActionsTrainTest(test.TestCase):
       # and the other chief exclusive.
       chief_exclusive_monitor = _BaseMonitorWrapper(False)
       all_workers_monitor = _BaseMonitorWrapper(True)
-      with self.test_session(g):
+      with self.session(g):
         loss = learn.graph_actions.train(
             g,
             output_dir=self._output_dir,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
index 1f439965da..5e07b9313f 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py
@@ -58,7 +58,7 @@ class DataFeederTest(test.TestCase):
         self.assertEqual(expected_np_dtype, v)
     else:
       self.assertEqual(expected_np_dtype, feeder.input_dtype)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       inp, _ = feeder.input_builder()
       if isinstance(inp, dict):
         for v in list(inp.values()):
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index e11e8b698a..8e68a17e47 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -207,7 +207,7 @@ class GraphIOTest(test.TestCase):
             parsing_ops.FixedLenFeature(shape=shape, dtype=dtypes_lib.float32)
     }
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
+    with ops.Graph().as_default() as g, self.session(graph=g) as sess:
       features = graph_io.read_batch_record_features(
           _VALID_FILE_PATTERN,
           batch_size,
@@ -242,7 +242,7 @@ class GraphIOTest(test.TestCase):
     queue_capacity = 1234
     name = "my_batch"
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
+    with ops.Graph().as_default() as g, self.session(graph=g) as sess:
       inputs = graph_io.read_batch_examples(
           _VALID_FILE_PATTERN,
           batch_size,
@@ -276,7 +276,7 @@ class GraphIOTest(test.TestCase):
     queue_capacity = 1234
     name = "my_batch"
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
+    with ops.Graph().as_default() as g, self.session(graph=g) as sess:
       inputs = graph_io.read_batch_examples(
           [_VALID_FILE_PATTERN, _VALID_FILE_PATTERN_2],
           batch_size,
@@ -325,7 +325,7 @@ class GraphIOTest(test.TestCase):
     queue_capacity = 5
     name = "my_batch"
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       inputs = graph_io.read_batch_examples(
           filename,
           batch_size,
@@ -374,7 +374,7 @@ class GraphIOTest(test.TestCase):
 
     features = {"sequence": parsing_ops.FixedLenFeature([], dtypes_lib.string)}
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       keys, result = graph_io.read_keyed_batch_features(
           filename,
           batch_size,
@@ -429,7 +429,7 @@ class GraphIOTest(test.TestCase):
 
     features = {"sequence": parsing_ops.FixedLenFeature([], dtypes_lib.string)}
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       result = graph_io.read_batch_features(
           filename,
           batch_size,
@@ -475,7 +475,7 @@ class GraphIOTest(test.TestCase):
     queue_capacity = 5
     name = "my_batch"
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       inputs = graph_io.read_batch_examples(
           filenames,
           batch_size,
@@ -519,7 +519,7 @@ class GraphIOTest(test.TestCase):
     queue_capacity = 5
     name = "my_batch"
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       keys, inputs = graph_io.read_keyed_batch_examples_shared_queue(
           filenames,
           batch_size,
@@ -640,7 +640,7 @@ class GraphIOTest(test.TestCase):
     queue_capacity = 10
     name = "my_batch"
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       inputs = graph_io.read_batch_examples(
           [filename],
           batch_size,
@@ -672,7 +672,7 @@ class GraphIOTest(test.TestCase):
     queue_capacity = 5
     name = "my_batch"
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       keys, inputs = graph_io.read_keyed_batch_examples(
           filename,
           batch_size,
@@ -714,7 +714,7 @@ class GraphIOTest(test.TestCase):
     queue_capacity = 5
     name = "my_batch"
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       dtypes = {"age": parsing_ops.FixedLenFeature([1], dtypes_lib.int64)}
       parse_fn = lambda example: parsing_ops.parse_single_example(  # pylint: disable=g-long-lambda
           parsing_ops.decode_json_example(example), dtypes)
@@ -773,7 +773,7 @@ class GraphIOTest(test.TestCase):
       examples = parsing_ops.parse_example(serialized, features)
       return math_ops.less(examples["age"], 2)
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       keys, inputs = graph_io._read_keyed_batch_examples_helper(
           filename,
           batch_size,
@@ -812,7 +812,7 @@ class GraphIOTest(test.TestCase):
       coord.join(threads)
 
   def test_queue_parsed_features_single_tensor(self):
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       features = {"test": constant_op.constant([1, 2, 3])}
       _, queued_features = graph_io.queue_parsed_features(features)
       coord = coordinator.Coordinator()
@@ -833,7 +833,7 @@ class GraphIOTest(test.TestCase):
     _, queued_feature = graph_io.read_keyed_batch_features_shared_queue(
         _VALID_FILE_PATTERN, batch_size, feature, reader)
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+    with ops.Graph().as_default() as g, self.session(graph=g) as session:
       features_result = graph_io.read_batch_features(
           _VALID_FILE_PATTERN, batch_size, feature, reader)
       session.run(variables.local_variables_initializer())
diff --git a/tensorflow/contrib/learn/python/learn/monitors_test.py b/tensorflow/contrib/learn/python/learn/monitors_test.py
index ff1da32c21..83e48a36e7 100644
--- a/tensorflow/contrib/learn/python/learn/monitors_test.py
+++ b/tensorflow/contrib/learn/python/learn/monitors_test.py
@@ -127,12 +127,12 @@ class MonitorsTest(test.TestCase):
     monitor.end()
 
   def test_base_monitor(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._run_monitor(learn.monitors.BaseMonitor())
 
   def test_every_0(self):
     monitor = _MyEveryN(every_n_steps=0, first_n_steps=-1)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._run_monitor(monitor, num_epochs=3, num_steps_per_epoch=10)
       expected_steps = list(range(30))
       self.assertAllEqual(expected_steps, monitor.steps_begun)
@@ -141,7 +141,7 @@ class MonitorsTest(test.TestCase):
 
   def test_every_1(self):
     monitor = _MyEveryN(every_n_steps=1, first_n_steps=-1)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._run_monitor(monitor, num_epochs=3, num_steps_per_epoch=10)
       expected_steps = list(range(1, 30))
       self.assertEqual(expected_steps, monitor.steps_begun)
@@ -150,7 +150,7 @@ class MonitorsTest(test.TestCase):
 
   def test_every_2(self):
     monitor = _MyEveryN(every_n_steps=2, first_n_steps=-1)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._run_monitor(monitor, num_epochs=3, num_steps_per_epoch=10)
       expected_steps = list(range(2, 29, 2)) + [29]
       self.assertEqual(expected_steps, monitor.steps_begun)
@@ -159,7 +159,7 @@ class MonitorsTest(test.TestCase):
 
   def test_every_8(self):
     monitor = _MyEveryN(every_n_steps=8, first_n_steps=2)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._run_monitor(monitor, num_epochs=3, num_steps_per_epoch=10)
       expected_steps = [0, 1, 2, 10, 18, 26, 29]
       self.assertEqual(expected_steps, monitor.steps_begun)
@@ -168,7 +168,7 @@ class MonitorsTest(test.TestCase):
 
   def test_every_8_no_max_steps(self):
     monitor = _MyEveryN(every_n_steps=8, first_n_steps=2)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._run_monitor(
           monitor, num_epochs=3, num_steps_per_epoch=10, pass_max_steps=False)
       begin_end_steps = [0, 1, 2, 10, 18, 26]
@@ -179,7 +179,7 @@ class MonitorsTest(test.TestCase):
 
   def test_every_8_recovered_after_step_begin(self):
     monitor = _MyEveryN(every_n_steps=8)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       for step in [8, 16]:
         monitor.step_begin(step)
         monitor.step_begin(step)
@@ -192,7 +192,7 @@ class MonitorsTest(test.TestCase):
 
   def test_every_8_recovered_after_step_end(self):
     monitor = _MyEveryN(every_n_steps=8)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       for step in [8, 16]:
         monitor.step_begin(step)
         monitor.step_end(step, output=None)
@@ -207,7 +207,7 @@ class MonitorsTest(test.TestCase):
 
   def test_every_8_call_post_step_at_the_end(self):
     monitor = _MyEveryN(every_n_steps=8)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       monitor.begin()
       for step in [8, 16]:
         monitor.step_begin(step)
@@ -224,7 +224,7 @@ class MonitorsTest(test.TestCase):
 
   def test_every_8_call_post_step_should_not_be_called_twice(self):
     monitor = _MyEveryN(every_n_steps=8)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       monitor.begin()
       for step in [8, 16]:
         monitor.step_begin(step)
@@ -240,13 +240,13 @@ class MonitorsTest(test.TestCase):
       self.assertEqual([8, 16], monitor.post_steps)
 
   def test_print(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       t = constant_op.constant(42.0, name='foo')
       self._run_monitor(learn.monitors.PrintTensor(tensor_names=[t.name]))
       self.assertRegexpMatches(str(self.logged_message), t.name)
 
   def test_logging_trainable(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       var = variables.Variable(constant_op.constant(42.0), name='foo')
       var.initializer.run()
       cof = constant_op.constant(1.0)
@@ -258,7 +258,7 @@ class MonitorsTest(test.TestCase):
       self.assertRegexpMatches(str(self.logged_message), var.name)
 
   def test_summary_saver(self):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       log_dir = 'log/dir'
       summary_writer = testing.FakeSummaryWriter(log_dir, g)
       var = variables.Variable(0.0)
@@ -312,7 +312,7 @@ class MonitorsTest(test.TestCase):
     monitor = learn.monitors.ValidationMonitor(
         x=constant_op.constant(2.0), every_n_steps=0)
     self._assert_validation_monitor(monitor)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with self.assertRaisesRegexp(ValueError, 'set_estimator'):
         self._run_monitor(monitor)
 
@@ -330,7 +330,7 @@ class MonitorsTest(test.TestCase):
         x=constant_op.constant(2.0), every_n_steps=0)
     self._assert_validation_monitor(monitor)
     monitor.set_estimator(estimator)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._run_monitor(monitor)
       self._assert_validation_monitor(monitor)
       mock_latest_checkpoint.assert_called_with(model_dir)
@@ -351,7 +351,7 @@ class MonitorsTest(test.TestCase):
         x=constant_op.constant(2.0), every_n_steps=0)
     self._assert_validation_monitor(monitor)
     monitor.set_estimator(estimator)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       self._run_monitor(monitor)
       self._assert_validation_monitor(monitor)
 
@@ -370,7 +370,7 @@ class MonitorsTest(test.TestCase):
         x=constant_op.constant(2.0), every_n_steps=0, early_stopping_rounds=1)
     self._assert_validation_monitor(monitor)
     monitor.set_estimator(estimator)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       with self.assertRaisesRegexp(ValueError, 'missing from outputs'):
         self._run_monitor(monitor, num_epochs=1, num_steps_per_epoch=1)
 
@@ -392,7 +392,7 @@ class MonitorsTest(test.TestCase):
 
     self._assert_validation_monitor(monitor)
     monitor.set_estimator(estimator)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       monitor.begin(max_steps=100)
       monitor.epoch_begin(epoch=0)
       self.assertEqual(0, estimator.evaluate.call_count)
@@ -477,7 +477,7 @@ class MonitorsTest(test.TestCase):
         every_n_steps=0, early_stopping_rounds=2)
     self._assert_validation_monitor(monitor)
     monitor.set_estimator(estimator)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       monitor.begin(max_steps=100)
       monitor.epoch_begin(epoch=0)
       self.assertEqual(0, estimator.evaluate.call_count)
@@ -509,7 +509,7 @@ class MonitorsTest(test.TestCase):
         metrics=constant_op.constant(2.0),
         every_n_steps=0, early_stopping_rounds=2)
     monitor.set_estimator(estimator)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       monitor.begin(max_steps=100)
       monitor.epoch_begin(epoch=0)
 
@@ -525,7 +525,7 @@ class MonitorsTest(test.TestCase):
   def test_graph_dump(self):
     monitor0 = learn.monitors.GraphDump()
     monitor1 = learn.monitors.GraphDump()
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       const_var = variables.Variable(42.0, name='my_const')
       counter_var = variables.Variable(0.0, name='my_counter')
       assign_add = state_ops.assign_add(counter_var, 1.0, name='my_assign_add')
@@ -568,7 +568,7 @@ class MonitorsTest(test.TestCase):
   def test_capture_variable(self):
     monitor = learn.monitors.CaptureVariable(
         var_name='my_assign_add:0', every_n=8, first_n=2)
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       var = variables.Variable(0.0, name='my_var')
       var.initializer.run()
       state_ops.assign_add(var, 1.0, name='my_assign_add')
diff --git a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
index 7ce5fb2da6..2f33a2b74d 100644
--- a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
+++ b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
@@ -950,7 +950,7 @@ class Seq2SeqTest(test.TestCase):
     num_dec_timesteps = 3
 
     def TestModel(seq2seq):
-      with self.test_session(graph=ops.Graph()) as sess:
+      with self.session(graph=ops.Graph()) as sess:
         random_seed.set_random_seed(111)
         random.seed(111)
         np.random.seed(111)
diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py
index a262a099cf..cbe4c03e4d 100644
--- a/tensorflow/contrib/linalg/__init__.py
+++ b/tensorflow/contrib/linalg/__init__.py
@@ -14,7 +14,8 @@
 # ==============================================================================
 """Linear algebra libraries.
 
-See the @{$python/contrib.linalg} guide.
+See the[Contrib Linalg](https://tensorflow.org/api_guides/python/contrib.linalg)
+guide.
 
 @@LinearOperator
 @@LinearOperatorBlockDiag
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index 9872c6f97c..8ebe45d851 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -158,7 +158,7 @@ class SDCAOptimizer(object):
           # exactly 2 (i.e., its shape should be [batch_size, column.dim]).
           check_rank_op = control_flow_ops.Assert(
               math_ops.less_equal(array_ops.rank(transformed_tensor), 2),
-              ['transformed_tensor shouls have rank at most 2.'])
+              ['transformed_tensor should have rank at most 2.'])
           # Reshape to [batch_size, dense_column_dimension].
           with ops.control_dependencies([check_rank_op]):
             transformed_tensor = array_ops.reshape(transformed_tensor, [
@@ -172,7 +172,7 @@ class SDCAOptimizer(object):
         elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
           # A bucketized column corresponds to a sparse feature in SDCA. The
           # bucketized feature is "sparsified" for SDCA by converting it to a
-          # SparseFeatureColumn respresenting the one-hot encoding of the
+          # SparseFeatureColumn representing the one-hot encoding of the
           # bucketized feature.
           #
           # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
@@ -220,7 +220,7 @@ class SDCAOptimizer(object):
           # occur multiple times for a single example.
           projected_ids = projection_length * example_ids + flat_ids
 
-          # Remove any redudant ids.
+          # Remove any redundant ids.
           ids, idx = array_ops.unique(projected_ids)
           # Keep only one example id per duplicated ids.
           example_ids_filtered = math_ops.unsorted_segment_min(
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 1e6f1e7da2..0091587bf7 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -154,6 +154,14 @@ cc_library(
         "optional_debug_tools.h",
     ],
     copts = tflite_copts(),
+    linkopts = [
+    ] + select({
+        "//tensorflow:android": [
+            "-llog",
+        ],
+        "//conditions:default": [
+        ],
+    }),
     deps = [
         ":arena_planner",
         ":builtin_op_data",
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 81844756bc..458a50f25c 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -227,6 +227,8 @@ def generated_test_models():
         "constant",
         "control_dep",
         "conv",
+        "conv_with_shared_weights",
+        "conv_to_depthwiseconv_with_shared_weights",
         "depthwiseconv",
         "div",
         "equal",
@@ -265,6 +267,7 @@ def generated_test_models():
         "prelu",
         "pow",
         "reduce_max",
+        "reduce_min",
         "reduce_prod",
         "relu",
         "relu1",
@@ -290,6 +293,7 @@ def generated_test_models():
         "topk",
         "transpose",
         #"transpose_conv",   # disabled due to b/111213074
+        "unpack",
         "where",
     ]
 
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 70178b2faa..e81f9e4f51 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -286,6 +286,11 @@ typedef struct {
   int axis;
 } TfLiteOneHotParams;
 
+typedef struct {
+  int num;
+  int axis;
+} TfLiteUnpackParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 8a8eb98568..9cf4bea73e 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -113,6 +113,10 @@ typedef enum {
   kTfLiteBuiltinOneHot = 85,
   kTfLiteBuiltinLogicalAnd = 86,
   kTfLiteBuiltinLogicalNot = 87,
+  kTfLiteBuiltinUnpack = 88,
+  kTfLiteBuiltinReduceMin = 89,
+  kTfLiteBuiltinFloorDiv = 90,
+  kTfLiteBuiltinReduceAny = 91,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 5bc20106d3..c7f4df3cdc 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -29,9 +29,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
 #define TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
 
-#if defined(_MSC_VER)
-#include <complex.h>
-#endif
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -49,7 +46,8 @@ typedef enum { kTfLiteOk = 0, kTfLiteError = 1 } TfLiteStatus;
 typedef enum {
   kTfLiteEigenContext = 0,     // include eigen_support.h to use.
   kTfLiteGemmLowpContext = 1,  // include gemm_support.h to use.
-  kTfLiteMaxExternalContexts = 2
+  kTfLiteEdgeTpuContext = 2,   // Placeholder for Edge TPU support.
+  kTfLiteMaxExternalContexts = 3
 } TfLiteExternalContextType;
 
 // An external context is a collection of information unrelated to the TF Lite
@@ -152,6 +150,11 @@ void TfLiteIntArrayFree(TfLiteIntArray* v);
     }                                      \
   } while (0)
 
+// Single-precision complex data type compatible with the C99 definition.
+typedef struct {
+  float re, im;  // real and imaginary parts, respectively.
+} TfLiteComplex64;
+
 // Types supported by tensor
 typedef enum {
   kTfLiteNoType = 0,
@@ -183,11 +186,7 @@ typedef union {
   uint8_t* uint8;
   bool* b;
   int16_t* i16;
-#if defined(_MSC_VER)
-  _Fcomplex* c64;
-#else
-  _Complex float* c64;
-#endif
+  TfLiteComplex64* c64;
 } TfLitePtrUnion;
 
 // Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
@@ -452,13 +451,15 @@ typedef struct _TfLiteDelegate {
 
   // Copy the data from delegate buffer handle to raw memory.
   // This can be null if the delegate doesn't use its own buffer.
-  TfLiteStatus (*CopyFromBufferHandle)(TfLiteDelegate* delegate,
+  TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context,
+                                       TfLiteDelegate* delegate,
                                        TfLiteBufferHandle buffer_handle,
                                        void* data, size_t size);
 
   // Copy the data from raw memory to delegate buffer handle.
   // This can be null if the delegate doesn't use its own buffer.
-  TfLiteStatus (*CopyToBufferHandle)(TfLiteDelegate* delegate,
+  TfLiteStatus (*CopyToBufferHandle)(TfLiteContext* context,
+                                     TfLiteDelegate* delegate,
                                      TfLiteBufferHandle buffer_handle,
                                      void* data, size_t size);
 
@@ -466,7 +467,7 @@ typedef struct _TfLiteDelegate {
   // this doesn't release the underlying resource (e.g. textures). The
   // resources are either owned by application layer or the delegate.
   // This can be null if the delegate doesn't use its own buffer.
-  void (*FreeBufferHandle)(TfLiteDelegate* delegate,
+  void (*FreeBufferHandle)(TfLiteContext* context, TfLiteDelegate* delegate,
                            TfLiteBufferHandle* handle);
 } TfLiteDelegate;
 
diff --git a/tensorflow/contrib/lite/delegates/eager/BUILD b/tensorflow/contrib/lite/delegates/eager/BUILD
index bb518becc5..88c70fbb8a 100644
--- a/tensorflow/contrib/lite/delegates/eager/BUILD
+++ b/tensorflow/contrib/lite/delegates/eager/BUILD
@@ -16,20 +16,22 @@ cc_library(
     deps = [
         ":util",
         "//tensorflow/c:c_api_internal",
-        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:framework",
+            "//tensorflow/core:protos_all_cc",
+        ],
+    }),
 )
 
 tf_cc_test(
     name = "buffer_map_test",
     size = "small",
     srcs = ["buffer_map_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":buffer_map",
         "//tensorflow/contrib/lite:framework",
@@ -52,20 +54,22 @@ cc_library(
         ":delegate_data",
         ":kernel",
         ":util",
-        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:kernel_api",
         "//tensorflow/contrib/lite:util",
-        "//tensorflow/core:lib",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:lib",
+        ],
+    }),
 )
 
 tf_cc_test(
     name = "delegate_test",
     size = "small",
     srcs = ["delegate_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":delegate",
         ":test_util",
@@ -80,19 +84,22 @@ cc_library(
     hdrs = ["delegate_data.h"],
     deps = [
         ":buffer_map",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:lib",
         "//tensorflow/core/common_runtime/eager:context",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:lib",
+        ],
+    }),
 )
 
 tf_cc_test(
     name = "delegate_data_test",
     size = "small",
     srcs = ["delegate_data_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":delegate_data",
         "//tensorflow/contrib/lite:framework",
@@ -109,25 +116,31 @@ cc_library(
     deps = [
         ":delegate_data",
         ":util",
-        "//tensorflow/contrib/lite:framework",
+        "@flatbuffers",
         "//tensorflow/contrib/lite:kernel_api",
         "//tensorflow/contrib/lite:string",
         "//tensorflow/contrib/lite/kernels:kernel_util",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/common_runtime/eager:context",
         "//tensorflow/core/common_runtime/eager:execute",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
-        "@flatbuffers",
-    ],
+    ] + select({
+        # TODO(b/111881878): The android_tensorflow_lib target pulls in the full
+        # set of core TensorFlow kernels. We may want to revisit this dependency
+        # to allow selective registration via build targets.
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:protos_all_cc",
+            "//tensorflow/core:framework",
+        ],
+    }),
 )
 
 tf_cc_test(
     name = "kernel_test",
     size = "small",
     srcs = ["kernel_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":delegate_data",
         ":kernel",
@@ -155,22 +168,23 @@ cc_library(
     srcs = ["util.cc"],
     hdrs = ["util.h"],
     deps = [
-        ":constants",
         "//tensorflow/c:c_api_internal",
-        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-    ],
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite_no_runtime",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:lib",
+            "//tensorflow/core:framework",
+        ],
+    }),
 )
 
 tf_cc_test(
     name = "util_test",
     size = "small",
     srcs = ["util_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":util",
         "//tensorflow/contrib/lite:string",
@@ -178,8 +192,3 @@ tf_cc_test(
         "@com_google_googletest//:gtest",
     ],
 )
-
-cc_library(
-    name = "constants",
-    hdrs = ["constants.h"],
-)
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate.cc b/tensorflow/contrib/lite/delegates/eager/delegate.cc
index 7d22b45419..45fc158157 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate.cc
+++ b/tensorflow/contrib/lite/delegates/eager/delegate.cc
@@ -55,17 +55,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
   return kTfLiteOk;
 }
 
-TfLiteStatus CopyFromBufferHandle(TfLiteDelegate* delegate,
+TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
+                                  TfLiteDelegate* delegate,
                                   TfLiteBufferHandle buffer_handle, void* data,
                                   size_t size) {
-  // TODO(nupurgarg): Make BufferMap unique to each interpreter in order to
-  // support multiple interpreters using a single delegate.
   BufferMap* buffer_map =
-      reinterpret_cast<DelegateData*>(delegate->data_)->GetBufferMap();
+      reinterpret_cast<DelegateData*>(delegate->data_)->GetBufferMap(context);
 
-  // TODO(nupurgarg): Use TfLiteContext's ReportError instead of fprinf.
   if (!buffer_map->HasTensor(buffer_handle)) {
-    fprintf(stderr, "Invalid tensor index %d.\n", buffer_handle);
+    context->ReportError(context, "Invalid tensor index %d.", buffer_handle);
     return kTfLiteError;
   }
 
@@ -73,7 +71,8 @@ TfLiteStatus CopyFromBufferHandle(TfLiteDelegate* delegate,
   tensorflow::StringPiece t_data = t.tensor_data();
 
   if (size != t_data.size()) {
-    fprintf(stderr, "Not enough space to store TensorFlow's aligned buffer.\n");
+    context->ReportError(
+        context, "Not enough space to store TensorFlow's aligned buffer.");
     return kTfLiteError;
   }
 
@@ -84,27 +83,26 @@ TfLiteStatus CopyFromBufferHandle(TfLiteDelegate* delegate,
 }  // namespace delegate
 }  // namespace eager
 
-EagerDelegate::EagerDelegate() {}
-
-EagerDelegate::~EagerDelegate() {}
-
-TfLiteStatus EagerDelegate::Apply(Interpreter* interpreter) {
-  if (!delegate_) {
-    if (!eager::DelegateData::Create(&delegate_data_).ok()) {
-      fprintf(stderr, "Unable to initialize TensorFlow context.\n");
-      return kTfLiteError;
-    }
-
-    delegate_.reset(new TfLiteDelegate{
-        /*data_=*/delegate_data_.get(),
-        /*nullptr,*/ &eager::delegate::Prepare,
-        /*CopyFromBufferHandle=*/&eager::delegate::CopyFromBufferHandle,
-        /*CopyToBufferHandle=*/nullptr,
-        /*FreeBufferHandle=*/nullptr});
+std::unique_ptr<EagerDelegate> EagerDelegate::Create() {
+  std::unique_ptr<eager::DelegateData> delegate_data;
+  if (!eager::DelegateData::Create(&delegate_data).ok()) {
+    fprintf(stderr, "Unable to initialize TensorFlow context.\n");
+    return nullptr;
   }
 
-  return interpreter->ModifyGraphWithDelegate(delegate_.get(),
-                                              /*allow_dynamic_tensors=*/true);
+  return std::unique_ptr<EagerDelegate>(
+      new EagerDelegate(std::move(delegate_data)));
 }
 
+EagerDelegate::EagerDelegate(std::unique_ptr<eager::DelegateData> delegate_data)
+    : TfLiteDelegate{
+          /*data_=*/delegate_data.get(),
+          /*nullptr,*/ &eager::delegate::Prepare,
+          /*CopyFromBufferHandle=*/&eager::delegate::CopyFromBufferHandle,
+          /*CopyToBufferHandle=*/nullptr,
+          /*FreeBufferHandle=*/nullptr},
+      delegate_data_(std::move(delegate_data)) {}
+
+EagerDelegate::~EagerDelegate() {}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate.h b/tensorflow/contrib/lite/delegates/eager/delegate.h
index 0defca7c32..6d15ba47dc 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate.h
+++ b/tensorflow/contrib/lite/delegates/eager/delegate.h
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
 
 namespace tflite {
 
@@ -26,28 +25,33 @@ namespace tflite {
 // executed by TensorFlow's runtime via Eager.
 //
 // The interpreter must be constructed after the EagerDelegate and destructed
-// before the EagerDelegate. This delegate can only be used with one
-// interpreter.
+// before the EagerDelegate. This delegate may be used with multiple
+// interpreters, but it is *not* thread-safe.
 //
 // Usage:
-//   EagerDelegate delegate;
+//   auto delegate = EagerDelegate::Create();
 //   ... build interpreter ...
 //
-//   delegate.Apply(interpreter);
+//   if (delegate) {
+//     interpreter->ModifyGraphWithDelegate(
+//         delegate.get(), /*allow_dynamic_tensors=*/true);
+//   }
 //   ... run inference ...
 //   ... destroy interpreter ...
 //   ... destroy delegate ...
-class EagerDelegate {
+class EagerDelegate : public TfLiteDelegate {
  public:
-  EagerDelegate();
-  ~EagerDelegate();
+  // Creates a delegate that supports TF ops.
+  //
+  // If the underyling TF Eager context creation fails, returns null.
+  static std::unique_ptr<EagerDelegate> Create();
 
-  // Modifies the graph loaded in the interpreter.
-  TfLiteStatus Apply(Interpreter* interpreter);
+  ~EagerDelegate();
 
  private:
+  explicit EagerDelegate(std::unique_ptr<eager::DelegateData> delegate_data);
+
   std::unique_ptr<eager::DelegateData> delegate_data_;
-  std::unique_ptr<TfLiteDelegate> delegate_;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data.h b/tensorflow/contrib/lite/delegates/eager/delegate_data.h
index 8a0e8ba8bf..772d26f44e 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data.h
+++ b/tensorflow/contrib/lite/delegates/eager/delegate_data.h
@@ -32,14 +32,18 @@ class DelegateData {
   // The EagerContext that is required for execution of Eager Ops.
   tensorflow::EagerContext* GetEagerContext() { return eager_context_.get(); }
 
-  // Map from TF Lite tensor index to TensorFlow tensor.
-  BufferMap* GetBufferMap() { return &buffer_map_; }
+  // Map from TF Lite tensor index to TensorFlow tensor for a given context.
+  BufferMap* GetBufferMap(const TfLiteContext* context) {
+    return &buffer_map_[context];
+  }
 
  private:
   explicit DelegateData(tensorflow::EagerContext* eager_context);
 
   std::unique_ptr<tensorflow::EagerContext> eager_context_;
-  BufferMap buffer_map_;
+  // TODO(b/112439500): Clean up stale BufferMap instances after adding the
+  // necessary cleanup hook from a TfLiteContext to a TfLiteDelegate.
+  std::unordered_map<const TfLiteContext*, BufferMap> buffer_map_;
 };
 
 }  // namespace eager
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc b/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc
index 30251b8f82..b3a0ffcec1 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/delegate_data_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
@@ -29,8 +30,12 @@ TEST(DelegateDataTest, Basic) {
   // binary.
   EXPECT_TRUE(DelegateData::Create(&data).ok());
 
+  TfLiteContext dummy_context1 = {};
+  TfLiteContext dummy_context2 = {};
   EXPECT_NE(data->GetEagerContext(), nullptr);
-  EXPECT_NE(data->GetBufferMap(), nullptr);
+  EXPECT_NE(data->GetBufferMap(&dummy_context1), nullptr);
+  EXPECT_NE(data->GetBufferMap(&dummy_context1),
+            data->GetBufferMap(&dummy_context2));
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_test.cc b/tensorflow/contrib/lite/delegates/eager/delegate_test.cc
index 88fb34044e..eb47f46c0b 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/delegate_test.cc
@@ -25,26 +25,24 @@ namespace {
 using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
 
-// TODO(nupurgarg): Add a test with multiple interpreters for one delegate.
-
 class DelegateTest : public testing::EagerModelTest {
  public:
   DelegateTest() {
-    // The delegate needs to be constructed before the interpreter because the
-    // interpreter references data contained in the delegate.
-    delegate_.reset(new EagerDelegate());
+    delegate_ = EagerDelegate::Create();
     interpreter_.reset(new Interpreter(&error_reporter_));
   }
 
   ~DelegateTest() override {
     // The delegate needs to be destructed after the interpreter because the
     // interpreter references data contained in the delegate.
-    delete interpreter_.release();
-    delete delegate_.release();
+    interpreter_.reset();
+    delegate_.reset();
   }
 
   void ConfigureDelegate() {
-    CHECK(delegate_->Apply(interpreter_.get()) == kTfLiteOk);
+    ASSERT_EQ(interpreter_->ModifyGraphWithDelegate(
+                  delegate_.get(), /*allow_dynamic_tensors=*/true),
+              kTfLiteOk);
   }
 
  private:
@@ -139,6 +137,56 @@ TEST_F(DelegateTest, OnlyTFLite) {
   ASSERT_THAT(GetValues(2), ElementsAre(1.1f, 4.4f, 9.9f, 17.6f));
 }
 
+TEST_F(DelegateTest, MultipleInterpretersSameDelegate) {
+  // Build a graph, configure the delegate and set inputs.
+  {
+    AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
+    AddTfOp(testing::kUnpack, {0}, {1, 2});
+    AddTfOp(testing::kUnpack, {3}, {4, 5});
+    AddTfOp(testing::kAdd, {1, 4}, {6});
+    AddTfOp(testing::kAdd, {2, 5}, {7});
+    AddTfOp(testing::kMul, {6, 7}, {8});
+    ConfigureDelegate();
+    SetShape(0, {2, 2, 1});
+    SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+    SetShape(3, {2, 2, 1});
+    SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+  }
+
+  // Create a new interpreter, inject into the test framework and build
+  // a different graph using the *same* delegate.
+  std::unique_ptr<Interpreter> interpreter(new Interpreter(&error_reporter_));
+  interpreter_.swap(interpreter);
+  {
+    AddTensors(10, {0}, {9}, kTfLiteFloat32, {3});
+    AddTfOp(testing::kUnpack, {0}, {1, 2});
+    AddTfOp(testing::kAdd, {1, 2}, {3});
+    AddTfOp(testing::kUnpack, {3}, {4, 5});
+    AddTfLiteMulOp({4, 5}, {6});
+    AddTfOp(testing::kUnpack, {6}, {7, 8});
+    AddTfOp(testing::kAdd, {7, 8}, {9});
+    ConfigureDelegate();
+    SetShape(0, {2, 2, 2, 1});
+    SetValues(0, {3.0f, 1.0f, 0.5f, -1.0f, 0.0f, 1.0f, 1.5f, 3.0f});
+  }
+
+  // Swap back in the first interpreter and validate inference.
+  interpreter_.swap(interpreter);
+  {
+    ASSERT_TRUE(Invoke());
+    EXPECT_THAT(GetShape(8), ElementsAre(2, 1));
+    EXPECT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+  }
+
+  // Swap in the second interpreter and validate inference.
+  interpreter_.swap(interpreter);
+  {
+    ASSERT_TRUE(Invoke());
+    EXPECT_THAT(GetShape(9), ElementsAre(1));
+    EXPECT_THAT(GetValues(9), ElementsAre(10.0f));
+  }
+}
+
 }  // namespace
 }  // namespace eager
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.cc b/tensorflow/contrib/lite/delegates/eager/kernel.cc
index 1bd17a3bca..f8467c7cb2 100644
--- a/tensorflow/contrib/lite/delegates/eager/kernel.cc
+++ b/tensorflow/contrib/lite/delegates/eager/kernel.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/delegates/eager/kernel.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/builtin_ops.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/context_util.h"
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/execute.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 
 // Note: this is part of TF Lite's Eager delegation code which is to be
 // completed soon.
@@ -150,8 +151,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   op_data->eager_context =
       reinterpret_cast<DelegateData*>(params->delegate->data_)
           ->GetEagerContext();
-  op_data->buffer_map =
-      reinterpret_cast<DelegateData*>(params->delegate->data_)->GetBufferMap();
+  op_data->buffer_map = reinterpret_cast<DelegateData*>(params->delegate->data_)
+                            ->GetBufferMap(context);
 
   CHECK(params->output_tensors);
   for (auto tensor_index : TfLiteIntArrayView(params->output_tensors)) {
@@ -189,6 +190,14 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
       }
     }
 
+    // Fill NodeDef with defaults if it's a valid op.
+    const tensorflow::OpRegistrationData* op_reg_data;
+    auto tf_status = tensorflow::OpRegistry::Global()->LookUp(
+        node_data.nodedef.op(), &op_reg_data);
+    if (tf_status.ok()) {
+      AddDefaultsToNodeDef(op_reg_data->op_def, &node_data.nodedef);
+    }
+
     for (auto input_index : TfLiteIntArrayView(node->inputs)) {
       node_data.inputs.push_back(input_index);
     }
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
index b7bfbb34e4..66f2226626 100644
--- a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
@@ -55,12 +55,14 @@ class KernelTest : public testing::EagerModelTest {
     delegate_.data_ = delegate_data_.get();
     delegate_.FreeBufferHandle = nullptr;
     delegate_.Prepare = prepare_function;
-    delegate_.CopyFromBufferHandle = [](TfLiteDelegate* delegate,
+    delegate_.CopyFromBufferHandle = [](TfLiteContext* context,
+                                        TfLiteDelegate* delegate,
                                         TfLiteBufferHandle buffer_handle,
                                         void* data, size_t size) {
       auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
-      tensorflow::StringPiece values =
-          delegate_data->GetBufferMap()->GetTensor(buffer_handle).tensor_data();
+      tensorflow::StringPiece values = delegate_data->GetBufferMap(context)
+                                           ->GetTensor(buffer_handle)
+                                           .tensor_data();
       memcpy(data, values.data(), values.size());
       return kTfLiteOk;
     };
diff --git a/tensorflow/contrib/lite/delegates/eager/test_util.cc b/tensorflow/contrib/lite/delegates/eager/test_util.cc
index 26d96acc82..b8c9e2652a 100644
--- a/tensorflow/contrib/lite/delegates/eager/test_util.cc
+++ b/tensorflow/contrib/lite/delegates/eager/test_util.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/delegates/eager/test_util.h"
 
 #include "absl/memory/memory.h"
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/string.h"
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/delegates/eager/util.cc b/tensorflow/contrib/lite/delegates/eager/util.cc
index c8aa0b7f69..4426c653e6 100644
--- a/tensorflow/contrib/lite/delegates/eager/util.cc
+++ b/tensorflow/contrib/lite/delegates/eager/util.cc
@@ -13,16 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/delegates/eager/util.h"
-#include "tensorflow/contrib/lite/delegates/eager/constants.h"
 
 namespace tflite {
 namespace eager {
 
-bool IsEagerOp(const char* custom_name) {
-  return custom_name && strncmp(custom_name, kCustomCodePrefix,
-                                strlen(kCustomCodePrefix)) == 0;
-}
-
 TfLiteStatus ConvertStatus(TfLiteContext* context,
                            const tensorflow::Status& status) {
   if (!status.ok()) {
diff --git a/tensorflow/contrib/lite/delegates/eager/util.h b/tensorflow/contrib/lite/delegates/eager/util.h
index b7363361be..a9407be071 100644
--- a/tensorflow/contrib/lite/delegates/eager/util.h
+++ b/tensorflow/contrib/lite/delegates/eager/util.h
@@ -23,10 +23,6 @@ limitations under the License.
 namespace tflite {
 namespace eager {
 
-// Checks whether the prefix of the custom name indicates the operation is an
-// Eager operation.
-bool IsEagerOp(const char* custom_name);
-
 // Converts a tensorflow:Status into a TfLiteStatus. If the original status
 // represented an error, reports it using the given 'context'.
 TfLiteStatus ConvertStatus(TfLiteContext* context,
diff --git a/tensorflow/contrib/lite/delegates/eager/util_test.cc b/tensorflow/contrib/lite/delegates/eager/util_test.cc
index 541d0b1701..53378a1eaf 100644
--- a/tensorflow/contrib/lite/delegates/eager/util_test.cc
+++ b/tensorflow/contrib/lite/delegates/eager/util_test.cc
@@ -103,16 +103,6 @@ TEST(UtilTest, TypeConversions) {
   EXPECT_EQ(TF_BOOL, GetTensorFlowDataType(kTfLiteBool));
 }
 
-TEST(UtilTest, IsEagerOp) {
-  EXPECT_TRUE(IsEagerOp("Eager"));
-  EXPECT_TRUE(IsEagerOp("EagerOp"));
-  EXPECT_FALSE(IsEagerOp("eager"));
-  EXPECT_FALSE(IsEagerOp("Eage"));
-  EXPECT_FALSE(IsEagerOp("OpEager"));
-  EXPECT_FALSE(IsEagerOp(nullptr));
-  EXPECT_FALSE(IsEagerOp(""));
-}
-
 }  // namespace
 }  // namespace eager
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
index 3224b23a0c..720d6b741e 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -1968,16 +1968,19 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
     weights_feature_ = AddInput(weights_feature_type);
     weights_time_ = AddInput(weights_time_type);
     bias_ = AddNullInput();
-    state_ = AddOutput(TensorType_FLOAT32);
+    const int num_filters = units * rank;
+    activation_state_ = AddInput(
+        TensorData{TensorType_FLOAT32, {batches, memory_size * num_filters}});
     output_ = AddOutput(TensorType_FLOAT32);
     SetBuiltinOp(
         BuiltinOperator_SVDF, BuiltinOptions_SVDFOptions,
         CreateSVDFOptions(builder_, rank, ActivationFunctionType_NONE).Union());
     BuildInterpreter({
-        {batches_, input_size_},        // Input tensor
-        {units_ * rank, input_size_},   // weights_feature tensor
-        {units_ * rank, memory_size_},  // weights_time tensor
-        {units_}                        // bias tensor
+        {batches_, input_size_},              // input tensor
+        {units_ * rank, input_size_},         // weights_feature tensor
+        {units_ * rank, memory_size_},        // weights_time tensor
+        {units_},                             // bias tensor
+        {batches, memory_size * num_filters}  // activation_state tensor
     });
   }
 
@@ -1996,15 +1999,6 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
     PopulateTensor(input_, offset, begin, end);
   }
 
-  // Resets the state of SVDF op by filling it with 0's.
-  void ResetState() {
-    const int zero_buffer_size = rank_ * units_ * batches_ * memory_size_;
-    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
-    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
-    PopulateTensor(state_, 0, zero_buffer.get(),
-                   zero_buffer.get() + zero_buffer_size);
-  }
-
   // Extracts the output tensor from the SVDF op.
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 
@@ -2017,7 +2011,7 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
   int weights_feature_;
   int weights_time_;
   int bias_;
-  int state_;
+  int activation_state_;
   int output_;
 
   int batches_;
@@ -2061,7 +2055,7 @@ class SVDFOpModel : public BaseSVDFOpModel {
   }
 };
 
-TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) {
+TEST(NNAPIDelegate, DISABLED_SVDFBlackBoxTestRank1) {
   SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
                    /*memory_size=*/10, /*rank=*/1);
   svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
@@ -2081,11 +2075,10 @@ TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) {
        -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
        -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657});
 
-  svdf.ResetState();
   svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input));
 }
 
-TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) {
+TEST(NNAPIDelegate, DISABLED_SVDFBlackBoxTestRank2) {
   SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
                    /*memory_size=*/10, /*rank=*/2);
   svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
@@ -2120,7 +2113,6 @@ TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) {
        0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
        0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
 
-  svdf.ResetState();
   svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input));
 }
 
@@ -2442,7 +2434,8 @@ class NoCifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest {
   }
 };
 
-TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       DISABLED_LstmBlackBoxTest) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -2549,7 +2542,8 @@ class CifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest {
   }
 };
 
-TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
+TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest,
+       DISABLED_LstmBlackBoxTest) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -3208,7 +3202,7 @@ class NoCifgPeepholeProjectionClippingLstmTest : public BaseLstmTest {
   }
 };
 
-TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) {
+TEST_F(NoCifgPeepholeProjectionClippingLstmTest, DISABLED_LstmBlackBoxTest) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 20;
diff --git a/tensorflow/contrib/lite/examples/android/build.gradle b/tensorflow/contrib/lite/examples/android/build.gradle
index a47fa4bbf6..66a62a921a 100644
--- a/tensorflow/contrib/lite/examples/android/build.gradle
+++ b/tensorflow/contrib/lite/examples/android/build.gradle
@@ -14,6 +14,7 @@ buildscript {
 
 allprojects {
     repositories {
+        google()
         jcenter()
     }
 }
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
index 30fee64a6f..734b15e0a1 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -26,7 +26,7 @@
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
 
 #define LOG(x) std::cerr
 
diff --git a/tensorflow/contrib/lite/examples/ios/camera/Podfile b/tensorflow/contrib/lite/examples/ios/camera/Podfile
index cd8c39043f..8084307ac7 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/Podfile
+++ b/tensorflow/contrib/lite/examples/ios/camera/Podfile
@@ -2,4 +2,4 @@ platform :ios, '8.0'
 inhibit_all_warnings!
 
 target 'tflite_camera_example'
-       pod 'TensorFlowLite', '0.1.7'
+       pod 'TensorFlowLite', '1.10.0'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/Podfile b/tensorflow/contrib/lite/examples/ios/simple/Podfile
index c885398f44..eea7ecb759 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/Podfile
+++ b/tensorflow/contrib/lite/examples/ios/simple/Podfile
@@ -2,4 +2,4 @@ platform :ios, '8.0'
 inhibit_all_warnings!
 
 target 'tflite_simple_example'
-       pod 'TensorFlowLite', '0.1.7'
+       pod 'TensorFlowLite', '1.10.0'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
index 0ab7aa25d0..650c73f732 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
@@ -25,7 +25,7 @@
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
 
 #include "ios_image_load.h"
 
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
index 98934ce41d..96d2810937 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
+++ b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
-#define TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
 
 #include <vector>
 
 std::vector<uint8_t> LoadImageFromFile(const char* file_name, int* out_width,
                                        int* out_height, int* out_channels);
 
-#endif  // TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
index 5fc75b1f72..7881ee80ca 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
@@ -39,4 +39,4 @@ template void resize<float>(float*, unsigned char*, int, int, int, int, int,
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/get_top_n.h b/tensorflow/contrib/lite/examples/label_image/get_top_n.h
index 70a7586fe6..adef434c00 100644
--- a/tensorflow/contrib/lite/examples/label_image/get_top_n.h
+++ b/tensorflow/contrib/lite/examples/label_image/get_top_n.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
 
 #include "tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h"
 
@@ -35,4 +35,4 @@ template void get_top_n<float>(float*, int, size_t, float,
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h b/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h
index e416fbd39b..708cf2f2b1 100644
--- a/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h
+++ b/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
 
 #include <algorithm>
 #include <queue>
@@ -67,4 +67,4 @@ void get_top_n(T* prediction, int prediction_size, size_t num_results,
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h
index 34c223f713..f0be881b58 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.h
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
 
 #include "tensorflow/contrib/lite/string.h"
 
@@ -40,4 +40,4 @@ struct Settings {
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
index b6905b5fbf..676783063d 100644
--- a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
@@ -29,15 +29,16 @@ namespace TensorFlowLite
   {
     private const string TensorFlowLibrary = "tensorflowlite_c";
 
-    private TFL_Interpreter handle;
+    private TFL_Model model;
+    private TFL_Interpreter interpreter;
 
     public Interpreter(byte[] modelData) {
       GCHandle modelDataHandle = GCHandle.Alloc(modelData, GCHandleType.Pinned);
       IntPtr modelDataPtr = modelDataHandle.AddrOfPinnedObject();
-      TFL_Model model = TFL_NewModel(modelDataPtr, modelData.Length);
-      handle = TFL_NewInterpreter(model, /*options=*/IntPtr.Zero);
-      TFL_DeleteModel(model);
-      if (handle == IntPtr.Zero) throw new Exception("Failed to create TensorFlowLite Interpreter");
+      model = TFL_NewModel(modelDataPtr, modelData.Length);
+      if (model == IntPtr.Zero) throw new Exception("Failed to create TensorFlowLite Model");
+      interpreter = TFL_NewInterpreter(model, /*options=*/IntPtr.Zero);
+      if (interpreter == IntPtr.Zero) throw new Exception("Failed to create TensorFlowLite Interpreter");
     }
 
     ~Interpreter() {
@@ -45,43 +46,45 @@ namespace TensorFlowLite
     }
 
     public void Dispose() {
-      if (handle != IntPtr.Zero) TFL_DeleteInterpreter(handle);
-      handle = IntPtr.Zero;
+      if (interpreter != IntPtr.Zero) TFL_DeleteInterpreter(interpreter);
+      interpreter = IntPtr.Zero;
+      if (model != IntPtr.Zero) TFL_DeleteModel(model);
+      model = IntPtr.Zero;
     }
 
     public void Invoke() {
-      ThrowIfError(TFL_InterpreterInvoke(handle));
+      ThrowIfError(TFL_InterpreterInvoke(interpreter));
     }
 
     public int GetInputTensorCount() {
-      return TFL_InterpreterGetInputTensorCount(handle);
+      return TFL_InterpreterGetInputTensorCount(interpreter);
     }
 
     public void SetInputTensorData(int inputTensorIndex, Array inputTensorData) {
       GCHandle tensorDataHandle = GCHandle.Alloc(inputTensorData, GCHandleType.Pinned);
       IntPtr tensorDataPtr = tensorDataHandle.AddrOfPinnedObject();
-      TFL_Tensor tensor = TFL_InterpreterGetInputTensor(handle, inputTensorIndex);
+      TFL_Tensor tensor = TFL_InterpreterGetInputTensor(interpreter, inputTensorIndex);
       ThrowIfError(TFL_TensorCopyFromBuffer(
           tensor, tensorDataPtr, Buffer.ByteLength(inputTensorData)));
     }
 
     public void ResizeInputTensor(int inputTensorIndex, int[] inputTensorShape) {
       ThrowIfError(TFL_InterpreterResizeInputTensor(
-          handle, inputTensorIndex, inputTensorShape, inputTensorShape.Length));
+          interpreter, inputTensorIndex, inputTensorShape, inputTensorShape.Length));
     }
 
     public void AllocateTensors() {
-      ThrowIfError(TFL_InterpreterAllocateTensors(handle));
+      ThrowIfError(TFL_InterpreterAllocateTensors(interpreter));
     }
 
     public int GetOutputTensorCount() {
-      return TFL_InterpreterGetOutputTensorCount(handle);
+      return TFL_InterpreterGetOutputTensorCount(interpreter);
     }
 
     public void GetOutputTensorData(int outputTensorIndex, Array outputTensorData) {
       GCHandle tensorDataHandle = GCHandle.Alloc(outputTensorData, GCHandleType.Pinned);
       IntPtr tensorDataPtr = tensorDataHandle.AddrOfPinnedObject();
-      TFL_Tensor tensor = TFL_InterpreterGetOutputTensor(handle, outputTensorIndex);
+      TFL_Tensor tensor = TFL_InterpreterGetOutputTensor(interpreter, outputTensorIndex);
       ThrowIfError(TFL_TensorCopyToBuffer(
           tensor, tensorDataPtr, Buffer.ByteLength(outputTensorData)));
     }
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc
index 834d1ebd66..121997dcb2 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc
+++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <vector>
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
index 9d1e6a562f..32458305c4 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
+++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
diff --git a/tensorflow/contrib/lite/g3doc/_book.yaml b/tensorflow/contrib/lite/g3doc/_book.yaml
index 98abd5743b..1dffe30790 100644
--- a/tensorflow/contrib/lite/g3doc/_book.yaml
+++ b/tensorflow/contrib/lite/g3doc/_book.yaml
@@ -1,6 +1,7 @@
 upper_tabs:
 # Tabs left of dropdown menu
 - include: /_upper_tabs_left.yaml
+- include: /versions/_upper_tabs_versions.yaml
 # Dropdown menu
 - name: Ecosystem
   path: /ecosystem
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
index 776803da8c..f255017ad9 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # TensorFlow Lite APIs
 
diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/contrib/lite/g3doc/custom_operators.md
index d979353bb3..ee6150b60e 100644
--- a/tensorflow/contrib/lite/g3doc/custom_operators.md
+++ b/tensorflow/contrib/lite/g3doc/custom_operators.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # How to use custom operators
 
diff --git a/tensorflow/contrib/lite/g3doc/demo_android.md b/tensorflow/contrib/lite/g3doc/demo_android.md
index d79a2696b4..c38b928684 100644
--- a/tensorflow/contrib/lite/g3doc/demo_android.md
+++ b/tensorflow/contrib/lite/g3doc/demo_android.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Android Demo App
 
diff --git a/tensorflow/contrib/lite/g3doc/demo_ios.md b/tensorflow/contrib/lite/g3doc/demo_ios.md
index a554898899..7579ad84a0 100644
--- a/tensorflow/contrib/lite/g3doc/demo_ios.md
+++ b/tensorflow/contrib/lite/g3doc/demo_ios.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # iOS Demo App
 
diff --git a/tensorflow/contrib/lite/g3doc/devguide.md b/tensorflow/contrib/lite/g3doc/devguide.md
index dc9cc98c08..90e7915c52 100644
--- a/tensorflow/contrib/lite/g3doc/devguide.md
+++ b/tensorflow/contrib/lite/g3doc/devguide.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Developer Guide
 
diff --git a/tensorflow/contrib/lite/g3doc/ios.md b/tensorflow/contrib/lite/g3doc/ios.md
index d78d373ccf..5ff0412209 100644
--- a/tensorflow/contrib/lite/g3doc/ios.md
+++ b/tensorflow/contrib/lite/g3doc/ios.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # TensorFlow Lite for iOS
 
diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md
index 4ceb9a53dc..b984671e89 100644
--- a/tensorflow/contrib/lite/g3doc/models.md
+++ b/tensorflow/contrib/lite/g3doc/models.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # List of Hosted Models
 
diff --git a/tensorflow/contrib/lite/g3doc/ops_versioning.md b/tensorflow/contrib/lite/g3doc/ops_versioning.md
index b06f4fd3b8..0d571ce547 100644
--- a/tensorflow/contrib/lite/g3doc/ops_versioning.md
+++ b/tensorflow/contrib/lite/g3doc/ops_versioning.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # TensorFlow Lite Ops Versioning
 
diff --git a/tensorflow/contrib/lite/g3doc/overview.md b/tensorflow/contrib/lite/g3doc/overview.md
index be60d7941a..8cf43496df 100644
--- a/tensorflow/contrib/lite/g3doc/overview.md
+++ b/tensorflow/contrib/lite/g3doc/overview.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Introduction to TensorFlow Lite
 
diff --git a/tensorflow/contrib/lite/g3doc/performance.md b/tensorflow/contrib/lite/g3doc/performance.md
index 5cd0aab44f..28cb6aba6e 100644
--- a/tensorflow/contrib/lite/g3doc/performance.md
+++ b/tensorflow/contrib/lite/g3doc/performance.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Performance
 
diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md
index 9fcf79ba00..8ed8640582 100644
--- a/tensorflow/contrib/lite/g3doc/rpi.md
+++ b/tensorflow/contrib/lite/g3doc/rpi.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # TensorFlow Lite for Raspberry Pi
 
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index aa65ec9988..fb9d5f6787 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # TensorFlow Lite & TensorFlow Compatibility Guide
 
@@ -843,6 +841,19 @@ Outputs {
 }
 ```
 
+**UNPACK**
+
+```
+Inputs {
+  0: a tensor.
+  1: an integer.
+  2: an integer.
+}
+Outputs {
+  0-N: tensors of unpacked tensor.
+}
+```
+
 And these are TensorFlow Lite operations that are present but not ready for
 custom models yet:
 
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md b/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
index 76e16fc9db..c7cdee07de 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Building TensorFlow on Android
 
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/index.md b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
index bd047bfcec..d003bb2f38 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/index.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Overview
 
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md b/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
index 6223707892..be8b4100c8 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Building TensorFlow on iOS
 
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md b/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
index 4c2071ed05..4d4bb3bc08 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Integrating TensorFlow libraries
 
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md b/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
index a0192c3541..7436594fd8 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Optimizing for mobile
 
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md b/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
index 6b4e4a92bd..d1c67d4c61 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
@@ -1,5 +1,3 @@
-book_path: /mobile/_book.yaml
-project_path: /mobile/_project.yaml
 
 # Preparing models for mobile deployment
 
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 7a680f5c64..5ab53f4c1d 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -157,7 +157,7 @@ Interpreter::~Interpreter() {
     TfLiteTensor* tensor = &context_.tensors[i];
     if (tensor->buffer_handle != kTfLiteNullBufferHandle &&
         tensor->delegate->FreeBufferHandle != nullptr) {
-      tensor->delegate->FreeBufferHandle(tensor->delegate,
+      tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
                                          &tensor->buffer_handle);
     }
     TfLiteTensorFree(tensor);
@@ -476,6 +476,10 @@ TfLiteStatus Interpreter::ResetVariableTensorsToZero() {
   return kTfLiteOk;
 }
 
+void Interpreter::ReserveNodes(int count) {
+  nodes_and_registration_.reserve(count);
+}
+
 TfLiteStatus Interpreter::AddNodeWithParameters(
     const std::vector<int>& inputs, const std::vector<int>& outputs,
     const char* init_data, size_t init_data_size, void* builtin_data,
@@ -988,7 +992,7 @@ TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
   tensor->delegate = delegate;
   if (tensor->buffer_handle != kTfLiteNullBufferHandle) {
     TF_LITE_ENSURE(&context_, tensor->delegate->FreeBufferHandle != nullptr);
-    tensor->delegate->FreeBufferHandle(tensor->delegate,
+    tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
                                        &tensor->buffer_handle);
   }
   tensor->buffer_handle = buffer_handle;
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 159ff7bc20..2b1f1819b9 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -136,6 +136,11 @@ class Interpreter {
   // interpreter.
   TfLiteStatus SetVariables(std::vector<int> variables);
 
+  // Ensure the internal node storage memory allocates at least `count`
+  // spots for node. NOTE, this doesn't actually add operators. This is an
+  // efficiency optimization that is subject to change.
+  void ReserveNodes(int count);
+
   // Adds a node with the given parameters and returns the index of the new
   // node in `node_index` (optionally). Interpreter will take ownership of
   // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
@@ -350,7 +355,7 @@ class Interpreter {
       // This can be null if the delegate doesn't use its own buffer.
       TF_LITE_ENSURE(&context_,
                      tensor->delegate->CopyFromBufferHandle != nullptr);
-      tensor->delegate->CopyFromBufferHandle(tensor->delegate,
+      tensor->delegate->CopyFromBufferHandle(&context_, tensor->delegate,
                                              tensor->buffer_handle,
                                              tensor->data.raw, tensor->bytes);
       tensor->data_is_stale = false;
@@ -413,7 +418,12 @@ class Interpreter {
     return op_reg.profiling_string(&context_, node);
   }
 
+  // Set the value of an external context.
+  void SetExternalContext(TfLiteExternalContextType type,
+                          TfLiteExternalContext* ctx);
+
  private:
+  friend class InterpreterBuilder;
   friend class InterpreterTest;
 
   // Prevent 'context_' from accessing functions that are only available to
@@ -543,12 +553,30 @@ class Interpreter {
       struct TfLiteContext* context, TfLiteExternalContextType type);
 
   // Set the value of an external context.
-  void SetExternalContext(TfLiteExternalContextType type,
-                          TfLiteExternalContext* ctx);
   static void SetExternalContext(struct TfLiteContext* context,
                                  TfLiteExternalContextType type,
                                  TfLiteExternalContext* ctx);
 
+  using TfLiteDelegatePtr =
+      std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>;
+
+  // Variant of the public ModifyGraphWithDelegate method that additionally
+  // Assumes ownership of the provided delegate.
+  // WARNING: This is an experimental API and subject to change.
+  template <typename Delegate>
+  TfLiteStatus ModifyGraphWithDelegate(std::unique_ptr<Delegate> typed_delegate,
+                                       bool allow_dynamic_tensors = false) {
+    TfLiteDelegatePtr delegate(typed_delegate.release(),
+                               [](TfLiteDelegate* delegate) {
+                                 delete static_cast<Delegate*>(delegate);
+                               });
+    // Note that we retain ownership of the delegate even if graph modification
+    // fails, as delegate use will be in an indeterminate state at that point.
+    owned_delegates_.push_back(std::move(delegate));
+    return ModifyGraphWithDelegate(owned_delegates_.back().get(),
+                                   allow_dynamic_tensors);
+  }
+
   // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
   // capacity. Calling this function may invalidate existing pointers to
   // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
@@ -628,6 +656,11 @@ class Interpreter {
   // Whether to delegate to NN API
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
 
+  // List of delegates that have been installed and are owned by this
+  // interpreter instance. Useful if client delegate ownership is burdensome.
+  // WARNING: This is an experimental API and subject to change.
+  std::vector<TfLiteDelegatePtr> owned_delegates_;
+
   std::unique_ptr<MemoryPlanner> memory_planner_;
 
   bool allow_buffer_handle_output_ = false;
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 2bf598bad7..5bcf0927d8 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -26,6 +26,13 @@ namespace tflite {
 
 // InterpreterTest is a friend of Interpreter, so it can access context_.
 class InterpreterTest : public ::testing::Test {
+ public:
+  template <typename Delegate>
+  static TfLiteStatus ModifyGraphWithDelegate(
+      Interpreter* interpreter, std::unique_ptr<Delegate> delegate) {
+    return interpreter->ModifyGraphWithDelegate(std::move(delegate));
+  }
+
  protected:
   TfLiteContext* GetInterpreterContext() { return &interpreter_.context_; }
 
@@ -1080,21 +1087,22 @@ class TestDelegate : public ::testing::Test {
         return kTfLiteOk;
       };
       delegate_.CopyToBufferHandle =
-          [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle,
-             void* data, size_t size) -> TfLiteStatus {
+          [](TfLiteContext* context, TfLiteDelegate* delegate,
+             TfLiteBufferHandle buffer_handle, void* data,
+             size_t size) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
       delegate_.CopyFromBufferHandle =
-          [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle,
-             void* data, size_t size) -> TfLiteStatus {
+          [](TfLiteContext* context, TfLiteDelegate* delegate,
+             TfLiteBufferHandle buffer_handle, void* data,
+             size_t size) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
-      delegate_.FreeBufferHandle = [](TfLiteDelegate* delegate,
-                                      TfLiteBufferHandle* handle) {
-        *handle = kTfLiteNullBufferHandle;
-      };
+      delegate_.FreeBufferHandle =
+          [](TfLiteContext* context, TfLiteDelegate* delegate,
+             TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; };
       // Store type-punned data SimpleDelegate structure.
       delegate_.data_ = reinterpret_cast<void*>(this);
     }
@@ -1301,6 +1309,57 @@ TEST_F(TestDelegateWithDynamicTensors, AllowDynamicTensors) {
   ASSERT_EQ(interpreter_->execution_plan()[0], 1);
 }
 
+TEST(TestDelegateOwnership, ProperlyDisposed) {
+  struct TfLiteInterpreterOwnedDelegate : public TfLiteDelegate {
+    TfLiteInterpreterOwnedDelegate(bool* destroyed, bool* prepared)
+        : destroyed(destroyed), prepared(prepared) {
+      Prepare = [](TfLiteContext*, TfLiteDelegate* delegate) -> TfLiteStatus {
+        *static_cast<TfLiteInterpreterOwnedDelegate*>(delegate)->prepared =
+            true;
+        return kTfLiteOk;
+      };
+    }
+    ~TfLiteInterpreterOwnedDelegate() { *destroyed = true; }
+
+    bool* destroyed;
+    bool* prepared;
+  };
+
+  // Construct a delegate with flags for indicating preparation/destruction.
+  bool destroyed = false;
+  bool prepared = false;
+  std::unique_ptr<TfLiteInterpreterOwnedDelegate> delegate(
+      new TfLiteInterpreterOwnedDelegate(&destroyed, &prepared));
+  {
+    // Create an interpreter and assemble a simple graph.
+    Interpreter interpreter;
+    TfLiteRegistration registration = {nullptr, nullptr, nullptr, nullptr};
+    ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+    ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+    ASSERT_EQ(interpreter.SetOutputs({1}), kTfLiteOk);
+    ASSERT_EQ(interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr,
+                                                &registration),
+              kTfLiteOk);
+
+    // Pass delegate ownership to that interpreter.
+    ASSERT_EQ(InterpreterTest::ModifyGraphWithDelegate(&interpreter,
+                                                       std::move(delegate)),
+              kTfLiteOk);
+
+    // The delegate should be prepared as normal, and should be preserved.
+    EXPECT_TRUE(prepared);
+    EXPECT_FALSE(destroyed);
+
+    // Interpreter interaction should not impact the delegate's validity.
+    interpreter.AllocateTensors();
+    interpreter.Invoke();
+    EXPECT_FALSE(destroyed);
+  }
+
+  // Only after the interpreter is destroyed should the delegate be destroyed.
+  EXPECT_TRUE(destroyed);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
index 94a1ec65d6..41093e8ffe 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
@@ -15,8 +15,8 @@ limitations under the License.
 
 package org.tensorflow.lite;
 
-/** Type of elements in a {@link TfLiteTensor}. */
-enum DataType {
+/** Represents the type of elements in a TensorFlow Lite {@link Tensor} as an enum. */
+public enum DataType {
   /** 32-bit single precision floating point. */
   FLOAT32(1),
 
@@ -35,13 +35,29 @@ enum DataType {
     this.value = value;
   }
 
-  /** Corresponding value of the kTfLite* enum in the TensorFlow Lite CC API. */
-  int getNumber() {
+  /** Returns the size of an element of this type, in bytes, or -1 if element size is variable. */
+  public int byteSize() {
+    switch (this) {
+      case FLOAT32:
+        return 4;
+      case INT32:
+        return 4;
+      case UINT8:
+        return 1;
+      case INT64:
+        return 8;
+    }
+    throw new IllegalArgumentException(
+        "DataType error: DataType " + this + " is not supported yet");
+  }
+
+  /** Corresponding value of the TfLiteType enum in the TensorFlow Lite C API. */
+  int c() {
     return value;
   }
 
-  /** Converts an integer to the corresponding type. */
-  static DataType fromNumber(int c) {
+  /** Converts a C TfLiteType enum value to the corresponding type. */
+  static DataType fromC(int c) {
     for (DataType t : values) {
       if (t.value == c) {
         return t;
@@ -55,22 +71,6 @@ enum DataType {
             + ")");
   }
 
-  /** Returns byte size of the type. */
-  int elemByteSize() {
-    switch (this) {
-      case FLOAT32:
-        return 4;
-      case INT32:
-        return 4;
-      case UINT8:
-        return 1;
-      case INT64:
-        return 8;
-    }
-    throw new IllegalArgumentException(
-        "DataType error: DataType " + this + " is not supported yet");
-  }
-
   /** Gets string names of the data type. */
   String toStringName() {
     switch (this) {
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index 7002f82677..b84720ae8e 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -162,9 +162,7 @@ public final class Interpreter implements AutoCloseable {
    */
   public void runForMultipleInputsOutputs(
       @NonNull Object[] inputs, @NonNull Map<Integer, Object> outputs) {
-    if (wrapper == null) {
-      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
-    }
+    checkNotClosed();
     wrapper.run(inputs, outputs);
   }
 
@@ -174,12 +172,16 @@ public final class Interpreter implements AutoCloseable {
    * <p>IllegalArgumentException will be thrown if it fails to resize.
    */
   public void resizeInput(int idx, @NonNull int[] dims) {
-    if (wrapper == null) {
-      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
-    }
+    checkNotClosed();
     wrapper.resizeInput(idx, dims);
   }
 
+  /** Gets the number of input tensors. */
+  public int getInputTensorCount() {
+    checkNotClosed();
+    return wrapper.getInputTensorCount();
+  }
+
   /**
    * Gets index of an input given the op name of the input.
    *
@@ -187,51 +189,65 @@ public final class Interpreter implements AutoCloseable {
    * to initialize the {@link Interpreter}.
    */
   public int getInputIndex(String opName) {
-    if (wrapper == null) {
-      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
-    }
+    checkNotClosed();
     return wrapper.getInputIndex(opName);
   }
 
   /**
+   * Gets the Tensor associated with the provdied input index.
+   *
+   * <p>IllegalArgumentException will be thrown if the provided index is invalid.
+   */
+  public Tensor getInputTensor(int inputIndex) {
+    checkNotClosed();
+    return wrapper.getInputTensor(inputIndex);
+  }
+
+  /** Gets the number of output Tensors. */
+  public int getOutputTensorCount() {
+    checkNotClosed();
+    return wrapper.getOutputTensorCount();
+  }
+
+  /**
    * Gets index of an output given the op name of the output.
    *
    * <p>IllegalArgumentException will be thrown if the op name does not exist in the model file used
    * to initialize the {@link Interpreter}.
    */
   public int getOutputIndex(String opName) {
-    if (wrapper == null) {
-      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
-    }
+    checkNotClosed();
     return wrapper.getOutputIndex(opName);
   }
 
   /**
+   * Gets the Tensor associated with the provdied output index.
+   *
+   * <p>IllegalArgumentException will be thrown if the provided index is invalid.
+   */
+  public Tensor getOutputTensor(int outputIndex) {
+    checkNotClosed();
+    return wrapper.getOutputTensor(outputIndex);
+  }
+
+  /**
    * Returns native inference timing.
    * <p>IllegalArgumentException will be thrown if the model is not initialized by the
    * {@link Interpreter}.
    */
   public Long getLastNativeInferenceDurationNanoseconds() {
-    if (wrapper == null) {
-      throw new IllegalStateException("Internal error: The interpreter has already been closed.");
-    }
+    checkNotClosed();
     return wrapper.getLastNativeInferenceDurationNanoseconds();
   }
 
   /** Turns on/off Android NNAPI for hardware acceleration when it is available. */
   public void setUseNNAPI(boolean useNNAPI) {
-    if (wrapper != null) {
-      wrapper.setUseNNAPI(useNNAPI);
-    } else {
-      throw new IllegalStateException(
-          "Internal error: NativeInterpreterWrapper has already been closed.");
-    }
+    checkNotClosed();
+    wrapper.setUseNNAPI(useNNAPI);
   }
 
   public void setNumThreads(int numThreads) {
-    if (wrapper == null) {
-      throw new IllegalStateException("The interpreter has already been closed.");
-    }
+    checkNotClosed();
     wrapper.setNumThreads(numThreads);
   }
 
@@ -253,5 +269,11 @@ public final class Interpreter implements AutoCloseable {
     }
   }
 
+  private void checkNotClosed() {
+    if (wrapper == null) {
+      throw new IllegalStateException("Internal error: The Interpreter has already been closed.");
+    }
+  }
+
   NativeInterpreterWrapper wrapper;
 }
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index 767a220f8c..fa25082304 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -114,12 +114,10 @@ final class NativeInterpreterWrapper implements AutoCloseable {
       }
     }
 
-    if (!isMemoryAllocated) {
+    boolean needsAllocation = !isMemoryAllocated;
+    if (needsAllocation) {
       allocateTensors(interpreterHandle, errorHandle);
       isMemoryAllocated = true;
-      // Allocation can trigger dynamic resizing of output tensors, so clear the
-      // output tensor cache.
-      Arrays.fill(outputTensors, null);
     }
 
     for (int i = 0; i < inputs.length; ++i) {
@@ -130,6 +128,14 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     run(interpreterHandle, errorHandle);
     long inferenceDurationNanoseconds = System.nanoTime() - inferenceStartNanos;
 
+    // Allocation can trigger dynamic resizing of output tensors, so refresh all output shapes.
+    if (needsAllocation) {
+      for (int i = 0; i < outputTensors.length; ++i) {
+        if (outputTensors[i] != null) {
+          outputTensors[i].refreshShape();
+        }
+      }
+    }
     for (Map.Entry<Integer, Object> output : outputs.entrySet()) {
       getOutputTensor(output.getKey()).copyTo(output.getValue());
     }
@@ -144,8 +150,9 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   void resizeInput(int idx, int[] dims) {
     if (resizeInput(interpreterHandle, errorHandle, idx, dims)) {
       isMemoryAllocated = false;
-      // Resizing will invalidate the Tensor's shape, so invalidate the Tensor handle.
-      inputTensors[idx] = null;
+      if (inputTensors[idx] != null) {
+        inputTensors[idx].refreshShape();
+      }
     }
   }
 
@@ -230,6 +237,11 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     return getOutputQuantizationScale(interpreterHandle, index);
   }
 
+  /** Gets the number of input tensors. */
+  int getInputTensorCount() {
+    return inputTensors.length;
+  }
+
   /**
    * Gets the input {@link Tensor} for the provided input index.
    *
@@ -247,6 +259,11 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     return inputTensor;
   }
 
+  /** Gets the number of output tensors. */
+  int getOutputTensorCount() {
+    return inputTensors.length;
+  }
+
   /**
    * Gets the output {@link Tensor} for the provided output index.
    *
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 2403570c52..f174178d98 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -26,7 +26,7 @@ import java.util.Arrays;
  * <p>The native handle of a {@code Tensor} belongs to {@code NativeInterpreterWrapper}, thus not
  * needed to be closed here.
  */
-final class Tensor {
+public final class Tensor {
 
   static Tensor fromHandle(long nativeHandle) {
     return new Tensor(nativeHandle);
@@ -37,11 +37,26 @@ final class Tensor {
     return dtype;
   }
 
+  /**
+   * Returns the number of dimensions (sometimes referred to as <a
+   * href="https://www.tensorflow.org/resources/dims_types.html#rank">rank</a>) of the Tensor.
+   *
+   * <p>Will be 0 for a scalar, 1 for a vector, 2 for a matrix, 3 for a 3-dimensional tensor etc.
+   */
+  public int numDimensions() {
+    return shapeCopy.length;
+  }
+
   /** Returns the size, in bytes, of the tensor data. */
   public int numBytes() {
     return numBytes(nativeHandle);
   }
 
+  /** Returns the number of elements in a flattened (1-D) view of the tensor. */
+  public int numElements() {
+    return computeNumElements(shapeCopy);
+  }
+
   /**
    * Returns the <a href="https://www.tensorflow.org/resources/dims_types.html#shape">shape</a> of
    * the Tensor, i.e., the sizes of each dimension.
@@ -103,13 +118,22 @@ final class Tensor {
     if (isByteBuffer(input)) {
       return null;
     }
-    int[] inputShape = shapeOf(input);
+    int[] inputShape = computeShapeOf(input);
     if (Arrays.equals(shapeCopy, inputShape)) {
       return null;
     }
     return inputShape;
   }
 
+  /**
+   * Forces a refresh of the tensor's cached shape.
+   *
+   * <p>This is useful if the tensor is resized or has a dynamic shape.
+   */
+  void refreshShape() {
+    this.shapeCopy = shape(nativeHandle);
+  }
+
   /** Returns the type of the data. */
   static DataType dataTypeOf(Object o) {
     if (o != null) {
@@ -132,22 +156,31 @@ final class Tensor {
   }
 
   /** Returns the shape of an object as an int array. */
-  static int[] shapeOf(Object o) {
-    int size = numDimensions(o);
+  static int[] computeShapeOf(Object o) {
+    int size = computeNumDimensions(o);
     int[] dimensions = new int[size];
     fillShape(o, 0, dimensions);
     return dimensions;
   }
 
+  /** Returns the number of elements in a flattened (1-D) view of the tensor's shape. */
+  static int computeNumElements(int[] shape) {
+    int n = 1;
+    for (int i = 0; i < shape.length; ++i) {
+      n *= shape[i];
+    }
+    return n;
+  }
+
   /** Returns the number of dimensions of a multi-dimensional array, otherwise 0. */
-  static int numDimensions(Object o) {
+  static int computeNumDimensions(Object o) {
     if (o == null || !o.getClass().isArray()) {
       return 0;
     }
     if (Array.getLength(o) == 0) {
       throw new IllegalArgumentException("Array lengths cannot be 0.");
     }
-    return 1 + numDimensions(Array.get(o, 0));
+    return 1 + computeNumDimensions(Array.get(o, 0));
   }
 
   /** Recursively populates the shape dimensions for a given (multi-dimensional) array. */
@@ -188,7 +221,7 @@ final class Tensor {
               dtype, o.getClass().getName(), oType));
     }
 
-    int[] oShape = shapeOf(o);
+    int[] oShape = computeShapeOf(o);
     if (!Arrays.equals(oShape, shapeCopy)) {
       throw new IllegalArgumentException(
           String.format(
@@ -204,11 +237,11 @@ final class Tensor {
 
   private final long nativeHandle;
   private final DataType dtype;
-  private final int[] shapeCopy;
+  private int[] shapeCopy;
 
   private Tensor(long nativeHandle) {
     this.nativeHandle = nativeHandle;
-    this.dtype = DataType.fromNumber(dtype(nativeHandle));
+    this.dtype = DataType.fromC(dtype(nativeHandle));
     this.shapeCopy = shape(nativeHandle);
   }
 
diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.h b/tensorflow/contrib/lite/java/src/main/native/exception_jni.h
index 3ffff052df..2a4bbdbead 100644
--- a/tensorflow/contrib/lite/java/src/main/native/exception_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/exception_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_EXCEPTION_JNI_H_
-#define TENSORFLOW_CONTRIB_LITE_JAVA_EXCEPTION_JNI_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
+#define TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
 
 #include <jni.h>
 #include "tensorflow/contrib/lite/error_reporter.h"
@@ -47,4 +47,4 @@ class BufferErrorReporter : public tflite::ErrorReporter {
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_EXCEPTION_JNI_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index 618fba480e..55ca47fed7 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_NATIVEINTERPRETERWRAPPER_JNI_H_
-#define TENSORFLOW_CONTRIB_LITE_JAVA_NATIVEINTERPRETERWRAPPER_JNI_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
+#define TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
 
 #include <jni.h>
 #include <stdio.h>
@@ -230,4 +230,4 @@ JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete(
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_NATIVEINTERPRETERWRAPPER_JNI_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
index 06e2546af8..c020f13d9c 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_TENSOR_JNI_H_
-#define TENSORFLOW_CONTRIB_LITE_JAVA_TENSOR_JNI_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
+#define TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
 
 #include <jni.h>
 #include "tensorflow/contrib/lite/context.h"
@@ -92,4 +92,4 @@ Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_TENSOR_JNI_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h b/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h
index 65f8341149..5e2a7ded1b 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_TENSORFLOW_LITE_JNI_H_
-#define TENSORFLOW_CONTRIB_LITE_JAVA_TENSORFLOW_LITE_JNI_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
+#define TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
 
 #include <jni.h>
 
@@ -33,4 +33,4 @@ Java_org_tensorflow_lite_TensorFlowLite_version(JNIEnv*, jclass);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_TENSORFLOW_LITE_JNI_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
index cebc944200..6d6417f895 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
@@ -26,9 +26,16 @@ public final class DataTypeTest {
 
   @Test
   public void testElemByteSize() {
-    assertThat(DataType.FLOAT32.elemByteSize()).isEqualTo(4);
-    assertThat(DataType.INT32.elemByteSize()).isEqualTo(4);
-    assertThat(DataType.UINT8.elemByteSize()).isEqualTo(1);
-    assertThat(DataType.INT64.elemByteSize()).isEqualTo(8);
+    assertThat(DataType.FLOAT32.byteSize()).isEqualTo(4);
+    assertThat(DataType.INT32.byteSize()).isEqualTo(4);
+    assertThat(DataType.UINT8.byteSize()).isEqualTo(1);
+    assertThat(DataType.INT64.byteSize()).isEqualTo(8);
+  }
+
+  @Test
+  public void testConversion() {
+    for (DataType dataType : DataType.values()) {
+      assertThat(DataType.fromC(dataType.c())).isEqualTo(dataType);
+    }
   }
 }
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index d66a73db94..9070b788b6 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -47,6 +47,10 @@ public final class InterpreterTest {
   public void testInterpreter() throws Exception {
     Interpreter interpreter = new Interpreter(MODEL_FILE);
     assertThat(interpreter).isNotNull();
+    assertThat(interpreter.getInputTensorCount()).isEqualTo(1);
+    assertThat(interpreter.getInputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getOutputTensorCount()).isEqualTo(1);
+    assertThat(interpreter.getOutputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
     interpreter.close();
   }
 
@@ -183,6 +187,19 @@ public final class InterpreterTest {
   }
 
   @Test
+  public void testResizeInput() {
+    try (Interpreter interpreter = new Interpreter(MODEL_FILE)) {
+      int[] inputDims = {1};
+      interpreter.resizeInput(0, inputDims);
+      assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(inputDims);
+      ByteBuffer input = ByteBuffer.allocateDirect(4).order(ByteOrder.nativeOrder());
+      ByteBuffer output = ByteBuffer.allocateDirect(4).order(ByteOrder.nativeOrder());
+      interpreter.run(input, output);
+      assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(inputDims);
+    }
+  }
+
+  @Test
   public void testMobilenetRun() {
     // Create a gray image.
     float[][][][] img = new float[1][224][224][3];
@@ -199,6 +216,8 @@ public final class InterpreterTest {
 
     Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
     interpreter.run(img, labels);
+    assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
+    assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});
     interpreter.close();
 
     assertThat(labels[0])
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
index 71ef044943..85ad393d89 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
@@ -64,6 +64,8 @@ public final class TensorTest {
     assertThat(tensor.shape()).isEqualTo(expectedShape);
     assertThat(tensor.dataType()).isEqualTo(DataType.FLOAT32);
     assertThat(tensor.numBytes()).isEqualTo(2 * 8 * 8 * 3 * 4);
+    assertThat(tensor.numElements()).isEqualTo(2 * 8 * 8 * 3);
+    assertThat(tensor.numDimensions()).isEqualTo(4);
   }
 
   @Test
@@ -201,12 +203,12 @@ public final class TensorTest {
   @Test
   public void testNumDimensions() {
     int scalar = 1;
-    assertThat(Tensor.numDimensions(scalar)).isEqualTo(0);
+    assertThat(Tensor.computeNumDimensions(scalar)).isEqualTo(0);
     int[][] array = {{2, 4}, {1, 9}};
-    assertThat(Tensor.numDimensions(array)).isEqualTo(2);
+    assertThat(Tensor.computeNumDimensions(array)).isEqualTo(2);
     try {
       int[] emptyArray = {};
-      Tensor.numDimensions(emptyArray);
+      Tensor.computeNumDimensions(emptyArray);
       fail();
     } catch (IllegalArgumentException e) {
       assertThat(e).hasMessageThat().contains("Array lengths cannot be 0.");
@@ -214,9 +216,21 @@ public final class TensorTest {
   }
 
   @Test
+  public void testNumElements() {
+    int[] scalarShape = {};
+    assertThat(Tensor.computeNumElements(scalarShape)).isEqualTo(1);
+    int[] vectorShape = {3};
+    assertThat(Tensor.computeNumElements(vectorShape)).isEqualTo(3);
+    int[] matrixShape = {3, 4};
+    assertThat(Tensor.computeNumElements(matrixShape)).isEqualTo(12);
+    int[] degenerateShape = {3, 4, 0};
+    assertThat(Tensor.computeNumElements(degenerateShape)).isEqualTo(0);
+  }
+
+  @Test
   public void testFillShape() {
     int[][][] array = {{{23}, {14}, {87}}, {{12}, {42}, {31}}};
-    int num = Tensor.numDimensions(array);
+    int num = Tensor.computeNumDimensions(array);
     int[] shape = new int[num];
     Tensor.fillShape(array, 0, shape);
     assertThat(num).isEqualTo(3);
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index c5586475ec..407d52f0e8 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -211,6 +211,7 @@ cc_library(
         "transpose_conv.cc",
         "unidirectional_sequence_lstm.cc",
         "unidirectional_sequence_rnn.cc",
+        "unpack.cc",
     ],
     hdrs = [
         "padding.h",
@@ -225,6 +226,7 @@ cc_library(
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite:util",
         "//tensorflow/contrib/lite/kernels:gemm_support",
         "//tensorflow/contrib/lite/kernels/internal:audio_utils",
         "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
@@ -1200,6 +1202,20 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "unpack_test",
+    size = "small",
+    srcs = ["unpack_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 817266a471..d6d62580e2 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -40,6 +40,11 @@ struct OpData {
   int diff_min = 0;
 };
 
+struct LogSoftmaxOpData : public OpData {
+  int32_t reverse_scaling_divisor = 0;
+  int32_t reverse_scaling_right_shift = 0;
+};
+
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
@@ -47,10 +52,19 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   return new OpData;
 }
 
+void* LogSoftmaxInit(TfLiteContext* context, const char* buffer,
+                     size_t length) {
+  return new LogSoftmaxOpData;
+}
+
 void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
 
+void LogSoftmaxFree(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<LogSoftmaxOpData*>(buffer);
+}
+
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -205,6 +219,34 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
                                TfLiteIntArrayCopy(input->dims));
 }
 
+TfLiteStatus LogSoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
+  LogSoftmaxOpData* data = reinterpret_cast<LogSoftmaxOpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  if (input->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255);
+    TF_LITE_ENSURE_EQ(context, output->params.scale, 16.0 / 256);
+
+    static const double kBeta = 1.0;
+    static const int kScaledDiffIntegerBits = 5;
+    tflite::PreprocessLogSoftmaxScalingExp(
+        kBeta, input->params.scale, kScaledDiffIntegerBits,
+        &data->input_multiplier, &data->input_left_shift,
+        &data->reverse_scaling_divisor, &data->reverse_scaling_right_shift);
+    data->reverse_scaling_right_shift *= -1;
+    data->diff_min = -1.0 * tflite::CalculateInputRadius(
+                                kScaledDiffIntegerBits, data->input_left_shift);
+  }
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
 TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -509,6 +551,8 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
+  const LogSoftmaxOpData* data =
+      reinterpret_cast<LogSoftmaxOpData*>(node->user_data);
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
@@ -517,6 +561,14 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
           GetTensorData<float>(input), GetTensorShape(input),
           GetTensorData<float>(output), GetTensorShape(output));
       return kTfLiteOk;
+    case kTfLiteUInt8:
+      optimized_ops::LogSoftmax(
+          GetTensorData<uint8_t>(input), GetTensorShape(input),
+          data->input_multiplier, data->input_left_shift,
+          data->reverse_scaling_divisor, data->reverse_scaling_right_shift,
+          data->diff_min, GetTensorData<uint8_t>(output),
+          GetTensorShape(output));
+      return kTfLiteOk;
     default:
       context->ReportError(context, "Only float32 supported currently., got %d",
                            input->type);
@@ -590,9 +642,9 @@ TfLiteRegistration* Register_SOFTMAX() {
 }
 
 TfLiteRegistration* Register_LOG_SOFTMAX() {
-  static TfLiteRegistration r = {activations::Init, activations::Free,
-                                 activations::GenericPrepare,
-                                 activations::LogSoftmaxEval};
+  static TfLiteRegistration r = {
+      activations::LogSoftmaxInit, activations::LogSoftmaxFree,
+      activations::LogSoftmaxPrepare, activations::LogSoftmaxEval};
   return &r;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
index 083cdf78d7..e577e3a762 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -471,6 +471,28 @@ TEST(FloatActivationsOpTest, LogSoftmax) {
                               })));
 }
 
+TEST(QuantizedActivationsOpTest, LogSoftmax) {
+  const float kLogSoftmaxQuantizedTolerance = 16 / 256.0;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOG_SOFTMAX,
+      /*input=*/{TensorType_UINT8, {2, 4}, -10, 10},
+      /*output=*/{TensorType_UINT8, {}, 0, 0, 16. / 256, 255});
+  m.SetInput<uint8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      -4.14297, -10.14297, -2.14297, -.142971,    //
+                      -7.00104, -12.00104, -.00104087, -9.00104,  //
+                  },
+                  kLogSoftmaxQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
+}
+
 class PReluOpModel : public SingleOpModel {
  public:
   PReluOpModel(const TensorData& input, const TensorData& alpha) {
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
index 91d8dd3fa7..1170d84553 100644
--- a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
index 8d460fdfc6..7346b9fd80 100644
--- a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
+++ b/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 04c0263b78..50fe5c2e04 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -334,18 +334,31 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
 
-  switch (kernel_type) {
+  KernelType effective_kernel_type;
+  if ((kernel_type == kMultithreadOptimized ||
+       kernel_type == kCblasOptimized) &&
+      (params->dilation_width_factor != 1 ||
+       params->dilation_height_factor != 1)) {
+    // kMultithreadOptimized and kCblasOptimized do not support dilation.
+    // Therefore, fallback to optimized.
+    effective_kernel_type = kGenericOptimized;
+  } else {
+    effective_kernel_type = kernel_type;
+  }
+
+  switch (effective_kernel_type) {
     case kReference:
       reference_ops::Conv(
           GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
           GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
           GetTensorData<int32_t>(bias), GetTensorDims(bias),
-          params->stride_width, params->stride_height, data->padding.width,
-          data->padding.height, output_offset, data->output_multiplier,
-          data->output_shift, data->output_activation_min,
-          data->output_activation_max, GetTensorData<uint8_t>(output),
-          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
-          GetTensorDims(im2col), gemm_context);
+          params->stride_width, params->stride_height,
+          params->dilation_width_factor, params->dilation_height_factor,
+          data->padding.width, data->padding.height, output_offset,
+          data->output_multiplier, data->output_shift,
+          data->output_activation_min, data->output_activation_max,
+          GetTensorData<uint8_t>(output), GetTensorDims(output),
+          GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
       break;
     case kGenericOptimized:
     case kMultithreadOptimized:
@@ -355,12 +368,13 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
           GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
           GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
           GetTensorData<int32_t>(bias), GetTensorDims(bias),
-          params->stride_width, params->stride_height, data->padding.width,
-          data->padding.height, output_offset, data->output_multiplier,
-          data->output_shift, data->output_activation_min,
-          data->output_activation_max, GetTensorData<uint8_t>(output),
-          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
-          GetTensorDims(im2col), gemm_context);
+          params->stride_width, params->stride_height,
+          params->dilation_width_factor, params->dilation_height_factor,
+          data->padding.width, data->padding.height, output_offset,
+          data->output_multiplier, data->output_shift,
+          data->output_activation_min, data->output_activation_max,
+          GetTensorData<uint8_t>(output), GetTensorDims(output),
+          GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
       break;
   }
 }
@@ -374,10 +388,10 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRange(params->activation, &output_activation_min,
                            &output_activation_max);
   KernelType effective_kernel_type;
-  if (((kernel_type == kMultithreadOptimized) ||
-       (kernel_type == kCblasOptimized)) &&
-      ((params->dilation_width_factor != 1) ||
-       (params->dilation_height_factor != 1))) {
+  if ((kernel_type == kMultithreadOptimized ||
+       kernel_type == kCblasOptimized) &&
+      (params->dilation_width_factor != 1 ||
+       params->dilation_height_factor != 1)) {
     // kMultithreadOptimized and kCblasOptimized do not support dilation.
     // Therefore, fallback to optimized.
     effective_kernel_type = kGenericOptimized;
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 24633c2fd7..98152043c9 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -370,6 +370,65 @@ TEST_P(ConvolutionOpTest, HandCalculatedValidFloat32) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({312, 357}));
 }
 
+TEST_P(ConvolutionOpTest, SimpleTestFloatWithDilation) {
+  const int depth = 1;
+  const int image_width = 9;
+  const int image_height = 9;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int stride_width = 1;
+  const int stride_height = 1;
+  const int dilation_width_factor = 3;
+  const int dilation_height_factor = 3;
+  const Padding padding = Padding_VALID;
+  ConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_FLOAT32,
+       {image_batch_count, image_height, image_width, depth}},
+      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+      {TensorType_FLOAT32, {}}, stride_width, stride_height, padding,
+      ActivationFunctionType_NONE, dilation_width_factor,
+      dilation_height_factor);
+
+  // The image matrix is:
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // clang-format off
+  m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0});
+  // clang-format on
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.Invoke();
+
+  // Since the dilation rate is 3 this will reduce the size of the output from
+  // 10x10 to 3x3 of all 5s. Specifically:
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5}));
+}
+
 class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
  public:
   using BaseConvolutionOpModel::BaseConvolutionOpModel;
@@ -500,6 +559,71 @@ TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
                              }));
 }
 
+TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithDilation) {
+  const int depth = 1;
+  const int image_width = 9;
+  const int image_height = 9;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int stride_width = 1;
+  const int stride_height = 1;
+  const int dilation_width_factor = 3;
+  const int dilation_height_factor = 3;
+  const Padding padding = Padding_VALID;
+  QuantizedConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {image_batch_count, image_height, image_width, depth},
+       0,
+       255},
+      {TensorType_UINT8,
+       {depth, filter_size, filter_size, filter_count},
+       0,
+       255},
+      {TensorType_UINT8, {}, 0, 255}, stride_width, stride_height, padding,
+      ActivationFunctionType_NONE, dilation_width_factor,
+      dilation_height_factor);
+
+  // The image matrix is:
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // clang-format off
+  m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0});
+  // clang-format on
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.Invoke();
+
+  // Since the dilation rate is 3 this will reduce the size of the output from
+  // 10x10 to 3x3 of all 5s. Specifically:
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5}));
+}
+
 INSTANTIATE_TEST_CASE_P(
     ConvolutionOpTest, ConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess.cc b/tensorflow/contrib/lite/kernels/detection_postprocess.cc
index d7bde0ff79..136697f945 100644
--- a/tensorflow/contrib/lite/kernels/detection_postprocess.cc
+++ b/tensorflow/contrib/lite/kernels/detection_postprocess.cc
@@ -15,7 +15,7 @@ limitations under the License.
 #include <string.h>
 #include <numeric>
 #include <vector>
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc b/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc
index 4e0f8484a3..94c91a6bd6 100644
--- a/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc
+++ b/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index a97db6c6b2..96798c900e 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -293,7 +293,6 @@ cc_library(
         ":round",
         ":strided_slice_logic",
         ":types",
-        "//third_party/eigen3",
         "@gemmlowp",
         "//tensorflow/contrib/lite:builtin_op_data",
     ] + select({
@@ -324,7 +323,6 @@ cc_library(
         ":round",
         ":strided_slice_logic",
         ":types",
-        "//third_party/eigen3",
         "@gemmlowp",
         "//tensorflow/contrib/lite:builtin_op_data",
     ] + select({
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
index 200f2f1515..88a0622286 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -127,6 +127,47 @@ void LstmStep(
     float* cell_state_ptr, float* input_gate_scratch,
     float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
     float* output_ptr_batch) {
+  LstmStepWithAuxInput(
+      input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr,
+      input_to_cell_weights_ptr, input_to_output_weights_ptr,
+      /*aux_input_ptr_batch=*/nullptr,
+      /*aux_input_to_input_weights_ptr=*/nullptr,
+      /*aux_input_to_forget_weights_ptr=*/nullptr,
+      /*aux_input_to_cell_weights_ptr=*/nullptr,
+      /*aux_input_to_output_weights_ptr=*/nullptr,
+      recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr,
+      recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr,
+      cell_to_input_weights_ptr, cell_to_forget_weights_ptr,
+      cell_to_output_weights_ptr, input_gate_bias_ptr, forget_gate_bias_ptr,
+      cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
+      projection_bias_ptr, params, n_batch, n_cell, n_input, n_output,
+      output_state_ptr, cell_state_ptr, input_gate_scratch, forget_gate_scratch,
+      cell_scratch, output_gate_scratch, output_ptr_batch);
+}
+
+void LstmStepWithAuxInput(
+    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
+    const float* input_to_forget_weights_ptr,
+    const float* input_to_cell_weights_ptr,
+    const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch,
+    const float* aux_input_to_input_weights_ptr,
+    const float* aux_input_to_forget_weights_ptr,
+    const float* aux_input_to_cell_weights_ptr,
+    const float* aux_input_to_output_weights_ptr,
+    const float* recurrent_to_input_weights_ptr,
+    const float* recurrent_to_forget_weights_ptr,
+    const float* recurrent_to_cell_weights_ptr,
+    const float* recurrent_to_output_weights_ptr,
+    const float* cell_to_input_weights_ptr,
+    const float* cell_to_forget_weights_ptr,
+    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
+    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
+    int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr,
+    float* cell_state_ptr, float* input_gate_scratch,
+    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
+    float* output_ptr_batch) {
   // Since we have already checked that weights are all there or none, we can
   // check the existense of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights_ptr == nullptr);
@@ -160,6 +201,25 @@ void LstmStep(
       input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
       output_gate_scratch, /*result_stride=*/1);
 
+  // If auxiliary input is available then compute aux_input_weight * aux_input
+  if (aux_input_ptr_batch != nullptr) {
+    if (!use_cifg) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          aux_input_to_input_weights_ptr, n_cell, n_input, aux_input_ptr_batch,
+          n_batch, input_gate_scratch, /*result_stride=*/1);
+    }
+
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_forget_weights_ptr, n_cell, n_input, aux_input_ptr_batch,
+        n_batch, forget_gate_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_cell_weights_ptr, n_cell, n_input, aux_input_ptr_batch,
+        n_batch, cell_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_output_weights_ptr, n_cell, n_input, aux_input_ptr_batch,
+        n_batch, output_gate_scratch, /*result_stride=*/1);
+  }
+
   // For each batch and cell: compute recurrent_weight * output_state.
   if (!use_cifg) {
     tensor_utils::MatrixBatchVectorMultiplyAccumulate(
@@ -286,227 +346,362 @@ void LstmStep(
     int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
     int8_t* quantized_cell_state_ptr, float* output_state_ptr,
     float* cell_state_ptr, float* output_ptr_batch) {
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-  // Initialize scratch buffers with bias.
-  if (!use_cifg) {
-    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
-                                          input_gate_scratch);
-  }
-  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
-                                        forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
-                                        cell_scratch);
-  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
-                                        output_gate_scratch);
-
-  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
-    // Save quantization and matmul computation for all zero input.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_input;
-      tensor_utils::SymmetricQuantizeFloats(
-          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
-          &unused_min, &unused_max, &scaling_factors[b]);
+  LstmStepWithAuxInput(
+      input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale,
+      input_to_forget_weights_ptr, input_to_forget_weights_scale,
+      input_to_cell_weights_ptr, input_to_cell_weights_scale,
+      input_to_output_weights_ptr, input_to_output_weights_scale,
+      /*aux_input_ptr_batch=*/nullptr,
+      /*aux_input_to_input_weights_ptr=*/nullptr,
+      /*aux_input_to_input_weights_scale=*/0.0f,
+      /*aux_input_to_forget_weights_ptr=*/nullptr,
+      /*aux_input_to_forget_weights_scale=*/0.0f,
+      /*aux_input_to_cell_weights_ptr=*/nullptr,
+      /*aux_input_to_cell_weights_scale=*/0.0f,
+      /*aux_input_to_output_weights_ptr=*/nullptr,
+      /*aux_input_to_output_weights_scale=*/0.0f,
+      recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
+      recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
+      recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
+      recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
+      cell_to_input_weights_ptr, cell_to_input_weights_scale,
+      cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
+      cell_to_output_weights_ptr, cell_to_output_weights_scale,
+      input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
+      output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale,
+      projection_bias_ptr, params, n_batch, n_cell, n_input, n_output,
+      input_gate_scratch, forget_gate_scratch, cell_scratch,
+      output_gate_scratch, scaling_factors, product_scaling_factors,
+      recovered_cell_weights, quantized_input_ptr_batch,
+      /*quantized_aux_input_ptr_batch=*/nullptr, quantized_output_state_ptr,
+      quantized_cell_state_ptr, output_state_ptr, cell_state_ptr,
+      output_ptr_batch);
     }
-    // For each batch and cell: compute input_weight * input.
-    if (!use_cifg) {
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * input_to_input_weights_scale;
-      }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          input_to_input_weights_ptr, n_cell, n_input,
-          quantized_input_ptr_batch, product_scaling_factors, n_batch,
-          input_gate_scratch, /*result_stride=*/1);
-    }
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_forget_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_forget_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, forget_gate_scratch,
-        /*result_stride=*/1);
 
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_cell_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, cell_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_output_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, output_gate_scratch,
-        /*result_stride=*/1);
-  }
-
-  if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
-    // Save quantization and matmul computation for all zero input.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_output;
-      tensor_utils::SymmetricQuantizeFloats(output_state_ptr + offset, n_output,
-                                            quantized_output_state_ptr + offset,
-                                            &unused_min, &unused_max,
-                                            &scaling_factors[b]);
-    }
-    // For each batch and cell: compute recurrent_weight * output_state.
-    if (!use_cifg) {
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * recurrent_to_input_weights_scale;
+    void LstmStepWithAuxInput(
+        const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
+        float input_to_input_weights_scale,
+        const int8_t* input_to_forget_weights_ptr,
+        float input_to_forget_weights_scale,
+        const int8_t* input_to_cell_weights_ptr,
+        float input_to_cell_weights_scale,
+        const int8_t* input_to_output_weights_ptr,
+        float input_to_output_weights_scale, const float* aux_input_ptr_batch,
+        const int8_t* aux_input_to_input_weights_ptr,
+        float aux_input_to_input_weights_scale,
+        const int8_t* aux_input_to_forget_weights_ptr,
+        float aux_input_to_forget_weights_scale,
+        const int8_t* aux_input_to_cell_weights_ptr,
+        float aux_input_to_cell_weights_scale,
+        const int8_t* aux_input_to_output_weights_ptr,
+        float aux_input_to_output_weights_scale,
+        const int8_t* recurrent_to_input_weights_ptr,
+        float recurrent_to_input_weights_scale,
+        const int8_t* recurrent_to_forget_weights_ptr,
+        float recurrent_to_forget_weights_scale,
+        const int8_t* recurrent_to_cell_weights_ptr,
+        float recurrent_to_cell_weights_scale,
+        const int8_t* recurrent_to_output_weights_ptr,
+        float recurrent_to_output_weights_scale,
+        const int8_t* cell_to_input_weights_ptr,
+        float cell_to_input_weights_scale,
+        const int8_t* cell_to_forget_weights_ptr,
+        float cell_to_forget_weights_scale,
+        const int8_t* cell_to_output_weights_ptr,
+        float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
+        const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+        const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
+        float projection_weights_scale, const float* projection_bias_ptr,
+        const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
+        int n_output, float* input_gate_scratch, float* forget_gate_scratch,
+        float* cell_scratch, float* output_gate_scratch, float* scaling_factors,
+        float* product_scaling_factors, float* recovered_cell_weights,
+        int8_t* quantized_input_ptr_batch,
+        int8_t* quantized_aux_input_ptr_batch,
+        int8_t* quantized_output_state_ptr, int8_t* quantized_cell_state_ptr,
+        float* output_state_ptr, float* cell_state_ptr,
+        float* output_ptr_batch) {
+      // Since we have already checked that weights are all there or none, we
+      // can check the existense of only one to the get the condition.
+      const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+      const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+      // Initialize scratch buffers with bias.
+      if (!use_cifg) {
+        tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell,
+                                              n_batch, input_gate_scratch);
+      }
+      tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell,
+                                            n_batch, forget_gate_scratch);
+      tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                            cell_scratch);
+      tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell,
+                                            n_batch, output_gate_scratch);
+
+      if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
+        // Save quantization and matmul computation for all zero input.
+        float unused_min, unused_max;
+        for (int b = 0; b < n_batch; ++b) {
+          const int offset = b * n_input;
+          tensor_utils::SymmetricQuantizeFloats(
+              input_ptr_batch + offset, n_input,
+              quantized_input_ptr_batch + offset, &unused_min, &unused_max,
+              &scaling_factors[b]);
+        }
+        // For each batch and cell: compute input_weight * input.
+        if (!use_cifg) {
+          for (int b = 0; b < n_batch; ++b) {
+            product_scaling_factors[b] =
+                scaling_factors[b] * input_to_input_weights_scale;
+          }
+          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+              input_to_input_weights_ptr, n_cell, n_input,
+              quantized_input_ptr_batch, product_scaling_factors, n_batch,
+              input_gate_scratch, /*result_stride=*/1);
+        }
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * input_to_forget_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            input_to_forget_weights_ptr, n_cell, n_input,
+            quantized_input_ptr_batch, product_scaling_factors, n_batch,
+            forget_gate_scratch,
+            /*result_stride=*/1);
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * input_to_cell_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            input_to_cell_weights_ptr, n_cell, n_input,
+            quantized_input_ptr_batch, product_scaling_factors, n_batch,
+            cell_scratch, /*result_stride=*/1);
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * input_to_output_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            input_to_output_weights_ptr, n_cell, n_input,
+            quantized_input_ptr_batch, product_scaling_factors, n_batch,
+            output_gate_scratch,
+            /*result_stride=*/1);
       }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          recurrent_to_input_weights_ptr, n_cell, n_output,
-          quantized_output_state_ptr, product_scaling_factors, n_batch,
-          input_gate_scratch, /*result_stride=*/1);
-    }
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_forget_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_forget_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        forget_gate_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_cell_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_cell_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        cell_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_output_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_output_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        output_gate_scratch, /*result_stride=*/1);
-  }
-
-  // Save quantization and matmul computation for all zero input.
-  bool is_cell_state_all_zeros =
-      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
 
-  // For each batch and cell: update input gate.
-  if (!use_cifg) {
-    if (use_peephole && !is_cell_state_all_zeros) {
-      tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell,
-                                         cell_to_input_weights_scale,
-                                         recovered_cell_weights);
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-          input_gate_scratch);
-    }
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
-  }
+      if (aux_input_ptr_batch != nullptr &&
+          !tensor_utils::IsZeroVector(aux_input_ptr_batch, n_batch * n_input)) {
+        // Save quantization and matmul computation for all zero input.
+        float unused_min, unused_max;
+        for (int b = 0; b < n_batch; ++b) {
+          const int offset = b * n_input;
+          tensor_utils::SymmetricQuantizeFloats(
+              aux_input_ptr_batch + offset, n_input,
+              quantized_aux_input_ptr_batch + offset, &unused_min, &unused_max,
+              &scaling_factors[b]);
+        }
+        // For each batch and cell: compute input_weight * input.
+        if (!use_cifg) {
+          for (int b = 0; b < n_batch; ++b) {
+            product_scaling_factors[b] =
+                scaling_factors[b] * aux_input_to_input_weights_scale;
+          }
+          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+              aux_input_to_input_weights_ptr, n_cell, n_input,
+              quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+              input_gate_scratch, /*result_stride=*/1);
+        }
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * aux_input_to_forget_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            aux_input_to_forget_weights_ptr, n_cell, n_input,
+            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+            forget_gate_scratch, /*result_stride=*/1);
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * aux_input_to_cell_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            aux_input_to_cell_weights_ptr, n_cell, n_input,
+            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+            cell_scratch, /*result_stride=*/1);
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * aux_input_to_output_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            aux_input_to_output_weights_ptr, n_cell, n_input,
+            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+            output_gate_scratch, /*result_stride=*/1);
+      }
 
-  // For each batch and cell: update forget gate.
-  if (use_peephole && !is_cell_state_all_zeros) {
-    tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell,
-                                       cell_to_forget_weights_scale,
-                                       recovered_cell_weights);
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-        forget_gate_scratch);
-  }
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
+      if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
+        // Save quantization and matmul computation for all zero input.
+        float unused_min, unused_max;
+        for (int b = 0; b < n_batch; ++b) {
+          const int offset = b * n_output;
+          tensor_utils::SymmetricQuantizeFloats(
+              output_state_ptr + offset, n_output,
+              quantized_output_state_ptr + offset, &unused_min, &unused_max,
+              &scaling_factors[b]);
+        }
+        // For each batch and cell: compute recurrent_weight * output_state.
+        if (!use_cifg) {
+          for (int b = 0; b < n_batch; ++b) {
+            product_scaling_factors[b] =
+                scaling_factors[b] * recurrent_to_input_weights_scale;
+          }
+          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+              recurrent_to_input_weights_ptr, n_cell, n_output,
+              quantized_output_state_ptr, product_scaling_factors, n_batch,
+              input_gate_scratch, /*result_stride=*/1);
+        }
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * recurrent_to_forget_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            recurrent_to_forget_weights_ptr, n_cell, n_output,
+            quantized_output_state_ptr, product_scaling_factors, n_batch,
+            forget_gate_scratch, /*result_stride=*/1);
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * recurrent_to_cell_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            recurrent_to_cell_weights_ptr, n_cell, n_output,
+            quantized_output_state_ptr, product_scaling_factors, n_batch,
+            cell_scratch, /*result_stride=*/1);
+
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * recurrent_to_output_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            recurrent_to_output_weights_ptr, n_cell, n_output,
+            quantized_output_state_ptr, product_scaling_factors, n_batch,
+            output_gate_scratch, /*result_stride=*/1);
+      }
 
-  // For each batch and cell: update the cell.
-  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
-                                         n_batch * n_cell, cell_state_ptr);
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        params->activation, cell_scratch);
-  if (use_cifg) {
-    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                             forget_gate_scratch);
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  } else {
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  }
-  if (params->cell_clip > 0.0) {
-    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
-                             params->cell_clip, cell_state_ptr);
-  }
+      // Save quantization and matmul computation for all zero input.
+      bool is_cell_state_all_zeros =
+          tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
+
+      // For each batch and cell: update input gate.
+      if (!use_cifg) {
+        if (use_peephole && !is_cell_state_all_zeros) {
+          tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell,
+                                             cell_to_input_weights_scale,
+                                             recovered_cell_weights);
+          tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+              recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+              input_gate_scratch);
+        }
+        tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                           input_gate_scratch);
+      }
 
-  is_cell_state_all_zeros =
-      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-  // For each batch and cell: update the output gate.
-  if (use_peephole && !is_cell_state_all_zeros) {
-    tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell,
-                                       cell_to_output_weights_scale,
-                                       recovered_cell_weights);
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-        output_gate_scratch);
-  }
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        params->activation, cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
+      // For each batch and cell: update forget gate.
+      if (use_peephole && !is_cell_state_all_zeros) {
+        tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell,
+                                           cell_to_forget_weights_scale,
+                                           recovered_cell_weights);
+        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+            recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+            forget_gate_scratch);
+      }
+      tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                         forget_gate_scratch);
+
+      // For each batch and cell: update the cell.
+      tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
+                                             cell_state_ptr, n_batch * n_cell,
+                                             cell_state_ptr);
+      tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                            params->activation, cell_scratch);
+      if (use_cifg) {
+        tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                                 forget_gate_scratch);
+        tensor_utils::VectorVectorCwiseProductAccumulate(
+            cell_scratch, forget_gate_scratch, n_batch * n_cell,
+            cell_state_ptr);
+      } else {
+        tensor_utils::VectorVectorCwiseProductAccumulate(
+            cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
+      }
+      if (params->cell_clip > 0.0) {
+        tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
+                                 params->cell_clip, cell_state_ptr);
+      }
 
-  // For each batch: update the projection and output_state.
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_ptr_batch);
-    } else {
-      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-    }
-    if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) {
-      // Save quantization and matmul computation for all zero input.
-      float unused_min, unused_max;
-      for (int b = 0; b < n_batch; ++b) {
-        const int offset = b * n_cell;
-        tensor_utils::SymmetricQuantizeFloats(
-            output_gate_scratch + offset, n_cell,
-            quantized_cell_state_ptr + offset, &unused_min, &unused_max,
-            &scaling_factors[b]);
+      is_cell_state_all_zeros =
+          tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
+      // For each batch and cell: update the output gate.
+      if (use_peephole && !is_cell_state_all_zeros) {
+        tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell,
+                                           cell_to_output_weights_scale,
+                                           recovered_cell_weights);
+        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+            recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+            output_gate_scratch);
       }
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * projection_weights_scale;
+      tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                         output_gate_scratch);
+      tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
+                                            params->activation, cell_scratch);
+      tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                             n_batch * n_cell,
+                                             output_gate_scratch);
+
+      // For each batch: update the projection and output_state.
+      const bool use_projection_weight = (projection_weights_ptr != nullptr);
+      const bool use_projection_bias = (projection_bias_ptr != nullptr);
+      if (use_projection_weight) {
+        if (use_projection_bias) {
+          tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
+                                                n_batch, output_ptr_batch);
+        } else {
+          tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
+        }
+        if (!tensor_utils::IsZeroVector(output_gate_scratch,
+                                        n_batch * n_cell)) {
+          // Save quantization and matmul computation for all zero input.
+          float unused_min, unused_max;
+          for (int b = 0; b < n_batch; ++b) {
+            const int offset = b * n_cell;
+            tensor_utils::SymmetricQuantizeFloats(
+                output_gate_scratch + offset, n_cell,
+                quantized_cell_state_ptr + offset, &unused_min, &unused_max,
+                &scaling_factors[b]);
+          }
+          for (int b = 0; b < n_batch; ++b) {
+            product_scaling_factors[b] =
+                scaling_factors[b] * projection_weights_scale;
+          }
+          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+              projection_weights_ptr, n_output, n_cell,
+              quantized_cell_state_ptr, product_scaling_factors, n_batch,
+              output_ptr_batch,
+              /*result_stride=*/1);
+        }
+        if (params->proj_clip > 0.0) {
+          tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
+                                   params->proj_clip, output_ptr_batch);
+        }
+      } else {
+        tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                                 output_ptr_batch);
       }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr,
-          product_scaling_factors, n_batch, output_ptr_batch,
-          /*result_stride=*/1);
-    }
-    if (params->proj_clip > 0.0) {
-      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
-                               params->proj_clip, output_ptr_batch);
+      tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
+                               output_state_ptr);
     }
-  } else {
-    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                             output_ptr_batch);
-  }
-  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                           output_state_ptr);
-}
 
 }  // namespace kernel_utils
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
index 2a11b37a60..1824126828 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
@@ -92,6 +92,31 @@ void LstmStep(
     float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
     float* output_ptr_batch);
 
+// Same as above but includes an auxiliary input with the corresponding weights.
+void LstmStepWithAuxInput(
+    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
+    const float* input_to_forget_weights_ptr,
+    const float* input_to_cell_weights_ptr,
+    const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch,
+    const float* aux_input_to_input_weights_ptr,
+    const float* aux_input_to_forget_weights_ptr,
+    const float* aux_input_to_cell_weights_ptr,
+    const float* aux_input_to_output_weights_ptr,
+    const float* recurrent_to_input_weights_ptr,
+    const float* recurrent_to_forget_weights_ptr,
+    const float* recurrent_to_cell_weights_ptr,
+    const float* recurrent_to_output_weights_ptr,
+    const float* cell_to_input_weights_ptr,
+    const float* cell_to_forget_weights_ptr,
+    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
+    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
+    int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr,
+    float* cell_state_ptr, float* input_gate_scratch,
+    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
+    float* output_ptr_batch);
+
 // Same as above but with quantized weight matrices. In detail:
 // Input of size 'n_batch * n_input':
 //   input_ptr_batch
@@ -175,6 +200,46 @@ void LstmStep(
     int8_t* quantized_cell_state_ptr, float* output_state_ptr,
     float* cell_state_ptr, float* output_ptr_batch);
 
+void LstmStepWithAuxInput(
+    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
+    float input_to_input_weights_scale,
+    const int8_t* input_to_forget_weights_ptr,
+    float input_to_forget_weights_scale,
+    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
+    const int8_t* input_to_output_weights_ptr,
+    float input_to_output_weights_scale, const float* aux_input_ptr_batch,
+    const int8_t* aux_input_to_input_weights_ptr,
+    float aux_input_to_input_weights_scale,
+    const int8_t* aux_input_to_forget_weights_ptr,
+    float aux_input_to_forget_weights_scale,
+    const int8_t* aux_input_to_cell_weights_ptr,
+    float aux_input_to_cell_weights_scale,
+    const int8_t* aux_input_to_output_weights_ptr,
+    float aux_input_to_output_weights_scale,
+    const int8_t* recurrent_to_input_weights_ptr,
+    float recurrent_to_input_weights_scale,
+    const int8_t* recurrent_to_forget_weights_ptr,
+    float recurrent_to_forget_weights_scale,
+    const int8_t* recurrent_to_cell_weights_ptr,
+    float recurrent_to_cell_weights_scale,
+    const int8_t* recurrent_to_output_weights_ptr,
+    float recurrent_to_output_weights_scale,
+    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
+    const int8_t* cell_to_forget_weights_ptr,
+    float cell_to_forget_weights_scale,
+    const int8_t* cell_to_output_weights_ptr,
+    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
+    float projection_weights_scale, const float* projection_bias_ptr,
+    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
+    int n_output, float* input_gate_scratch, float* forget_gate_scratch,
+    float* cell_scratch, float* output_gate_scratch, float* scaling_factors,
+    float* product_scaling_factors, float* recovered_cell_weights,
+    int8_t* quantized_input_ptr_batch, int8_t* quantized_aux_input_ptr_batch,
+    int8_t* quantized_output_state_ptr, int8_t* quantized_cell_state_ptr,
+    float* output_state_ptr, float* cell_state_ptr, float* output_ptr_batch);
+
 }  // namespace kernel_utils
 }  // namespace tflite
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
index 3a53d3ab07..934308ef29 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
 
 namespace tflite {
 
@@ -58,4 +58,4 @@ inline bool TestCPUFeatureNeon() { return false; }
                        : Portable##funcname(__VA_ARGS__)
 #endif
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
index 250872c422..6443f425b7 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
@@ -140,4 +140,4 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
 
 #include "Eigen/src/Core/util/ReenableStupidWarnings.h"
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_H
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
index d5503073a7..df4d871466 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
@@ -30,11 +30,6 @@ namespace optimized_ops {
 using reference_ops::Relu1;
 using reference_ops::Relu6;
 
-inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
-  return RuntimeShape(
-      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
-}
-
 template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
@@ -51,8 +46,8 @@ inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
 
 inline void Relu(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  Relu(input_data, DimsToShape(input_dims), output_data,
-       DimsToShape(output_dims));
+  Relu(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+       output_data);
 }
 
 // legacy, for compatibility with old checked-in code
@@ -294,6 +289,37 @@ void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
       output_data);
 }
 
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul4DSlow(
+      input1_data, input1_dims, input1_offset, input2_data, input2_dims,
+      input2_offset, output_offset, output_multiplier,
+      // This legacy version switches the sign of the output shift.
+      kReverseShift * output_shift,
+      // (Break to highlight preceding line.)
+      output_activation_min, output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
 inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                         int stride_width, int stride_height, int pad_width,
                         int pad_height, int kwidth, int kheight,
@@ -554,8 +580,8 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
 
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
-  Logistic(input_data, DimsToShape(input_dims), output_data,
-           DimsToShape(output_dims));
+  Logistic(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+           output_data);
 }
 
 inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
@@ -575,8 +601,8 @@ inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
 
 inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  Tanh(input_data, DimsToShape(input_dims), output_data,
-       DimsToShape(output_dims));
+  Tanh(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+       output_data);
 }
 
 inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
index 4a3545d47a..921aae1303 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
 
 #include <assert.h>
 #include <stdint.h>
@@ -164,4 +164,4 @@ inline void Conv(const Eigen::ThreadPoolDevice& device, const float* input_data,
 }  // namespace multithreaded_ops
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index b870789772..7319636bf5 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
 
 #include <assert.h>
 #include <stdint.h>
@@ -47,6 +47,7 @@ using reference_ops::BroadcastGreater;
 using reference_ops::BroadcastGreaterEqual;
 using reference_ops::BroadcastLess;
 using reference_ops::BroadcastLessEqual;
+using reference_ops::BroadcastMul4DSlow;
 using reference_ops::BroadcastSub4DSlow;
 using reference_ops::Concatenation;
 using reference_ops::DepthConcatenation;
@@ -75,6 +76,11 @@ using reference_ops::Transpose;
 // Used mainly to convert from old-style shifts (right) to new-style (left).
 static constexpr int kReverseShift = -1;
 
+inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
+  return RuntimeShape(
+      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
 // Make a local VectorMap typedef allowing to map a float array
 // as a Eigen vector expression. The std::conditional here is to
 // construct the suitable Eigen type for the constness of the
@@ -313,6 +319,7 @@ inline void AddBiasAndEvalActivationFunction(const float* bias_data,
 #endif
 }
 
+// Note: This to be converted to RuntimeShapes along with Conv.
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 void AddBiasAndEvalActivationFunction(const float* bias_data,
@@ -1978,12 +1985,12 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_offset, const uint8* filter_data,
                  const Dims<4>& filter_dims, int32 filter_offset,
                  const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 int32 output_offset, int32 output_multiplier, int output_shift,
+                 int32 output_activation_min, int32 output_activation_max,
+                 uint8* output_data, const Dims<4>& output_dims,
+                 uint8* im2col_data, const Dims<4>& im2col_dims,
                  gemmlowp::GemmContext* gemm_context) {
   gemmlowp::ScopedProfilingLabel label("Conv/8bit");
 
@@ -1995,9 +2002,22 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
   const Dims<4>* gemm_input_dims = nullptr;
   const int filter_width = ArraySize(filter_dims, 1);
   const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_dilated_im2col =
+      dilation_width_factor != 1 || dilation_height_factor != 1;
   const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                            filter_width != 1 || filter_height != 1;
-  if (need_im2col) {
+  if (need_dilated_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    const int input_zero_point = -input_offset;
+    TFLITE_DCHECK_GE(input_zero_point, 0);
+    TFLITE_DCHECK_LE(input_zero_point, 255);
+    DilatedIm2col(input_data, input_dims, filter_dims, stride_width,
+                  stride_height, dilation_width_factor, dilation_height_factor,
+                  pad_width, pad_height, output_dims, input_zero_point,
+                  im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else if (need_im2col) {
     TFLITE_DCHECK(im2col_data);
     const int input_zero_point = -input_offset;
     TFLITE_DCHECK_GE(input_zero_point, 0);
@@ -2053,6 +2073,24 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
       input_offset, output_pipeline);
 }
 
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
@@ -2105,38 +2143,6 @@ void Conv(const uint8* input_data, const Dims<4>& input_dims,
        im2col_data, im2col_dims, gemm_context);
 }
 
-template <typename T>
-inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
-                         int block_size, T* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("DepthToSpace");
-
-  const int input_depth = ArraySize(input_dims, 0);
-  const int input_width = ArraySize(input_dims, 1);
-  const int input_height = ArraySize(input_dims, 2);
-
-  const int output_depth = ArraySize(output_dims, 0);
-  const int batch_size = ArraySize(output_dims, 3);
-
-  // Number of continuous values that we can copy in one interation.
-  const int stride = block_size * output_depth;
-
-  for (int batch = 0; batch < batch_size; ++batch) {
-    for (int in_h = 0; in_h < input_height; ++in_h) {
-      const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch);
-      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
-        const T* src = input_ptr;
-        for (int in_w = 0; in_w < input_width; ++in_w) {
-          memcpy(output_data, src, stride * sizeof(T));
-          output_data += stride;
-          src += input_depth;
-        }
-        input_ptr += stride;
-      }
-    }
-  }
-}
-
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac, typename T>
 void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
@@ -2212,25 +2218,87 @@ void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
+                         const RuntimeShape& unextended_input_shape,
+                         const T* input_data,
+                         const RuntimeShape& unextended_output_shape,
+                         T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("DepthToSpace");
+
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  const int input_depth = input_shape.Dims(3);
+  const int input_width = input_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+
+  const int output_depth = output_shape.Dims(3);
+  const int batch_size = output_shape.Dims(0);
+
+  // Number of continuous values that we can copy in one interation.
+  const int stride = op_params.block_size * output_depth;
+
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int in_h = 0; in_h < input_height; ++in_h) {
+      const T* input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0);
+      for (int offset_h = 0; offset_h < op_params.block_size; ++offset_h) {
+        const T* src = input_ptr;
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          memcpy(output_data, src, stride * sizeof(T));
+          output_data += stride;
+          src += input_depth;
+        }
+        input_ptr += stride;
+      }
+    }
+  }
+}
+
+// Legacy Dims<4>.
+template <typename T>
+inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
                          int block_size, T* output_data,
                          const Dims<4>& output_dims) {
+  tflite::DepthToSpaceParams op_params;
+  op_params.block_size = block_size;
+
+  DepthToSpace(op_params, DimsToShape(input_dims), input_data,
+               DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
+                         const RuntimeShape& unextended_input_shape,
+                         const T* input_data,
+                         const RuntimeShape& unextended_output_shape,
+                         T* output_data) {
   gemmlowp::ScopedProfilingLabel label("SpaceToDepth");
 
-  const int output_depth = ArraySize(output_dims, 0);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_height = ArraySize(output_dims, 2);
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
 
-  const int input_depth = ArraySize(input_dims, 0);
-  const int batch_size = ArraySize(input_dims, 3);
+  const int output_depth = output_shape.Dims(3);
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+
+  const int input_depth = input_shape.Dims(3);
+  const int batch_size = input_shape.Dims(0);
 
   // Number of continuous values that we can copy in one interation.
-  const int stride = block_size * input_depth;
+  const int stride = op_params.block_size * input_depth;
 
   for (int batch = 0; batch < batch_size; ++batch) {
     for (int out_h = 0; out_h < output_height; ++out_h) {
-      T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch);
-      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
+      T* output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0);
+      for (int offset_h = 0; offset_h < op_params.block_size; ++offset_h) {
         T* dst = output_ptr;
         for (int out_w = 0; out_w < output_width; ++out_w) {
           memcpy(dst, input_data, stride * sizeof(T));
@@ -2243,6 +2311,18 @@ inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Legacy Dims<4>.
+template <typename T>
+inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+                         int block_size, T* output_data,
+                         const Dims<4>& output_dims) {
+  tflite::SpaceToDepthParams op_params;
+  op_params.block_size = block_size;
+
+  SpaceToDepth(op_params, DimsToShape(input_dims), input_data,
+               DimsToShape(output_dims), output_data);
+}
+
 template <FusedActivationFunctionType Ac>
 void NonGlobalBatchNormalization(
     const float* input_data, const Dims<4>& input_dims, const float* mean_data,
@@ -2290,8 +2370,8 @@ void GlobalBatchNormalization(const float* input_data,
   }
 }
 
-inline void Relu(const float* input_data, const RuntimeShape& input_shape,
-                 float* output_data, const RuntimeShape& output_shape) {
+inline void Relu(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Relu (not fused)");
 
   const auto input = MapAsVector(input_data, input_shape);
@@ -2904,68 +2984,225 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                output_dims);
 }
 
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
+// Element-wise mul that can often be used for inner loop of broadcast Mul as
+// well as the non-broadcast Mul.
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const uint8* input1_data, const uint8* input2_data,
+                           uint8* output_data) {
+  int i = 0;
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+  TFLITE_DCHECK_GT(params.output_offset, -256);
+  TFLITE_DCHECK_LT(params.output_offset, 256);
+#ifdef USE_NEON
+  const auto input1_offset_vector = vdupq_n_s16(params.input1_offset);
+  const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
+  const auto output_offset_vector = vdupq_n_s16(params.output_offset);
+  const auto output_activation_min_vector =
+      vdup_n_u8(params.quantized_activation_min);
+  const auto output_activation_max_vector =
+      vdup_n_u8(params.quantized_activation_max);
+  for (; i <= size - 8; i += 8) {
+    // We load / store 8 at a time, multiplying as two sets of 4 int32s.
+    const auto input1_val_original = vld1_u8(input1_data + i);
+    const auto input2_val_original = vld1_u8(input2_data + i);
+    const auto input1_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+    const auto input2_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+    const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector);
+    const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);
 
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+    const auto input1_val_low = vget_low_s16(input1_val);
+    const auto input1_val_high = vget_high_s16(input1_val);
+    const auto input2_val_low = vget_low_s16(input2_val);
+    const auto input2_val_high = vget_high_s16(input2_val);
 
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 unclamped_result =
-              output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                                  input1_val * input2_val, output_multiplier,
-                                  kReverseShift * output_shift);
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, unclamped_result));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
+    auto p1 = vmull_s16(input2_val_low, input1_val_low);
+    auto p2 = vmull_s16(input2_val_high, input1_val_high);
+
+    p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+    p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+    using gemmlowp::RoundingDivideByPOT;
+    p1 = RoundingDivideByPOT(p1, -params.output_shift);
+    p2 = RoundingDivideByPOT(p2, -params.output_shift);
+
+    const auto p1_narrowed = vmovn_s32(p1);
+    const auto p2_narrowed = vmovn_s32(p2);
+    const auto p =
+        vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
+    const auto clamped =
+        vmax_u8(output_activation_min_vector,
+                vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
+    vst1_u8(output_data + i, clamped);
+  }
+#endif  // NEON
+
+  for (; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+                                                       params.output_multiplier,
+                                                       params.output_shift);
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<uint8>(clamped_output);
+  }
+}
+
+// Broadcast mul that can often be used for inner loop of broadcast Mul.
+inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
+                               const uint8 broadcast_value,
+                               const uint8* input2_data, uint8* output_data) {
+  const int16 input1_val = params.input1_offset + broadcast_value;
+
+  int i = 0;
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+  TFLITE_DCHECK_GT(params.output_offset, -256);
+  TFLITE_DCHECK_LT(params.output_offset, 256);
+#ifdef USE_NEON
+  const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
+  const auto output_offset_vector = vdupq_n_s16(params.output_offset);
+  const auto output_activation_min_vector =
+      vdup_n_u8(params.quantized_activation_min);
+  const auto output_activation_max_vector =
+      vdup_n_u8(params.quantized_activation_max);
+  for (; i <= size - 8; i += 8) {
+    // We load / store 8 at a time, multiplying as two sets of 4 int32s.
+    const auto input2_val_original = vld1_u8(input2_data + i);
+    const auto input2_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+    const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);
+
+    const auto input2_val_low = vget_low_s16(input2_val);
+    const auto input2_val_high = vget_high_s16(input2_val);
+
+    auto p1 = vmull_n_s16(input2_val_low, input1_val);
+    auto p2 = vmull_n_s16(input2_val_high, input1_val);
+
+    p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+    p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+    using gemmlowp::RoundingDivideByPOT;
+    p1 = RoundingDivideByPOT(p1, -params.output_shift);
+    p2 = RoundingDivideByPOT(p2, -params.output_shift);
+
+    const auto p1_narrowed = vmovn_s32(p1);
+    const auto p2_narrowed = vmovn_s32(p2);
+    const auto p =
+        vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
+    const auto clamped =
+        vmax_u8(output_activation_min_vector,
+                vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
+    vst1_u8(output_data + i, clamped);
+  }
+#endif  // NEON
+
+  for (; i < size; ++i) {
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+                                                       params.output_multiplier,
+                                                       params.output_shift);
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<uint8>(clamped_output);
+  }
+}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8* input1_data,
+                const RuntimeShape& input2_shape, const uint8* input2_data,
+                const RuntimeShape& output_shape, uint8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMulFivefold/8bit");
+
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const uint8* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+  // sections of the arrays.
+  uint8* output_data_ptr = output_data;
+  const uint8* input1_data_ptr = input1_data;
+  const uint8* input2_data_reset = input2_data;
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  if (y4 > 1) {
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          for (int i3 = 0; i3 < y3; ++i3) {
+            MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                           output_data_ptr);
+            input2_data_ptr += y4;
+            output_data_ptr += y4;
+          }
+          input1_data_ptr += y4;
         }
       }
+      input2_data_reset = input2_data_ptr;
+    }
+  } else {
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          MulSimpleBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
+                             output_data_ptr);
+          input2_data_ptr += y3;
+          output_data_ptr += y3;
+          ++input1_data_ptr;
+        }
+      }
+      input2_data_reset = input2_data_ptr;
     }
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
-               input2_dims, input2_offset, output_offset, output_multiplier,
-               output_shift, output_activation_min, output_activation_max,
-               output_data, output_dims);
-}
-
 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
@@ -4350,8 +4587,8 @@ inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void Logistic(const float* input_data, const RuntimeShape& input_shape,
-                     float* output_data, const RuntimeShape& output_shape) {
+inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
+                     const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Logistic");
   auto input_map = MapAsVector(input_data, input_shape);
   auto output_map = MapAsVector(output_data, output_shape);
@@ -4496,8 +4733,8 @@ inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void Logistic(const int16* input_data, const RuntimeShape& input_shape,
-                     int16* output_data, const RuntimeShape& output_shape) {
+inline void Logistic(const RuntimeShape& input_shape, const int16* input_data,
+                     const RuntimeShape& output_shape, int16* output_data) {
   gemmlowp::ScopedProfilingLabel label("Logistic/Int16");
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
 
@@ -4556,8 +4793,14 @@ inline void Logistic(const int16* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void Tanh(const float* input_data, const RuntimeShape& input_shape,
-                 float* output_data, const RuntimeShape& output_shape) {
+// Legacy version.
+inline void Logistic(const int16* input_data, const RuntimeShape& input_shape,
+                     int16* output_data, const RuntimeShape& output_shape) {
+  Logistic(input_shape, input_data, output_shape, output_data);
+}
+
+inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Tanh");
   auto input_map = MapAsVector(input_data, input_shape);
   auto output_map = MapAsVector(output_data, output_shape);
@@ -4820,14 +5063,21 @@ inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
   output_map.array() = input_map.array().template cast<DstT>();
 }
 
-inline void Floor(const float* input_data, const Dims<4>& input_dims,
-                  float* output_data, const Dims<4>& output_dims) {
+inline void Floor(const RuntimeShape& input_shape, const float* input_data,
+                  const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Floor");
-  auto input_map = MapAsVector(input_data, input_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
+  auto input_map = MapAsVector(input_data, input_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
   output_map.array() = Eigen::floor(input_map.array());
 }
 
+// Legacy Dims<4> version.
+inline void Floor(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  Floor(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+        output_data);
+}
+
 #ifdef USE_NEON
 inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
                                  float scale, float* output_ptr) {
@@ -4927,12 +5177,14 @@ inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
 
 inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
                                     int32 x, int32 y, int32 depth, int32 batch,
+                                    const RuntimeShape& input_shape,
                                     const float* input_data,
-                                    const Dims<4>& input_dims,
-                                    float* output_data,
-                                    const Dims<4>& output_dims) {
-  const int32 input_width = ArraySize(input_dims, 1);
-  const int32 output_width = ArraySize(output_dims, 1);
+                                    const RuntimeShape& output_shape,
+                                    float* output_data) {
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int32 input_width = input_shape.Dims(2);
+  const int32 output_width = output_shape.Dims(2);
 
   const int32 input_x_offset = (x1 - x0) * depth;
   const int32 input_y_offset = (y1 - y0) * depth * input_width;
@@ -4940,7 +5192,6 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
   const int32 output_y_offset = depth * output_width;
 
 #ifdef USE_NEON
-  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
   TFLITE_DCHECK(x1 >= x0);
   TFLITE_DCHECK(y1 >= y0);
 
@@ -4950,7 +5201,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
     const float* input_ptr = nullptr;
 
     float32x4x2_t x0y0;
-    input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)];
+    input_ptr = &input_data[Offset(input_shape, batch, y0, x0, ic)];
     x0y0.val[0] = vld1q_f32(input_ptr);
     x0y0.val[1] = vld1q_f32(input_ptr + 4);
 
@@ -4970,7 +5221,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
     x1y1.val[1] = vld1q_f32(input_ptr + 4);
 
     // Top left corner.
-    float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)];
+    float* output_ptr = &output_data[Offset(output_shape, batch, y, x, ic)];
     vst1q_f32(output_ptr, x0y0.val[0]);
     vst1q_f32(output_ptr + 4, x0y0.val[1]);
 
@@ -5009,14 +5260,15 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
   }
   // Handle 4 input channels at a time.
   for (; ic <= depth - 4; ic += 4) {
-    const float* input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)];
+    const float* input_ptr =
+        &input_data[Offset(input_shape, batch, y0, x0, ic)];
     float32x4_t x0y0 = vld1q_f32(input_ptr);
     float32x4_t x1y0 = vld1q_f32(input_ptr + input_x_offset);
     float32x4_t x0y1 = vld1q_f32(input_ptr + input_y_offset);
     float32x4_t x1y1 = vld1q_f32(input_ptr + input_x_offset + input_y_offset);
 
     // Top left corner.
-    float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)];
+    float* output_ptr = &output_data[Offset(output_shape, batch, y, x, ic)];
     vst1q_f32(output_ptr, x0y0);
 
     // Top right corner.
@@ -5040,7 +5292,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
   }
   // Handle one input channel at a time.
   for (; ic < depth; ic++) {
-    const int32 input_offset = Offset(input_dims, ic, x0, y0, batch);
+    const int32 input_offset = Offset(input_shape, batch, y0, x0, ic);
 
     float x0y0 = input_data[input_offset];
     float x1y0 = input_data[input_offset + input_x_offset];
@@ -5048,7 +5300,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
     float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
 
     // Top left corner.
-    const int32 output_offset = Offset(output_dims, ic, x, y, batch);
+    const int32 output_offset = Offset(output_shape, batch, y, x, ic);
     output_data[output_offset] = x0y0;
 
     // Top right corner.
@@ -5064,7 +5316,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
   }
 #else
   for (int ch = 0; ch < depth; ch++) {
-    const int32 input_offset = Offset(input_dims, ch, x0, y0, batch);
+    const int32 input_offset = Offset(input_shape, batch, y0, x0, ch);
 
     float x0y0 = input_data[input_offset];
     float x1y0 = input_data[input_offset + input_x_offset];
@@ -5072,7 +5324,7 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
     float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
 
     // Top left corner.
-    const int32 output_offset = Offset(output_dims, ch, x, y, batch);
+    const int32 output_offset = Offset(output_shape, batch, y, x, ch);
     output_data[output_offset] = x0y0;
 
     // Top right corner.
@@ -5089,31 +5341,30 @@ inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
 #endif
 }
 
-inline void ResizeBilinear2x2(const float* input_data,
-                              const Dims<4>& input_dims, float* output_data,
-                              const Dims<4>& output_dims, int32 batches,
-                              int32 input_height, int32 input_width,
-                              int32 depth, int32 output_height,
-                              int32 output_width) {
+inline void ResizeBilinear2x2(int32 batches, int32 input_height,
+                              int32 input_width, int32 depth,
+                              int32 output_height, int32 output_width,
+                              const RuntimeShape& input_shape,
+                              const float* input_data,
+                              const RuntimeShape& output_shape,
+                              float* output_data) {
   for (int b = 0; b < batches; b++) {
     for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) {
       for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) {
         int32 x1 = std::min(x0 + 1, input_width - 1);
         int32 y1 = std::min(y0 + 1, input_height - 1);
-        ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_data,
-                                input_dims, output_data, output_dims);
+        ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape,
+                                input_data, output_shape, output_data);
       }
     }
   }
 }
 
-inline void ResizeBilinearGeneric(const float* input_data,
-                                  const Dims<4>& input_dims, float* output_data,
-                                  const Dims<4>& output_dims, int32 batches,
-                                  int32 input_height, int32 input_width,
-                                  int32 depth, int32 output_height,
-                                  int32 output_width, float height_scale,
-                                  float width_scale) {
+inline void ResizeBilinearGeneric(
+    int32 batches, int32 input_height, int32 input_width, int32 depth,
+    int32 output_height, int32 output_width, float height_scale,
+    float width_scale, const RuntimeShape& input_shape, const float* input_data,
+    const RuntimeShape& output_shape, float* output_data) {
   memset(output_data, 0,
          batches * output_height * output_width * depth * sizeof(float));
 
@@ -5130,22 +5381,22 @@ inline void ResizeBilinearGeneric(const float* input_data,
         float* output_ptr = &output_data[output_offset];
 
         // Run kernel on the 4 corners of the bilinear resize algorithm.
-        int32 input_offset = Offset(input_dims, 0, x0, y0, b);
+        int32 input_offset = Offset(input_shape, b, y0, x0, 0);
         float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
         const float* input_ptr = &input_data[input_offset];
         ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
 
-        input_offset = Offset(input_dims, 0, x1, y0, b);
+        input_offset = Offset(input_shape, b, y0, x1, 0);
         scale = (1 - (input_y - y0)) * (input_x - x0);
         input_ptr = &input_data[input_offset];
         ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
 
-        input_offset = Offset(input_dims, 0, x0, y1, b);
+        input_offset = Offset(input_shape, b, y1, x0, 0);
         scale = (input_y - y0) * (1 - (input_x - x0));
         input_ptr = &input_data[input_offset];
         ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
 
-        input_offset = Offset(input_dims, 0, x1, y1, b);
+        input_offset = Offset(input_shape, b, y1, x1, 0);
         scale = (input_y - y0) * (input_x - x0);
         input_ptr = &input_data[input_offset];
         ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
@@ -5158,10 +5409,10 @@ inline void ResizeBilinearGeneric(const float* input_data,
 
 template <typename T>
 inline void ResizeBilinearGenericSmallChannel(
-    const T* input_data, const Dims<4>& input_dims, T* output_data,
-    const Dims<4>& output_dims, int32 batches, int32 input_height,
-    int32 input_width, int32 depth, int32 output_height, int32 output_width,
-    float height_scale, float width_scale) {
+    int32 batches, int32 input_height, int32 input_width, int32 depth,
+    int32 output_height, int32 output_width, float height_scale,
+    float width_scale, const RuntimeShape& input_shape, const T* input_data,
+    const RuntimeShape& output_shape, T* output_data) {
   memset(output_data, 0,
          batches * output_height * output_width * depth * sizeof(T));
 
@@ -5176,9 +5427,10 @@ inline void ResizeBilinearGenericSmallChannel(
         int32 x0 = static_cast<int32>(input_x);
         int32 x1 = std::min(x0 + 1, input_width - 1);
 
-        int32 input_offset[4] = {
-            Offset(input_dims, 0, x0, y0, b), Offset(input_dims, 0, x1, y0, b),
-            Offset(input_dims, 0, x0, y1, b), Offset(input_dims, 0, x1, y1, b)};
+        int32 input_offset[4] = {Offset(input_shape, b, y0, x0, 0),
+                                 Offset(input_shape, b, y0, x1, 0),
+                                 Offset(input_shape, b, y1, x0, 0),
+                                 Offset(input_shape, b, y1, x1, 0)};
         float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
                           (1 - (input_y - y0)) * (input_x - x0),
                           (input_y - y0) * (1 - (input_x - x0)),
@@ -5196,79 +5448,123 @@ inline void ResizeBilinearGenericSmallChannel(
   }
 }
 
-inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
+                           const RuntimeShape& unextended_input_shape,
+                           const float* input_data,
+                           const RuntimeShape& unextended_output_size_shape,
                            const int32* output_size_data,
-                           const Dims<4>& output_size_dims, float* output_data,
-                           const Dims<4>& output_dims, bool align_corners) {
+                           const RuntimeShape& unextended_output_shape,
+                           float* output_data) {
   gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
-  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  int32 input_height = ArraySize(input_dims, 2);
-  int32 input_width = ArraySize(input_dims, 1);
-  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
-  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
-  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  RuntimeShape output_size_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
+  int32 input_height = input_shape.Dims(1);
+  int32 input_width = input_shape.Dims(2);
+  int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
+  int32 output_height = output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
+  int32 output_width = output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
 
   // Specialize for 2x2 upsample.
-  if (!align_corners && output_height == 2 * input_height &&
+  if (!op_params.align_corners && output_height == 2 * input_height &&
       output_width == 2 * input_width) {
-    ResizeBilinear2x2(input_data, input_dims, output_data, output_dims, batches,
-                      input_height, input_width, depth, output_height,
-                      output_width);
+    ResizeBilinear2x2(batches, input_height, input_width, depth, output_height,
+                      output_width, input_shape, input_data, output_shape,
+                      output_data);
   } else {
     float height_scale = static_cast<float>(input_height) / output_height;
     float width_scale = static_cast<float>(input_width) / output_width;
-    if (align_corners && output_height > 1) {
+    if (op_params.align_corners && output_height > 1) {
       height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
     }
-    if (align_corners && output_width > 1) {
+    if (op_params.align_corners && output_width > 1) {
       width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
     }
 
-    ResizeBilinearGeneric(input_data, input_dims, output_data, output_dims,
-                          batches, input_height, input_width, depth,
+    ResizeBilinearGeneric(batches, input_height, input_width, depth,
                           output_height, output_width, height_scale,
-                          width_scale);
+                          width_scale, input_shape, input_data, output_shape,
+                          output_data);
   }
 }
 
+// Legacy Dims<4>
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, float* output_data,
+                           const Dims<4>& output_dims, bool align_corners) {
+  tflite::ResizeBilinearParams op_params;
+  op_params.align_corners = align_corners;
+  ResizeBilinear(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(output_size_dims), output_size_data,
+                 DimsToShape(output_dims), output_data);
+}
+
 // TODO(prabhumk): This is not a real quantized bilinear. It does not use int8
 // or int16 arithmetic.
-inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
+inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
+                           const RuntimeShape& input_shape,
+                           const uint8* input_data,
+                           const RuntimeShape& output_size_shape,
                            const int32* output_size_data,
-                           const Dims<4>& output_size_dims, uint8* output_data,
-                           const Dims<4>& output_dims, bool align_corners) {
+                           const RuntimeShape& output_shape,
+                           uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
-  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  int32 input_height = ArraySize(input_dims, 2);
-  int32 input_width = ArraySize(input_dims, 1);
-  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
-  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
-  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_size_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
+  int32 input_height = input_shape.Dims(1);
+  int32 input_width = input_shape.Dims(2);
+  int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
+  int32 output_height = output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
+  int32 output_width = output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
 
   float height_scale =
-      (align_corners && output_height > 1)
+      (op_params.align_corners && output_height > 1)
           ? (static_cast<float>(input_height - 1) / (output_height - 1))
           : (static_cast<float>(input_height) / output_height);
 
   float width_scale =
-      (align_corners && output_width > 1)
+      (op_params.align_corners && output_width > 1)
           ? (static_cast<float>(input_width - 1) / (output_width - 1))
           : (static_cast<float>(input_width) / output_width);
 
   ResizeBilinearGenericSmallChannel<uint8>(
-      input_data, input_dims, output_data, output_dims, batches, input_height,
-      input_width, depth, output_height, output_width, height_scale,
-      width_scale);
+      batches, input_height, input_width, depth, output_height, output_width,
+      height_scale, width_scale, input_shape, input_data, output_shape,
+      output_data);
+}
+
+// Legacy Dims<4>
+inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, uint8* output_data,
+                           const Dims<4>& output_dims, bool align_corners) {
+  tflite::ResizeBilinearParams op_params;
+  op_params.align_corners = align_corners;
+  ResizeBilinear(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(output_size_dims), output_size_data,
+                 DimsToShape(output_dims), output_data);
 }
 
 // legacy, for compatibility with old checked-in code
@@ -5311,20 +5607,29 @@ inline void GetIndexRange(int spatial_index_dim, int block_shape_dim,
 }
 
 template <typename T>
-inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
-                           const int32* block_shape_data,
-                           const Dims<4>& block_shape_dims,
-                           const int32* crops_data, const Dims<4>& crops_dims,
-                           T* output_data, const Dims<4>& output_dims) {
+inline void BatchToSpaceND(
+    const RuntimeShape& unextended_input1_shape, const T* input1_data,
+    const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
+    const RuntimeShape& unextended_input3_shape, const int32* crops_data,
+    const RuntimeShape& unextended_output_shape, T* output_data) {
   gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
 
-  const int output_batch_size = ArraySize(output_dims, 3);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-  const int input_batch_size = ArraySize(input_dims, 3);
-  const int input_height = ArraySize(input_dims, 2);
-  const int input_width = ArraySize(input_dims, 1);
-  const int depth = ArraySize(input_dims, 0);
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input1_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input1_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_batch_size = output_shape.Dims(0);
+
+  const int depth = input1_shape.Dims(3);
+  const int input_width = input1_shape.Dims(2);
+  const int input_height = input1_shape.Dims(1);
+  const int input_batch_size = input1_shape.Dims(0);
+
   const int block_shape_width = block_shape_data[1];
   const int block_shape_height = block_shape_data[0];
   const int crops_top = crops_data[0];
@@ -5359,14 +5664,28 @@ inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                           spatial_offset % block_shape_width - crops_left;
         TFLITE_DCHECK_GE(out_w, 0);
         TFLITE_DCHECK_LT(out_w, output_width);
-        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
-        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
+        T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
+        const T* in =
+            input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
         memcpy(out, in, depth * sizeof(T));
       }
     }
   }
 }
 
+// Legacy Dims<4>.
+template <typename T>
+inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
+                           const int32* block_shape_data,
+                           const Dims<4>& block_shape_dims,
+                           const int32* crops_data, const Dims<4>& crops_dims,
+                           T* output_data, const Dims<4>& output_dims) {
+  BatchToSpaceND(DimsToShape(input_dims), input_data,
+                 DimsToShape(block_shape_dims), block_shape_data,
+                 DimsToShape(crops_dims), crops_data, DimsToShape(output_dims),
+                 output_data);
+}
+
 template <typename T>
 void TypedMemset(void* ptr, T value, size_t num) {
   // Optimization for common cases where memset() will suffice.
@@ -5383,31 +5702,54 @@ void TypedMemset(void* ptr, T value, size_t num) {
   }
 }
 
-template <typename T>
-inline void PadV2(const T* input_data, const Dims<4>& input_dims,
-                  const std::vector<int>& left_paddings,
-                  const std::vector<int>& right_paddings, T* output_data,
-                  const Dims<4>& output_dims, const T pad_value) {
+// There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
+// scalar input that provides the padding value.  Therefore pad_value_ptr can be
+// equivalent to a simple input1_data.  For Pad, it should point to a zero
+// value.
+//
+// Note that two typenames are required, so that T=P=int32 is considered a
+// specialization distinct from P=int32.
+template <typename T, typename P>
+inline void PadImpl(const tflite::PadParams& op_params,
+                    const RuntimeShape& input_shape, const T* input_data,
+                    const P* pad_value_ptr, const RuntimeShape& output_shape,
+                    T* output_data) {
   gemmlowp::ScopedProfilingLabel label("Pad");
-  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
-  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+  RuntimeShape ext_input_shape = RuntimeShape::ExtendedShape(4, input_shape);
+  RuntimeShape ext_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+  TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
+  TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
+
+  // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
+  // we can pad them to 4 dims (yes, we are "padding the padding").
+  std::vector<int> left_padding_copy(4, 0);
+  const int left_padding_extend = 4 - op_params.left_padding_count;
+  for (int i = 0; i < op_params.left_padding_count; ++i) {
+    left_padding_copy[left_padding_extend + i] = op_params.left_padding[i];
+  }
+  std::vector<int> right_padding_copy(4, 0);
+  const int right_padding_extend = 4 - op_params.right_padding_count;
+  for (int i = 0; i < op_params.right_padding_count; ++i) {
+    right_padding_copy[right_padding_extend + i] = op_params.right_padding[i];
+  }
 
-  const int output_batch = ArraySize(output_dims, 3);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_depth = ArraySize(output_dims, 0);
+  const int output_batch = ext_output_shape.Dims(0);
+  const int output_height = ext_output_shape.Dims(1);
+  const int output_width = ext_output_shape.Dims(2);
+  const int output_depth = ext_output_shape.Dims(3);
 
-  const int left_b_padding = left_paddings[3];
-  const int left_h_padding = left_paddings[2];
-  const int left_w_padding = left_paddings[1];
-  const int left_d_padding = left_paddings[0];
+  const int left_b_padding = left_padding_copy[0];
+  const int left_h_padding = left_padding_copy[1];
+  const int left_w_padding = left_padding_copy[2];
+  const int left_d_padding = left_padding_copy[3];
 
-  const int right_b_padding = right_paddings[3];
-  const int right_h_padding = right_paddings[2];
-  const int right_w_padding = right_paddings[1];
-  const int right_d_padding = right_paddings[0];
+  const int right_b_padding = right_padding_copy[0];
+  const int right_h_padding = right_padding_copy[1];
+  const int right_w_padding = right_padding_copy[2];
+  const int right_d_padding = right_padding_copy[3];
 
-  const int input_depth = ArraySize(input_dims, 0);
+  const int input_depth = ext_input_shape.Dims(3);
+  const T pad_value = *pad_value_ptr;
 
   if (left_b_padding != 0) {
     TypedMemset<T>(
@@ -5417,61 +5759,112 @@ inline void PadV2(const T* input_data, const Dims<4>& input_dims,
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      TypedMemset<T>(output_data + Offset(output_dims, 0, 0, 0, out_b),
+      TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, 0, 0, 0),
                      pad_value, left_h_padding * output_width * output_depth);
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        TypedMemset<T>(output_data + Offset(output_dims, 0, 0, out_h, out_b),
-                       pad_value, left_w_padding * output_depth);
+        TypedMemset<T>(
+            output_data + Offset(ext_output_shape, out_b, out_h, 0, 0),
+            pad_value, left_w_padding * output_depth);
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
           TypedMemset<T>(
-              output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+              output_data + Offset(ext_output_shape, out_b, out_h, out_w, 0),
               pad_value, left_d_padding);
         }
 
         T* out = output_data +
-                 Offset(output_dims, left_d_padding, out_w, out_h, out_b);
-        const T* in =
-            input_data + Offset(input_dims, 0, out_w - left_w_padding,
-                                out_h - left_h_padding, out_b - left_b_padding);
+                 Offset(ext_output_shape, out_b, out_h, out_w, left_d_padding);
+        const T* in = input_data +
+                      Offset(ext_input_shape, out_b - left_b_padding,
+                             out_h - left_h_padding, out_w - left_w_padding, 0);
         memcpy(out, in, input_depth * sizeof(T));
 
         if (right_d_padding != 0) {
           TypedMemset<T>(
-              output_data + Offset(output_dims, output_depth - right_d_padding,
-                                   out_w, out_h, out_b),
+              output_data + Offset(ext_output_shape, out_b, out_h, out_w,
+                                   output_depth - right_d_padding),
               pad_value, right_d_padding);
         }
       }
       if (right_w_padding != 0) {
-        TypedMemset<T>(
-            output_data + Offset(output_dims, 0, output_width - right_w_padding,
-                                 out_h, out_b),
-            pad_value, right_w_padding * output_depth);
+        TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, out_h,
+                                            output_width - right_w_padding, 0),
+                       pad_value, right_w_padding * output_depth);
       }
     }
     if (right_h_padding != 0) {
       TypedMemset<T>(
-          output_data +
-              Offset(output_dims, 0, 0, output_height - right_h_padding, out_b),
+          output_data + Offset(ext_output_shape, out_b,
+                               output_height - right_h_padding, 0, 0),
           pad_value, right_h_padding * output_width * output_depth);
     }
   }
   if (right_b_padding != 0) {
     TypedMemset<T>(
         output_data +
-            Offset(output_dims, 0, 0, 0, output_batch - right_b_padding),
+            Offset(ext_output_shape, output_batch - right_b_padding, 0, 0, 0),
         pad_value,
         right_b_padding * output_height * output_width * output_depth);
   }
 }
 
-// Legacy Pad() method that casts an int32_t to T before padding.
+template <typename T, typename P>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const T* input_data,
+                const P* pad_value_ptr, const RuntimeShape& output_shape,
+                T* output_data) {
+  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+          output_data);
+}
+
+// The second (pad-value) input can be int32 when, say, the first is uint8.
+template <typename T>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const T* input_data,
+                const int32* pad_value_ptr, const RuntimeShape& output_shape,
+                T* output_data) {
+  const T converted_pad_value = static_cast<T>(*pad_value_ptr);
+  PadImpl(op_params, input_shape, input_data, &converted_pad_value,
+          output_shape, output_data);
+}
+
+// This version avoids conflicting template matching.
+template <>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const int32* input_data,
+                const int32* pad_value_ptr, const RuntimeShape& output_shape,
+                int32* output_data) {
+  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+          output_data);
+}
+
+// Legacy signature, function covered both Pad and PadV2.
+template <typename T>
+inline void PadV2(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& left_paddings,
+                  const std::vector<int>& right_paddings, T* output_data,
+                  const Dims<4>& output_dims, const T pad_value) {
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+  tflite::PadParams op_params;
+  op_params.left_padding_count = 4;
+  op_params.right_padding_count = 4;
+  for (int i = 0; i < 4; ++i) {
+    op_params.left_padding[i] = left_paddings[3 - i];
+    op_params.right_padding[i] = right_paddings[3 - i];
+  }
+  const T pad_value_copy = pad_value;
+
+  Pad(op_params, DimsToShape(input_dims), input_data, &pad_value_copy,
+      DimsToShape(output_dims), output_data);
+}
+
+// Old Pad that calls legacy PadV2.
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
@@ -5482,34 +5875,45 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
            output_dims, converted_pad_value);
 }
 
+// Old Pad that only padded with 0.
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
                 const Dims<4>& output_dims) {
-  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
-      output_dims, 0);
+  const T pad_value = static_cast<T>(0);
+  PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data,
+           output_dims, pad_value);
 }
 
 template <typename T>
-inline void Slice(const T* input_data, const Dims<4>& input_dims,
-                  const std::vector<int>& begin, const std::vector<int>& size,
-                  T* output_data, const Dims<4>& output_dims) {
-  // TODO(dkalenichenko): This op only supports 4D tensors.
-  TFLITE_DCHECK_EQ(begin.size(), 4);
-  TFLITE_DCHECK_EQ(size.size(), 4);
-  const int start_b = begin[3];
-  const int stop_b =
-      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
-  const int start_h = begin[2];
-  const int stop_h =
-      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
-  const int start_w = begin[1];
-  const int stop_w =
-      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
-  const int start_d = begin[0];
-  const int stop_d =
-      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+inline void Slice(const tflite::SliceParams& op_params,
+                  const RuntimeShape& input_shape, const T* input_data,
+                  const RuntimeShape& output_shape, T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Slice");
+  RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
+  // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
+  TFLITE_DCHECK_LE(op_params.begin_count, 4);
+  TFLITE_DCHECK_LE(op_params.size_count, 4);
+  const int begin_count = op_params.begin_count;
+  const int size_count = op_params.size_count;
+  // We front-pad the begin and size vectors.
+  const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
+  const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
+                         ? ext_shape.Dims(0) - start_b
+                         : start_b + op_params.size[0];
+  const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
+  const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
+                         ? ext_shape.Dims(1) - start_h
+                         : start_h + op_params.size[size_count - 3];
+  const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
+  const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
+                         ? ext_shape.Dims(2) - start_w
+                         : start_w + op_params.size[size_count - 2];
+  const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
+  const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
+                         ? ext_shape.Dims(3) - start_d
+                         : start_d + op_params.size[size_count - 1];
 
   T* out_ptr = output_data;
   for (int in_b = start_b; in_b < stop_b; ++in_b) {
@@ -5517,7 +5921,7 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
       for (int in_w = start_w; in_w < stop_w; ++in_w) {
         const int len = stop_d - start_d;
         memcpy(out_ptr,
-               input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
+               input_data + Offset(ext_shape, in_b, in_h, in_w, start_d),
                len * sizeof(T));
         out_ptr += len;
       }
@@ -5526,28 +5930,60 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, T* output_data,
-                       const Dims<4>& output_dims) {
+inline void Slice(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& begin, const std::vector<int>& size,
+                  T* output_data, const Dims<4>& output_dims) {
+  tflite::SliceParams op_params;
+  op_params.begin_count = 4;
+  op_params.size_count = 4;
+  for (int i = 0; i < 4; ++i) {
+    op_params.begin[i] = begin[3 - i];
+    op_params.size[i] = size[3 - i];
+  }
+
+  Slice(op_params, DimsToShape(input_dims), input_data,
+        DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
+             const T* input2_data, const RuntimeShape& output_shape,
+             T* output_data) {
   gemmlowp::ScopedProfilingLabel label("TensorFlowMinimum");
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
+  auto input1_map = MapAsVector(input1_data, input1_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
   auto min_value = input2_data[0];
   output_map.array() = input1_map.array().min(min_value);
 }
 
 template <typename T>
-void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, T* output_data,
-                       const Dims<4>& output_dims) {
+void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
+             const T* input2_data, const RuntimeShape& output_shape,
+             T* output_data) {
   gemmlowp::ScopedProfilingLabel label("TensorFlowMaximum");
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
+  auto input1_map = MapAsVector(input1_data, input1_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
   auto max_value = input2_data[0];
   output_map.array() = input1_map.array().max(max_value);
 }
 
 template <typename T>
+void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  Minimum(DimsToShape(input1_dims), input1_data, input2_data,
+          DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  Maximum(DimsToShape(input1_dims), input1_data, input2_data,
+          DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
 void TransposeIm2col(const T* input_data, const Dims<4>& input_dims,
                      const Dims<4>& filter_dims, int stride_width,
                      int stride_height, int pad_width, int pad_height,
@@ -5667,4 +6103,4 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
 #pragma GCC diagnostic pop
 #endif
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
index bcf5e4e4f6..71ae74f34c 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -26,11 +26,6 @@ namespace tflite {
 
 namespace reference_ops {
 
-inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
-  return RuntimeShape(
-      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
-}
-
 template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
@@ -47,20 +42,20 @@ inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
 
 inline void Relu(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  Relu(input_data, DimsToShape(input_dims), output_data,
-       DimsToShape(output_dims));
+  Relu(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+       output_data);
 }
 
 inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  Relu1(input_data, DimsToShape(input_dims), output_data,
-        DimsToShape(output_dims));
+  Relu1(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+        output_data);
 }
 
 inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  Relu6(input_data, DimsToShape(input_dims), output_data,
-        DimsToShape(output_dims));
+  Relu6(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+        output_data);
 }
 
 template <FusedActivationFunctionType Ac>
@@ -316,6 +311,37 @@ inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
               DimsToShape(output_dims), output_data);
 }
 
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul4DSlow(
+      input1_data, input1_dims, input1_offset, input2_data, input2_dims,
+      input2_offset, output_offset, output_multiplier,
+      //
+      kReverseShift * output_shift,
+      //
+      output_activation_min, output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 void AveragePool(const float* input_data, const Dims<4>& input_dims,
@@ -557,8 +583,8 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
 
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
-  Logistic(input_data, DimsToShape(input_dims), output_data,
-           DimsToShape(output_dims));
+  Logistic(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+           output_data);
 }
 
 inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
@@ -572,14 +598,14 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
 
 inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
                      int16* output_data, const Dims<4>& output_dims) {
-  Logistic(input_data, DimsToShape(input_dims), output_data,
-           DimsToShape(output_dims));
+  Logistic(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+           output_data);
 }
 
 inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  Tanh(input_data, DimsToShape(input_dims), output_data,
-       DimsToShape(output_dims));
+  Tanh(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+       output_data);
 }
 
 inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index f4176e474e..ff77f61191 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -19,11 +19,11 @@ limitations under the License.
 #include <sys/types.h>
 #include <algorithm>
 #include <cmath>
+#include <functional>
 #include <limits>
 #include <memory>
 #include <type_traits>
 
-#include "third_party/eigen3/Eigen/Core"
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
@@ -105,6 +105,11 @@ namespace reference_ops {
 // Used mainly to convert from old-style shifts (right) to new-style (left).
 static constexpr int kReverseShift = -1;
 
+inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
+  return RuntimeShape(
+      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
 template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
@@ -271,12 +276,12 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_offset, const uint8* filter_data,
                  const Dims<4>& filter_dims, int32 filter_offset,
                  const int32* bias_data, const Dims<4>& bias_dims,
-                 int stride_width, int stride_height, int pad_width,
-                 int pad_height, int32 output_offset, int32 output_multiplier,
-                 int output_shift, int32 output_activation_min,
-                 int32 output_activation_max, uint8* output_data,
-                 const Dims<4>& output_dims, uint8* im2col_data,
-                 const Dims<4>& im2col_dims,
+                 int stride_width, int stride_height, int dilation_width_factor,
+                 int dilation_height_factor, int pad_width, int pad_height,
+                 int32 output_offset, int32 output_multiplier, int output_shift,
+                 int32 output_activation_min, int32 output_activation_max,
+                 uint8* output_data, const Dims<4>& output_dims,
+                 uint8* im2col_data, const Dims<4>& im2col_dims,
                  gemmlowp::GemmContext* gemm_context) {
   (void)im2col_data;   // only used in optimized code.
   (void)im2col_dims;   // only used in optimized code.
@@ -302,8 +307,9 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
               for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
-                const int in_x = in_x_origin + filter_x;
-                const int in_y = in_y_origin + filter_y;
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
                 // If the location is outside the bounds of the input image,
                 // use zero as a default value.
                 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
@@ -335,6 +341,24 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height, 1, 1,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
 // legacy, for compatibility with old checked-in code
 template <FusedActivationFunctionType Ac>
 inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
@@ -383,18 +407,29 @@ void Conv(const uint8* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
-                         int block_size, T* output_data,
-                         const Dims<4>& output_dims) {
-  const int input_depth = ArraySize(input_dims, 0);
-  const int input_width = ArraySize(input_dims, 1);
-  const int input_height = ArraySize(input_dims, 2);
-  const int input_batch = ArraySize(input_dims, 3);
+inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
+                         const RuntimeShape& unextended_input_shape,
+                         const T* input_data,
+                         const RuntimeShape& unextended_output_shape,
+                         T* output_data) {
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  const int input_depth = input_shape.Dims(3);
+  const int input_width = input_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+  const int input_batch = input_shape.Dims(0);
 
-  const int output_depth = ArraySize(output_dims, 0);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_batch = ArraySize(output_dims, 3);
+  const int output_depth = output_shape.Dims(3);
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_batch = output_shape.Dims(0);
+
+  const int32 block_size = op_params.block_size;
 
   TFLITE_DCHECK_EQ(input_width * block_size, output_width);
   TFLITE_DCHECK_EQ(input_height * block_size, output_height);
@@ -413,9 +448,9 @@ inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
           const int in_h = out_h / block_size;
           const int in_b = out_b;
 
+          const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
           const int output_index =
-              Offset(output_dims, out_d, out_w, out_h, out_b);
-          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);
+              Offset(output_shape, out_b, out_h, out_w, out_d);
 
           output_data[output_index] = input_data[input_index];
         }
@@ -424,19 +459,42 @@ inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Legacy Dims<4>.
 template <typename T>
-inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
                          int block_size, T* output_data,
                          const Dims<4>& output_dims) {
-  const int input_depth = ArraySize(input_dims, 0);
-  const int input_width = ArraySize(input_dims, 1);
-  const int input_height = ArraySize(input_dims, 2);
-  const int input_batch = ArraySize(input_dims, 3);
+  tflite::DepthToSpaceParams op_params;
+  op_params.block_size = block_size;
 
-  const int output_depth = ArraySize(output_dims, 0);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_batch = ArraySize(output_dims, 3);
+  DepthToSpace(op_params, DimsToShape(input_dims), input_data,
+               DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
+                         const RuntimeShape& unextended_input_shape,
+                         const T* input_data,
+                         const RuntimeShape& unextended_output_shape,
+                         T* output_data) {
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  const int input_depth = input_shape.Dims(3);
+  const int input_width = input_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+  const int input_batch = input_shape.Dims(0);
+
+  const int output_depth = output_shape.Dims(3);
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_batch = output_shape.Dims(0);
+
+  const int32 block_size = op_params.block_size;
 
   TFLITE_DCHECK_EQ(input_width, output_width * block_size);
   TFLITE_DCHECK_EQ(input_height, output_height * block_size);
@@ -454,9 +512,9 @@ inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
           const int out_h = in_h / block_size;
           const int out_b = in_b;
 
+          const int input_index = Offset(input_shape, in_b, in_h, in_w, in_d);
           const int output_index =
-              Offset(output_dims, out_d, out_w, out_h, out_b);
-          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);
+              Offset(output_shape, out_b, out_h, out_w, out_d);
 
           output_data[output_index] = input_data[input_index];
         }
@@ -465,6 +523,18 @@ inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Legacy Dims<4>.
+template <typename T>
+inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+                         int block_size, T* output_data,
+                         const Dims<4>& output_dims) {
+  tflite::SpaceToDepthParams op_params;
+  op_params.block_size = block_size;
+
+  SpaceToDepth(op_params, DimsToShape(input_dims), input_data,
+               DimsToShape(output_dims), output_data);
+}
+
 inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                            const float* weights_data,
                            const Dims<4>& weights_dims, const float* bias_data,
@@ -822,8 +892,8 @@ void GlobalBatchNormalization(const float* input_data,
   }
 }
 
-inline void Relu(const float* input_data, const RuntimeShape& input_shape,
-                 float* output_data, const RuntimeShape& output_shape) {
+inline void Relu(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
     const float val = input_data[i];
@@ -833,8 +903,8 @@ inline void Relu(const float* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void Relu1(const float* input_data, const RuntimeShape& input_shape,
-                  float* output_data, const RuntimeShape& output_shape) {
+inline void Relu1(const RuntimeShape& input_shape, const float* input_data,
+                  const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
@@ -846,8 +916,8 @@ inline void Relu1(const float* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void Relu6(const float* input_data, const RuntimeShape& input_shape,
-                  float* output_data, const RuntimeShape& output_shape) {
+inline void Relu6(const RuntimeShape& input_shape, const float* input_data,
+                  const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
@@ -859,11 +929,14 @@ inline void Relu6(const float* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void ReluX(uint8 min_value, uint8 max_value, const uint8* input_data,
-                  const RuntimeShape& input_shape, uint8* output_data,
-                  const RuntimeShape& output_shape) {
+inline void ReluX(const tflite::ActivationParams& params,
+                  const RuntimeShape& input_shape, const uint8* input_data,
+
+                  const RuntimeShape& output_shape, uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  const uint8 max_value = params.quantized_activation_max;
+  const uint8 min_value = params.quantized_activation_min;
   for (int i = 0; i < flat_size; ++i) {
     const uint8 val = input_data[i];
     const uint8 clamped =
@@ -872,6 +945,16 @@ inline void ReluX(uint8 min_value, uint8 max_value, const uint8* input_data,
   }
 }
 
+// Legacy.
+inline void ReluX(uint8 min_value, uint8 max_value, const uint8* input_data,
+                  const RuntimeShape& input_shape, uint8* output_data,
+                  const RuntimeShape& output_shape) {
+  tflite::ActivationParams params;
+  params.quantized_activation_max = max_value;
+  params.quantized_activation_min = min_value;
+  ReluX(params, input_shape, input_data, output_shape, output_data);
+}
+
 template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const RuntimeShape& input_shape,
                      float* output_data, const RuntimeShape& output_shape) {
@@ -1374,13 +1457,144 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                output_dims);
 }
 
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
+// Element-wise mul that can often be used for inner loop of broadcast Mul as
+// well as the non-broadcast Mul.
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const uint8* input1_data, const uint8* input2_data,
+                           uint8* output_data) {
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+                                                       params.output_multiplier,
+                                                       params.output_shift);
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<uint8>(clamped_output);
+  }
+}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8* input1_data,
+                const RuntimeShape& input2_shape, const uint8* input2_data,
+                const RuntimeShape& output_shape, uint8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const uint8* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+  // sections of the arrays.
+  uint8* output_data_ptr = output_data;
+  const uint8* input1_data_ptr = input1_data;
+  const uint8* input2_data_reset = input2_data;
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  for (int i0 = 0; i0 < y0; ++i0) {
+    const uint8* input2_data_ptr;
+    for (int i1 = 0; i1 < y1; ++i1) {
+      input2_data_ptr = input2_data_reset;
+      for (int i2 = 0; i2 < y2; ++i2) {
+        for (int i3 = 0; i3 < y3; ++i3) {
+          MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                         output_data_ptr);
+          input2_data_ptr += y4;
+          output_data_ptr += y4;
+        }
+        input1_data_ptr += y4;
+      }
+    }
+    input2_data_reset = input2_data_ptr;
+  }
+}
+
+inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const uint8* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const uint8* input2_data,
+                               const RuntimeShape& output_shape,
+                               uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  // The input shapes are extended as part of NdArrayDesc initialization.
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32 input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32 input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32 unclamped_result =
+              params.output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  input1_val * input2_val, params.output_multiplier,
+                  params.output_shift);
+          const int32 clamped_output = std::min(
+              params.quantized_activation_max,
+              std::max(params.quantized_activation_min, unclamped_result));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+// Transitional version that will be moved shortly to legacy_reference_ops, as
+// part of RuntimeShape revisions.
+inline void BroadcastMul4DSlow(const uint8* input1_data,
+                               const Dims<4>& input1_dims, int32 input1_offset,
+                               const uint8* input2_data,
+                               const Dims<4>& input2_dims, int32 input2_offset,
+                               int32 output_offset, int32 output_multiplier,
+                               int output_shift, int32 output_activation_min,
+                               int32 output_activation_max, uint8* output_data,
+                               const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
 
   NdArrayDesc<4> desc1;
@@ -1407,9 +1621,9 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
           const int32 input2_val =
               input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
           const int32 unclamped_result =
-              output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                                  input1_val * input2_val, output_multiplier,
-                                  kReverseShift * output_shift);
+              output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  input1_val * input2_val, output_multiplier, output_shift);
           const int32 clamped_output =
               std::min(output_activation_max,
                        std::max(output_activation_min, unclamped_result));
@@ -1464,21 +1678,6 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
-               input2_dims, input2_offset, output_offset, output_multiplier,
-               output_shift, output_activation_min, output_activation_max,
-               output_data, output_dims);
-}
-
 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
@@ -1881,6 +2080,25 @@ void Pack(int dim, const Scalar* const* input_data,
   }
 }
 
+template <typename Scalar>
+void Unpack(int axis, const Scalar* input_data, const Dims<4>& input_dims,
+            int dimensions, int outputs_count, Scalar* const* output_datas,
+            const Dims<4>& output_dims) {
+  int outer_size = 1;
+  for (int i = dimensions - axis; i < 4; i++) {
+    outer_size *= input_dims.sizes[i];
+  }
+
+  const int copy_size = FlatSize(input_dims) / outer_size / outputs_count;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < outputs_count; ++i) {
+      Scalar* output_ptr = output_datas[i] + copy_size * k;
+      int loc = k * outputs_count * copy_size + i * copy_size;
+      memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
 // TODO(prabhumk): This is the same as the optimized implementation.
 // TODO(prabhumk): The quantized implementation of concatentation isn't fully
 // quantized as it takes scale as a floating point value. This should be fixed
@@ -1936,6 +2154,44 @@ inline void Concatenation(int concat_dim, const uint8* const* input_data,
   }
 }
 
+template <typename Scalar>
+void Pack(int dim, const Scalar* const* input_data,
+          const Dims<4>* const* input_dims, const int32* input_zeropoint,
+          const float* input_scale, int inputs_count, Scalar* output_data,
+          const Dims<4>& output_dims, const int32 output_zeropoint,
+          const float output_scale) {
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  int outer_size = 1;
+  for (int i = dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  Scalar* output_ptr = output_data;
+  const int copy_size = FlatSize(**input_dims) / outer_size;
+  const float inverse_output_scale = 1.f / output_scale;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_data[i] + k * copy_size,
+               copy_size * sizeof(Scalar));
+      } else {
+        assert(false);
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        auto input_ptr = input_data[i];
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void DepthConcatenation(const Scalar* const* input_data,
                         const Dims<4>* const* input_dims, int inputs_count,
@@ -2308,36 +2564,6 @@ void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
                   output_data, output_dims);
 }
 
-// TODO(benoitjacob) make this a proper reference impl without Eigen!
-template <typename Scalar>
-using MatrixMap = typename std::conditional<
-    std::is_const<Scalar>::value,
-    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
-                                   Eigen::Dynamic, Eigen::Dynamic>>,
-    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
-
-template <typename Scalar, int N>
-MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
-                                                const Dims<N>& dims) {
-  const int rows = dims.sizes[0];
-  int cols = 1;
-  for (int d = 1; d < N; d++) {
-    cols *= dims.sizes[d];
-  }
-  return MatrixMap<Scalar>(data, rows, cols);
-}
-
-template <typename Scalar, int N>
-MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
-                                               const Dims<N>& dims) {
-  const int cols = dims.sizes[N - 1];
-  int rows = 1;
-  for (int d = 0; d < N - 1; d++) {
-    rows *= dims.sizes[d];
-  }
-  return MatrixMap<Scalar>(data, rows, cols);
-}
-
 inline int NodeOffset(int b, int h, int w, int height, int width) {
   return (b * height + h) * width + w;
 }
@@ -2978,8 +3204,8 @@ inline void LogSoftmax(const uint8* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void Logistic(const float* input_data, const RuntimeShape& input_shape,
-                     float* output_data, const RuntimeShape& output_shape) {
+inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
+                     const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
 
   for (int i = 0; i < flat_size; i++) {
@@ -3027,8 +3253,8 @@ inline void Logistic(const uint8* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void Logistic(const int16* input_data, const RuntimeShape& input_shape,
-                     int16* output_data, const RuntimeShape& output_shape) {
+inline void Logistic(const RuntimeShape& input_shape, const int16* input_data,
+                     const RuntimeShape& output_shape, int16* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
 
   for (int i = 0; i < flat_size; i++) {
@@ -3045,8 +3271,8 @@ inline void Logistic(const int16* input_data, const RuntimeShape& input_shape,
   }
 }
 
-inline void Tanh(const float* input_data, const RuntimeShape& input_shape,
-                 float* output_data, const RuntimeShape& output_shape) {
+inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
 
   for (int i = 0; i < flat_size; i++) {
@@ -3172,9 +3398,9 @@ inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
   }
 }
 
-inline void Floor(const float* input_data, const Dims<4>& input_dims,
-                  float* output_data, const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+inline void Floor(const RuntimeShape& input_shape, const float* input_data,
+                  const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
 
   for (int i = 0; i < flat_size; i++) {
     int offset = i;
@@ -3182,6 +3408,13 @@ inline void Floor(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Legacy Dims<4> version.
+inline void Floor(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  Floor(DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+        output_data);
+}
+
 template <typename T>
 inline void Gather(const T* input_data, const Dims<4>& input_dims,
                    int input_rank, const int32* coords_data,
@@ -3201,27 +3434,41 @@ inline void Gather(const T* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-inline void ResizeBilinear(const T* input_data, const Dims<4>& input_dims,
+inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
+                           const RuntimeShape& unextended_input_shape,
+                           const T* input_data,
+                           const RuntimeShape& unextended_output_size_shape,
                            const int32* output_size_data,
-                           const Dims<4>& output_size_dims, T* output_data,
-                           const Dims<4>& output_dims, bool align_corners) {
-  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  int32 input_height = ArraySize(input_dims, 2);
-  int32 input_width = ArraySize(input_dims, 1);
-  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
-  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
-  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
-  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
+                           const RuntimeShape& unextended_output_shape,
+                           T* output_data) {
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_size_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  RuntimeShape output_size_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_size_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
+  int32 input_height = input_shape.Dims(1);
+  int32 input_width = input_shape.Dims(2);
+  int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(0), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(1), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(2), 1);
+  TFLITE_DCHECK_EQ(output_size_shape.Dims(3), 2);
+  int32 output_height = output_size_data[Offset(output_size_shape, 0, 0, 0, 0)];
+  int32 output_width = output_size_data[Offset(output_size_shape, 0, 0, 0, 1)];
+
   float height_scale = static_cast<float>(input_height) / output_height;
   float width_scale = static_cast<float>(input_width) / output_width;
-  if (align_corners && output_height > 1) {
+  if (op_params.align_corners && output_height > 1) {
     height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
   }
-  if (align_corners && output_width > 1) {
+  if (op_params.align_corners && output_width > 1) {
     width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
   }
 
@@ -3236,21 +3483,34 @@ inline void ResizeBilinear(const T* input_data, const Dims<4>& input_dims,
         int32 x1 = std::min(x0 + 1, input_width - 1);
         for (int c = 0; c < depth; ++c) {
           T interpolation =
-              static_cast<T>(input_data[Offset(input_dims, c, x0, y0, b)] *
+              static_cast<T>(input_data[Offset(input_shape, b, y0, x0, c)] *
                                  (1 - (input_y - y0)) * (1 - (input_x - x0)) +
-                             input_data[Offset(input_dims, c, x0, y1, b)] *
+                             input_data[Offset(input_shape, b, y1, x0, c)] *
                                  (input_y - y0) * (1 - (input_x - x0)) +
-                             input_data[Offset(input_dims, c, x1, y0, b)] *
+                             input_data[Offset(input_shape, b, y0, x1, c)] *
                                  (1 - (input_y - y0)) * (input_x - x0) +
-                             input_data[Offset(input_dims, c, x1, y1, b)] *
+                             input_data[Offset(input_shape, b, y1, x1, c)] *
                                  (input_y - y0) * (input_x - x0));
-          output_data[Offset(output_dims, c, x, y, b)] = interpolation;
+          output_data[Offset(output_shape, b, y, x, c)] = interpolation;
         }
       }
     }
   }
 }
 
+// Legacy Dims<4>.
+template <typename T>
+inline void ResizeBilinear(const T* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, T* output_data,
+                           const Dims<4>& output_dims, bool align_corners) {
+  tflite::ResizeBilinearParams op_params;
+  op_params.align_corners = align_corners;
+  ResizeBilinear(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(output_size_dims), output_size_data,
+                 DimsToShape(output_dims), output_data);
+}
+
 // legacy, for compatibility with old checked-in code
 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
@@ -3261,6 +3521,7 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                         /*align_corners=*/false);
 }
 
+// Legacy.
 inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, uint8* output_data,
@@ -3271,45 +3532,56 @@ inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
-                           const int32* block_shape_data,
-                           const Dims<4>& block_shape_dims,
-                           const int32* paddings_data,
-                           const Dims<4>& paddings_dims, T* output_data,
-                           const Dims<4>& output_dims,
-                           const int32_t pad_value) {
-  const int output_batch_size = ArraySize(output_dims, 3);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-  const int input_batch_size = ArraySize(input_dims, 3);
-  const int input_height = ArraySize(input_dims, 2);
-  const int input_width = ArraySize(input_dims, 1);
-  const int depth = ArraySize(input_dims, 0);
+inline void SpaceToBatchND(
+    const SpaceToBatchParams& params,
+    const RuntimeShape& unextended_input1_shape, const T* input1_data,
+    const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
+    const RuntimeShape& unextended_input3_shape, const int32* paddings_data,
+    const RuntimeShape& unextended_output_shape, T* output_data) {
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input1_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input1_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  const int depth = input1_shape.Dims(3);
+  const int input_width = input1_shape.Dims(2);
+  const int input_height = input1_shape.Dims(1);
+  const int input_batch_size = input1_shape.Dims(0);
+
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_batch_size = output_shape.Dims(0);
+
   const int block_shape_height = block_shape_data[0];
   const int block_shape_width = block_shape_data[1];
   const int padding_top = paddings_data[0];
   const int padding_left = paddings_data[2];
 
+  // For uint8 quantized, the correct padding "zero value" is the output offset.
+  const int32_t pad_value = params.output_offset;
+
   for (int out_b = 0; out_b < output_batch_size; ++out_b) {
     int input_batch = out_b % input_batch_size;
     int shift_w = (out_b / input_batch_size) % block_shape_width;
     int shift_h = (out_b / input_batch_size) / block_shape_width;
     for (int out_h = 0; out_h < output_height; ++out_h) {
       for (int out_w = 0; out_w < output_width; ++out_w) {
-        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
+        T* out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
         if (out_h * block_shape_height + shift_h < padding_top ||
             out_h * block_shape_height + shift_h >=
                 padding_top + input_height ||
             out_w * block_shape_width + shift_w < padding_left ||
             out_w * block_shape_width + shift_w >= padding_left + input_width) {
+          // This may not execute correctly when pad_value != 0 and T != uint8.
           memset(out, pad_value, depth * sizeof(T));
         } else {
           const T* in =
-              input_data +
-              Offset(input_dims, 0,
-                     (out_w * block_shape_width + shift_w) - padding_left,
+              input1_data +
+              Offset(input1_shape, input_batch,
                      (out_h * block_shape_height + shift_h) - padding_top,
-                     input_batch);
+                     (out_w * block_shape_width + shift_w) - padding_left, 0);
           memcpy(out, in, depth * sizeof(T));
         }
       }
@@ -3317,30 +3589,63 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Legacy Dims<4>.
 template <typename T>
 inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
                            const Dims<4>& block_shape_dims,
                            const int32* paddings_data,
                            const Dims<4>& paddings_dims, T* output_data,
-                           const Dims<4>& output_dims) {
-  SpaceToBatchND(input_data, input_dims, block_shape_data, block_shape_dims,
-                 paddings_data, paddings_dims, output_data, output_dims, 0);
+                           const Dims<4>& output_dims,
+                           const int32_t pad_value) {
+  tflite::SpaceToBatchParams op_params;
+  op_params.output_offset = pad_value;
+
+  SpaceToBatchND(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(block_shape_dims), block_shape_data,
+                 DimsToShape(paddings_dims), paddings_data,
+                 DimsToShape(output_dims), output_data);
 }
 
+// Legacy if no good reason to have signature with pad_value=0.
 template <typename T>
-inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
+inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
                            const Dims<4>& block_shape_dims,
-                           const int32* crops_data, const Dims<4>& crops_dims,
-                           T* output_data, const Dims<4>& output_dims) {
-  const int output_batch_size = ArraySize(output_dims, 3);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-  const int input_batch_size = ArraySize(input_dims, 3);
-  const int input_height = ArraySize(input_dims, 2);
-  const int input_width = ArraySize(input_dims, 1);
-  const int depth = ArraySize(input_dims, 0);
+                           const int32* paddings_data,
+                           const Dims<4>& paddings_dims, T* output_data,
+                           const Dims<4>& output_dims) {
+  tflite::SpaceToBatchParams op_params;
+  op_params.output_offset = 0;
+
+  SpaceToBatchND(op_params, DimsToShape(input_dims), input_data,
+                 DimsToShape(block_shape_dims), block_shape_data,
+                 DimsToShape(paddings_dims), paddings_data,
+                 DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+inline void BatchToSpaceND(
+    const RuntimeShape& unextended_input1_shape, const T* input1_data,
+    const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
+    const RuntimeShape& unextended_input3_shape, const int32* crops_data,
+    const RuntimeShape& unextended_output_shape, T* output_data) {
+  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  RuntimeShape input1_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input1_shape);
+  RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  const int output_width = output_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_batch_size = output_shape.Dims(0);
+
+  const int depth = input1_shape.Dims(3);
+  const int input_width = input1_shape.Dims(2);
+  const int input_height = input1_shape.Dims(1);
+  const int input_batch_size = input1_shape.Dims(0);
+
   const int block_shape_width = block_shape_data[1];
   const int block_shape_height = block_shape_data[0];
   const int crops_top = crops_data[0];
@@ -3362,36 +3667,72 @@ inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
         if (out_w < 0 || out_w >= output_width) {
           continue;
         }
-        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
-        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
+        T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
+        const T* in =
+            input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
         memcpy(out, in, depth * sizeof(T));
       }
     }
   }
 }
 
+// Legacy Dims<4>.
 template <typename T>
-inline void PadV2(const T* input_data, const Dims<4>& input_dims,
-                  const std::vector<int>& left_paddings,
-                  const std::vector<int>& right_paddings, T* output_data,
-                  const Dims<4>& output_dims, const T pad_value) {
-  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
-  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
-
-  const int output_batch = ArraySize(output_dims, 3);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-  const int output_depth = ArraySize(output_dims, 0);
-
-  const int left_b_padding = left_paddings[3];
-  const int left_h_padding = left_paddings[2];
-  const int left_w_padding = left_paddings[1];
-  const int left_d_padding = left_paddings[0];
+inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
+                           const int32* block_shape_data,
+                           const Dims<4>& block_shape_dims,
+                           const int32* crops_data, const Dims<4>& crops_dims,
+                           T* output_data, const Dims<4>& output_dims) {
+  BatchToSpaceND(DimsToShape(input_dims), input_data,
+                 DimsToShape(block_shape_dims), block_shape_data,
+                 DimsToShape(crops_dims), crops_data, DimsToShape(output_dims),
+                 output_data);
+}
 
-  const int right_b_padding = right_paddings[3];
-  const int right_h_padding = right_paddings[2];
-  const int right_w_padding = right_paddings[1];
-  const int right_d_padding = right_paddings[0];
+// There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
+// scalar input that provides the padding value.  Therefore pad_value_ptr can be
+// equivalent to a simple input1_data.  For Pad, it should point to a zero
+// value.
+//
+// Note that two typenames are required, so that T=P=int32 is considered a
+// specialization distinct from P=int32.
+template <typename T, typename P>
+inline void PadImpl(const tflite::PadParams& op_params,
+                    const RuntimeShape& input_shape, const T* input_data,
+                    const P* pad_value_ptr, const RuntimeShape& output_shape,
+                    T* output_data) {
+  RuntimeShape ext_input_shape = RuntimeShape::ExtendedShape(4, input_shape);
+  RuntimeShape ext_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+  TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
+  TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
+
+  // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
+  // we can pad them to 4 dims (yes, we are "padding the padding").
+  std::vector<int> left_padding_copy(4, 0);
+  for (int i = 0; i < op_params.left_padding_count; ++i) {
+    left_padding_copy[i] = op_params.left_padding[i];
+  }
+  std::vector<int> right_padding_copy(4, 0);
+  for (int i = 0; i < op_params.right_padding_count; ++i) {
+    right_padding_copy[i] = op_params.right_padding[i];
+  }
+
+  const int output_batch = ext_output_shape.Dims(0);
+  const int output_height = ext_output_shape.Dims(1);
+  const int output_width = ext_output_shape.Dims(2);
+  const int output_depth = ext_output_shape.Dims(3);
+
+  const int left_b_padding = left_padding_copy[0];
+  const int left_h_padding = left_padding_copy[1];
+  const int left_w_padding = left_padding_copy[2];
+  const int left_d_padding = left_padding_copy[3];
+
+  const int right_b_padding = right_padding_copy[0];
+  const int right_h_padding = right_padding_copy[1];
+  const int right_w_padding = right_padding_copy[2];
+  const int right_d_padding = right_padding_copy[3];
+
+  const T pad_value = *pad_value_ptr;
 
   const T* in_ptr = input_data;
   T* out_ptr = output_data;
@@ -3417,7 +3758,59 @@ inline void PadV2(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-// Legacy Pad() method that casts an int32_t to T before padding.
+template <typename T, typename P>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const T* input_data,
+                const P* pad_value_ptr, const RuntimeShape& output_shape,
+                T* output_data) {
+  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+          output_data);
+}
+
+// The second (pad-value) input can be int32 when, say, the first is uint8.
+template <typename T>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const T* input_data,
+                const int32* pad_value_ptr, const RuntimeShape& output_shape,
+                T* output_data) {
+  const T converted_pad_value = static_cast<T>(*pad_value_ptr);
+  PadImpl(op_params, input_shape, input_data, &converted_pad_value,
+          output_shape, output_data);
+}
+
+// This version avoids conflicting template matching.
+template <>
+inline void Pad(const tflite::PadParams& op_params,
+                const RuntimeShape& input_shape, const int32* input_data,
+                const int32* pad_value_ptr, const RuntimeShape& output_shape,
+                int32* output_data) {
+  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+          output_data);
+}
+
+// Legacy signature, function covered both Pad and PadV2.
+template <typename T>
+inline void PadV2(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& left_paddings,
+                  const std::vector<int>& right_paddings, T* output_data,
+                  const Dims<4>& output_dims, const T pad_value) {
+  TFLITE_DCHECK_EQ(left_paddings.size(), 4);
+  TFLITE_DCHECK_EQ(right_paddings.size(), 4);
+  tflite::PadParams op_params;
+  op_params.left_padding_count = 4;
+  op_params.right_padding_count = 4;
+  for (int i = 0; i < 4; ++i) {
+    op_params.left_padding[i] = left_paddings[3 - i];
+    op_params.right_padding[i] = right_paddings[3 - i];
+  }
+  // SetFloatOrInt(pad_value, &op_params.pad_value);
+  const T pad_value_copy = pad_value;
+
+  Pad(op_params, DimsToShape(input_dims), input_data, &pad_value_copy,
+      DimsToShape(output_dims), output_data);
+}
+
+// Old Pad that calls legacy PadV2.
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
@@ -3428,13 +3821,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
            output_dims, converted_pad_value);
 }
 
+// Old Pad that only padded with 0.
 template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
                 const Dims<4>& output_dims) {
-  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
-      output_dims, 0);
+  const T pad_value = static_cast<T>(0);
+  PadV2<T>(input_data, input_dims, left_paddings, right_paddings, output_data,
+           output_dims, pad_value);
 }
 
 template <typename T>
@@ -3491,31 +3886,39 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
-inline void Slice(const T* input_data, const Dims<4>& input_dims,
-                  const std::vector<int>& begin, const std::vector<int>& size,
-                  T* output_data, const Dims<4>& output_dims) {
-  // TODO(dkalenichenko): This op only supports 4D tensors.
-  TFLITE_DCHECK_EQ(begin.size(), 4);
-  TFLITE_DCHECK_EQ(size.size(), 4);
-  const int start_b = begin[3];
-  const int stop_b =
-      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
-  const int start_h = begin[2];
-  const int stop_h =
-      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
-  const int start_w = begin[1];
-  const int stop_w =
-      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
-  const int start_d = begin[0];
-  const int stop_d =
-      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+inline void Slice(const tflite::SliceParams& op_params,
+                  const RuntimeShape& input_shape, const T* input_data,
+                  const RuntimeShape& output_shape, T* output_data) {
+  RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
+  // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
+  TFLITE_DCHECK_LE(op_params.begin_count, 4);
+  TFLITE_DCHECK_LE(op_params.size_count, 4);
+  const int begin_count = op_params.begin_count;
+  const int size_count = op_params.size_count;
+  // We front-pad the begin and size vectors.
+  const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
+  const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
+                         ? ext_shape.Dims(0) - start_b
+                         : start_b + op_params.size[0];
+  const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
+  const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
+                         ? ext_shape.Dims(1) - start_h
+                         : start_h + op_params.size[size_count - 3];
+  const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
+  const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
+                         ? ext_shape.Dims(2) - start_w
+                         : start_w + op_params.size[size_count - 2];
+  const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
+  const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
+                         ? ext_shape.Dims(3) - start_d
+                         : start_d + op_params.size[size_count - 1];
 
   T* out_ptr = output_data;
   for (int in_b = start_b; in_b < stop_b; ++in_b) {
     for (int in_h = start_h; in_h < stop_h; ++in_h) {
       for (int in_w = start_w; in_w < stop_w; ++in_w) {
         for (int in_d = start_d; in_d < stop_d; ++in_d) {
-          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
+          *out_ptr++ = input_data[Offset(ext_shape, in_b, in_h, in_w, in_d)];
         }
       }
     }
@@ -3523,6 +3926,22 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
 }
 
 template <typename T>
+inline void Slice(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& begin, const std::vector<int>& size,
+                  T* output_data, const Dims<4>& output_dims) {
+  tflite::SliceParams op_params;
+  op_params.begin_count = 4;
+  op_params.size_count = 4;
+  for (int i = 0; i < 4; ++i) {
+    op_params.begin[i] = begin[3 - i];
+    op_params.size[i] = size[3 - i];
+  }
+
+  Slice(op_params, DimsToShape(input_dims), input_data,
+        DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
 inline void Exp(const T* input_data, const size_t num_elements,
                 T* output_data) {
   for (size_t idx = 0; idx < num_elements; ++idx) {
@@ -3618,15 +4037,18 @@ inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
   return true;
 }
 
-// Computes the sum of elements across dimensions given in axis.
+// Computes the generic value (i.e., sum/max/min/prod) of elements across
+// dimensions given in axis. It needs to pass in init_value and reducer.
 template <typename T>
-inline bool Sum(const T* input_data, const int* input_dims,
-                const int input_num_dims, T* output_data,
-                const int* output_dims, const int output_num_dims,
-                const int* axis, const int num_axis_dimensions, bool keep_dims,
-                int* temp_index, int* resolved_axis) {
+inline bool ReduceGeneric(const T* input_data, const int* input_dims,
+                          const int input_num_dims, T* output_data,
+                          const int* output_dims, const int output_num_dims,
+                          const int* axis, const int64_t num_axis_dimensions,
+                          bool keep_dims, int* temp_index, int* resolved_axis,
+                          T init_value,
+                          T reducer(const T current, const T in)) {
   // Reset output data.
-  if (!InitTensorDataForReduce(output_dims, output_num_dims, static_cast<T>(0),
+  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
                                output_data)) {
     return false;
   }
@@ -3638,9 +4060,25 @@ inline bool Sum(const T* input_data, const int* input_dims,
     return false;
   }
 
-  return ReduceSumImpl<T, T>(input_data, input_dims, output_dims,
-                             input_num_dims, output_num_dims, resolved_axis,
-                             num_resolved_axis, temp_index, output_data);
+  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
+                      output_num_dims, resolved_axis, num_resolved_axis,
+                      temp_index, reducer, output_data);
+}
+
+// Computes the sum of elements across dimensions given in axis.
+template <typename T>
+inline bool Sum(const T* input_data, const int* input_dims,
+                const int input_num_dims, T* output_data,
+                const int* output_dims, const int output_num_dims,
+                const int* axis, const int num_axis_dimensions, bool keep_dims,
+                int* temp_index, int* resolved_axis) {
+  T init_value = static_cast<T>(0);
+
+  auto reducer = [](const T current, const T in) -> T { return current + in; };
+  return ReduceGeneric<T>(input_data, input_dims, input_num_dims, output_data,
+                          output_dims, output_num_dims, axis,
+                          num_axis_dimensions, keep_dims, temp_index,
+                          resolved_axis, init_value, reducer);
 }
 
 // Computes the max of elements across dimensions given in axis.
@@ -3651,25 +4089,32 @@ inline bool ReduceMax(const T* input_data, const int* input_dims,
                       const int* axis, const int64_t num_axis_dimensions,
                       bool keep_dims, int* temp_index, int* resolved_axis) {
   T init_value = std::numeric_limits<T>::lowest();
-  // Reset output data.
-  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
-                               output_data)) {
-    return false;
-  }
-
-  // Resolve axis.
-  int num_resolved_axis = 0;
-  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
-                   &num_resolved_axis)) {
-    return false;
-  }
 
   auto reducer = [](const T current, const T in) -> T {
     return (in > current) ? in : current;
   };
-  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
-                      output_num_dims, resolved_axis, num_resolved_axis,
-                      temp_index, reducer, output_data);
+  return ReduceGeneric<T>(input_data, input_dims, input_num_dims, output_data,
+                          output_dims, output_num_dims, axis,
+                          num_axis_dimensions, keep_dims, temp_index,
+                          resolved_axis, init_value, reducer);
+}
+
+// Computes the min of elements across dimensions given in axis.
+template <typename T>
+inline bool ReduceMin(const T* input_data, const int* input_dims,
+                      const int input_num_dims, T* output_data,
+                      const int* output_dims, const int output_num_dims,
+                      const int* axis, const int64_t num_axis_dimensions,
+                      bool keep_dims, int* temp_index, int* resolved_axis) {
+  T init_value = std::numeric_limits<T>::max();
+
+  auto reducer = [](const T current, const T in) -> T {
+    return (in < current) ? in : current;
+  };
+  return ReduceGeneric<T>(input_data, input_dims, input_num_dims, output_data,
+                          output_dims, output_num_dims, axis,
+                          num_axis_dimensions, keep_dims, temp_index,
+                          resolved_axis, init_value, reducer);
 }
 
 // Computes the prod of elements across dimensions given in axis.
@@ -3679,23 +4124,13 @@ inline bool ReduceProd(const T* input_data, const int* input_dims,
                        const int* output_dims, const int output_num_dims,
                        const int* axis, const int64_t num_axis_dimensions,
                        bool keep_dims, int* temp_index, int* resolved_axis) {
-  // Reset output data.
-  if (!InitTensorDataForReduce(output_dims, output_num_dims, static_cast<T>(1),
-                               output_data)) {
-    return false;
-  }
-
-  // Resolve axis.
-  int num_resolved_axis = 0;
-  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
-                   &num_resolved_axis)) {
-    return false;
-  }
+  T init_value = static_cast<T>(1);
 
   auto reducer = [](const T current, const T in) -> T { return in * current; };
-  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
-                      output_num_dims, resolved_axis, num_resolved_axis,
-                      temp_index, reducer, output_data);
+  return ReduceGeneric<T>(input_data, input_dims, input_num_dims, output_data,
+                          output_dims, output_num_dims, axis,
+                          num_axis_dimensions, keep_dims, temp_index,
+                          resolved_axis, init_value, reducer);
 }
 
 // Computes the mean of elements across dimensions given in axis.
@@ -3789,11 +4224,75 @@ inline void Mean(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Computes the mean of elements across dimensions given in axis.
+// It does so in two stages, first calculates the sum of elements along the axis
+// then divides it by the number of element in axis for quantized values.
+template <typename T, typename U>
+inline bool Mean(const T* input_data, int32 input_zero_point, float input_scale,
+                 const int* input_dims, const int input_num_dims,
+                 T* output_data, int32 output_zero_point, float output_scale,
+                 const int* output_dims, const int output_num_dims,
+                 const int* axis, const int num_axis_dimensions, bool keep_dims,
+                 int* temp_index, int* resolved_axis, U* temp_sum) {
+  // Reset output data.
+  size_t num_outputs = 1;
+  for (int idx = 0; idx < output_num_dims; ++idx) {
+    size_t current = static_cast<size_t>(output_dims[idx]);
+    // Overflow prevention.
+    if (num_outputs > std::numeric_limits<size_t>::max() / current) {
+      return false;
+    }
+    num_outputs *= current;
+  }
+  for (size_t idx = 0; idx < num_outputs; ++idx) {
+    output_data[idx] = T();
+    temp_sum[idx] = U();
+  }
+
+  // Resolve axis.
+  int num_resolved_axis = 0;
+  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
+                   &num_resolved_axis)) {
+    return false;
+  }
+
+  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
+                           output_num_dims, resolved_axis, num_resolved_axis,
+                           temp_index, temp_sum)) {
+    return false;
+  }
+
+  // Calculate mean by dividing output_data by num of aggregated element.
+  U num_elements_in_axis = 1;
+  for (int idx = 0; idx < num_resolved_axis; ++idx) {
+    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    // Overflow prevention.
+    if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
+      return false;
+    }
+    num_elements_in_axis *= current;
+  }
+
+  if (num_elements_in_axis > 0) {
+    const float scale = input_scale / output_scale;
+    const float bias = -input_zero_point * scale;
+    for (size_t idx = 0; idx < num_outputs; ++idx) {
+      float float_mean = static_cast<float>(temp_sum[idx]) /
+                         static_cast<float>(num_elements_in_axis);
+
+      // Convert to float value.
+      output_data[idx] =
+          static_cast<T>(round(float_mean * scale + bias)) + output_zero_point;
+    }
+  }
+  return true;
+}
+
 template <typename T>
-void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, T* output_data,
-                       const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
+void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
+             const T* input2_data, const RuntimeShape& output_shape,
+             T* output_data) {
+  const int flat_size = MatchingFlatSize(input1_shape, output_shape);
 
   auto min_value = input2_data[0];
   for (int i = 0; i < flat_size; i++) {
@@ -3802,10 +4301,10 @@ void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
 }
 
 template <typename T>
-void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, T* output_data,
-                       const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
+void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
+             const T* input2_data, const RuntimeShape& output_shape,
+             T* output_data) {
+  const int flat_size = MatchingFlatSize(input1_shape, output_shape);
 
   auto max_value = input2_data[0];
   for (int i = 0; i < flat_size; i++) {
@@ -3813,22 +4312,41 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+template <typename T>
+void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  Minimum(DimsToShape(input1_dims), input1_data, input2_data,
+          DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  Maximum(DimsToShape(input1_dims), input1_data, input2_data,
+          DimsToShape(output_dims), output_data);
+}
+
 template <typename T, typename Op>
-void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
-                              const T* input2_data, const Dims<4>& input2_dims,
-                              T* output_data, const Dims<4>& output_dims,
-                              Op op) {
+void MaximumMinimumBroadcast4DSlow(const RuntimeShape& input1_shape,
+                                   const T* input1_data,
+                                   const RuntimeShape& input2_shape,
+                                   const T* input2_data,
+                                   const RuntimeShape& output_shape,
+                                   T* output_data, Op op) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
 
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          auto out_idx = Offset(output_dims, c, x, y, b);
-          auto in1_idx = SubscriptToIndex(desc1, c, x, y, b);
-          auto in2_idx = SubscriptToIndex(desc2, c, x, y, b);
+  for (int b = 0; b < output_shape.Dims(0); ++b) {
+    for (int y = 0; y < output_shape.Dims(1); ++y) {
+      for (int x = 0; x < output_shape.Dims(2); ++x) {
+        for (int c = 0; c < output_shape.Dims(3); ++c) {
+          auto out_idx = Offset(output_shape, b, y, x, c);
+          auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+          auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
           auto in1_val = input1_data[in1_idx];
           auto in2_val = input2_data[in2_idx];
           output_data[out_idx] = op(in1_val, in2_val);
@@ -3838,9 +4356,20 @@ void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+template <typename T, typename Op>
+void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                              const T* input2_data, const Dims<4>& input2_dims,
+                              T* output_data, const Dims<4>& output_dims,
+                              Op op) {
+  MaximumMinimumBroadcast4DSlow(DimsToShape(input1_dims), input1_data,
+                                DimsToShape(input2_dims), input2_data,
+                                DimsToShape(output_dims), output_data, op);
+}
+
 template <typename T1, typename T2, typename T3, typename Cmp>
-void ArgMinMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
-               T2* output_data, const Dims<4>& output_dims, const Cmp& cmp) {
+void ArgMinMax(const T3* axis, const RuntimeShape& input_shape,
+               const T1* input_data, const RuntimeShape& output_shape,
+               T2* output_data, const Cmp& cmp) {
   // The current ArgMax implemention can only determine the index of the maximum
   // value in the last dimension. So the axis argument is ignored.
 
@@ -3848,9 +4377,11 @@ void ArgMinMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
   // 1). For the sake of simplicity, the output dimensions are equal to the
   // input dimensions here. We enforce the constraint that the last dimension
   // must always be 1.
-  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
-  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
-  const int depth = ArraySize(input_dims, 0);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.Dims(3), 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, 3, output_shape);
+  const int depth = input_shape.Dims(3);
 
   for (int i = 0; i < outer_size; ++i) {
     auto min_max_value = input_data[i * depth];
@@ -3866,6 +4397,15 @@ void ArgMinMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Legacy Dims<4> version.
+template <typename T1, typename T2, typename T3, typename Cmp>
+void ArgMinMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
+               T2* output_data, const Dims<4>& output_dims, const Cmp& cmp) {
+  ArgMinMax(axis, DimsToShape(input_dims), input_data, DimsToShape(output_dims),
+            output_data, cmp);
+}
+
+// Legacy.
 // TODO(renjieliu): Remove this one.
 template <typename T1, typename T2, typename T3>
 void ArgMax(const T3* axis, const T1* input_data,
@@ -3998,16 +4538,26 @@ template <typename T>
 using ComparisonFn = bool (*)(T, T);
 
 template <typename T, ComparisonFn<T> F>
-inline void Comparison(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, const Dims<4>& input2_dims,
-                       bool* output_data, const Dims<4>& output_dims) {
+inline void Comparison(const RuntimeShape& input1_shape, const T* input1_data,
+                       const RuntimeShape& input2_shape, const T* input2_data,
+                       const RuntimeShape& output_shape, bool* output_data) {
   const int64_t flatsize =
-      MatchingFlatSize(input1_dims, input2_dims, output_dims);
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
   for (int64_t i = 0; i < flatsize; ++i) {
     output_data[i] = F(input1_data[i], input2_data[i]);
   }
 }
 
+// Legacy Dims<4> version.
+template <typename T, ComparisonFn<T> F>
+inline void Comparison(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, const Dims<4>& input2_dims,
+                       bool* output_data, const Dims<4>& output_dims) {
+  Comparison<T, F>(DimsToShape(input1_dims), input1_data,
+                   DimsToShape(input2_dims), input2_data,
+                   DimsToShape(output_dims), output_data);
+}
+
 template <typename T, ComparisonFn<int32> F>
 inline void Comparison(int left_shift, const T* input1_data,
                        const Dims<4>& input1_dims, int32 input1_offset,
@@ -4218,69 +4768,156 @@ inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
 }
 
 template <typename T>
-inline void Pow(const T* input1_data, const Dims<4>& input1_dims,
-                const T* input2_data, const Dims<4>& input2_dims,
-                T* output_data, const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void Pow(const RuntimeShape& input1_shape, const T* input1_data,
+                const RuntimeShape& input2_shape, const T* input2_data,
+                const RuntimeShape& output_shape, T* output_data) {
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
     output_data[i] = std::pow(input1_data[i], input2_data[i]);
   }
 }
 
+// Legacy Dims<4> version.
 template <typename T>
-inline void BroadcastPow(const T* input1_data, const Dims<4>& input1_dims,
-                         const T* input2_data, const Dims<4>& input2_dims,
-                         T* output_data, const Dims<4>& output_dims) {
+inline void Pow(const T* input1_data, const Dims<4>& input1_dims,
+                const T* input2_data, const Dims<4>& input2_dims,
+                T* output_data, const Dims<4>& output_dims) {
+  Pow(DimsToShape(input1_dims), input1_data, DimsToShape(input2_dims),
+      input2_data, DimsToShape(output_dims), output_data);
+}
+
+template <typename T>
+inline void BroadcastPow4DSlow(const RuntimeShape& input1_shape,
+                               const T* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const T* input2_data,
+                               const RuntimeShape& output_shape,
+                               T* output_data) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              std::pow(input1_data[SubscriptToIndex(desc1, c, x, y, b)],
-                       input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+
+  for (int b = 0; b < output_shape.Dims(0); ++b) {
+    for (int y = 0; y < output_shape.Dims(1); ++y) {
+      for (int x = 0; x < output_shape.Dims(2); ++x) {
+        for (int c = 0; c < output_shape.Dims(3); ++c) {
+          auto out_idx = Offset(output_shape, b, y, x, c);
+          auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+          auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+          auto in1_val = input1_data[in1_idx];
+          auto in2_val = input2_data[in2_idx];
+          output_data[out_idx] = std::pow(in1_val, in2_val);
         }
       }
     }
   }
 }
 
+// Legacy Dims<4> version.
+template <typename T>
+inline void BroadcastPow(const T* input1_data, const Dims<4>& input1_dims,
+                         const T* input2_data, const Dims<4>& input2_dims,
+                         T* output_data, const Dims<4>& output_dims) {
+  BroadcastPow4DSlow(DimsToShape(input1_dims), input1_data,
+                     DimsToShape(input2_dims), input2_data,
+                     DimsToShape(output_dims), output_data);
+}
+
+inline void Logical(const RuntimeShape& input1_shape, const bool* input1_data,
+                    const RuntimeShape& input2_shape, const bool* input2_data,
+                    const RuntimeShape& output_shape, bool* output_data,
+                    const std::function<bool(bool, bool)>& func) {
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = func(input1_data[i], input2_data[i]);
+  }
+}
+
+// Legacy Dims<4> version.
 inline void Logical(const bool* input1_data, const Dims<4>& input1_dims,
                     const bool* input2_data, const Dims<4>& input2_dims,
                     bool* output_data, const Dims<4>& output_dims,
                     const std::function<bool(bool, bool)>& func) {
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = func(input1_data[i], input2_data[i]);
+  Logical(DimsToShape(input1_dims), input1_data, DimsToShape(input2_dims),
+          input2_data, DimsToShape(output_dims), output_data, func);
+}
+
+inline void BroadcastLogical4DSlow(
+    const RuntimeShape& input1_shape, const bool* input1_data,
+    const RuntimeShape& input2_shape, const bool* input2_data,
+    const RuntimeShape& output_shape, bool* output_data,
+    const std::function<bool(bool, bool)>& func) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+
+  for (int b = 0; b < output_shape.Dims(0); ++b) {
+    for (int y = 0; y < output_shape.Dims(1); ++y) {
+      for (int x = 0; x < output_shape.Dims(2); ++x) {
+        for (int c = 0; c < output_shape.Dims(3); ++c) {
+          auto out_idx = Offset(output_shape, b, y, x, c);
+          auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+          auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+          auto in1_val = input1_data[in1_idx];
+          auto in2_val = input2_data[in2_idx];
+          output_data[out_idx] = func(in1_val, in2_val);
+        }
+      }
+    }
   }
 }
 
+// Legacy Dims<4> version.
 inline void BroadcastLogical(const bool* input1_data,
                              const Dims<4>& input1_dims,
                              const bool* input2_data,
                              const Dims<4>& input2_dims, bool* output_data,
                              const Dims<4>& output_dims,
                              const std::function<bool(bool, bool)>& func) {
+  BroadcastLogical4DSlow(DimsToShape(input1_dims), input1_data,
+                         DimsToShape(input2_dims), input2_data,
+                         DimsToShape(output_dims), output_data, func);
+}
+
+// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
+// generalized and efficient BroadcastBinaryFunction.
+//
+// Also appears to duplicte MinimumMaximum.
+//
+// R: Result type. T1: Input 1 type. T2: Input 2 type.
+template <typename R, typename T1, typename T2>
+inline void BroadcastBinaryFunction4DSlow(const RuntimeShape& input1_shape,
+                                          const T1* input1_data,
+                                          const RuntimeShape& input2_shape,
+                                          const T2* input2_data,
+                                          const RuntimeShape& output_shape,
+                                          R* output_data, R (*func)(T1, T2)) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              func(input1_data[SubscriptToIndex(desc1, c, x, y, b)],
-                   input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+
+  for (int b = 0; b < output_shape.Dims(0); ++b) {
+    for (int y = 0; y < output_shape.Dims(1); ++y) {
+      for (int x = 0; x < output_shape.Dims(2); ++x) {
+        for (int c = 0; c < output_shape.Dims(3); ++c) {
+          auto out_idx = Offset(output_shape, b, y, x, c);
+          auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+          auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+          auto in1_val = input1_data[in1_idx];
+          auto in2_val = input2_data[in2_idx];
+          output_data[out_idx] = func(in1_val, in2_val);
         }
       }
     }
   }
 }
 
-// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
-// generalized and efficient BroadcastBinaryFunction.
+// Legacy Dims<4> version.
 //
 // R: Result type. T1: Input 1 type. T2: Input 2 type.
 template <typename R, typename T1, typename T2>
@@ -4290,20 +4927,9 @@ inline void BroadcastBinaryFunction(const T1* input1_data,
                                     const Dims<4>& input2_dims, R* output_data,
                                     const Dims<4>& output_dims,
                                     R (*func)(T1, T2)) {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              func(input1_data[SubscriptToIndex(desc1, c, x, y, b)],
-                   input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
-        }
-      }
-    }
-  }
+  BroadcastBinaryFunction4DSlow(DimsToShape(input1_dims), input1_data,
+                                DimsToShape(input2_dims), input2_data,
+                                DimsToShape(output_dims), output_data, func);
 }
 
 }  // namespace reference_ops
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index c44698b677..2603ed2eb7 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -129,6 +129,13 @@ class RuntimeShape {
     }
   }
 
+  RuntimeShape(int shape_size, int32 value) : size_(0) {
+    Resize(shape_size);
+    for (int i = 0; i < shape_size; ++i) {
+      SetDim(i, value);
+    }
+  }
+
   RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
     ReplaceWith(dimensions_count, dims_data);
   }
@@ -237,7 +244,7 @@ class RuntimeShape {
   bool operator!=(const RuntimeShape& comp) const { return !((*this) == comp); }
 
  private:
-  // For use only by ExtendFrom(), written to guarantee (return-value) copy
+  // For use only by ExtendedShape(), written to guarantee (return-value) copy
   // elision in C++17.
   // This creates a shape padded to the desired size with the specified value.
   RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
@@ -645,22 +652,6 @@ void ComputeStrides(Dims<N>* dims) {
   }
 }
 
-struct PoolParams {
-  FusedActivationFunctionType activation;
-  PaddingType padding_type;
-  PaddingValues padding_values;
-  int stride_height;
-  int stride_width;
-  int filter_height;
-  int filter_width;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-  // float activation params.
-  float float_activation_min;
-  float float_activation_max;
-};
-
 enum class BroadcastableOpCategory : uint8 {
   kNone,
   kNonBroadcast,               // Matching input shapes.
@@ -669,6 +660,19 @@ enum class BroadcastableOpCategory : uint8 {
   kGenericBroadcast,           // Fall-back.
 };
 
+struct MinMax {
+  float min;
+  float max;
+};
+static_assert(sizeof(MinMax) == 8, "");
+
+struct ActivationParams {
+  FusedActivationFunctionType activation_type;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+};
+
 // For Add, Sub, Mul ops.
 struct ArithmeticParams {
   // Shape dependent / common to data / op types.
@@ -704,6 +708,211 @@ struct ArithmeticParams {
   int broadcast_shape[5];
 };
 
+struct ConcatenationParams {
+  int8 axis;
+};
+
+struct ComparisonParams {
+  // uint8 inference params.
+  int left_shift;
+  int32 input0_offset;
+  int32 input0_multiplier;
+  int input0_shift;
+  int32 input1_offset;
+  int32 input1_multiplier;
+  int input1_shift;
+  // Shape dependent / common to inference types.
+  bool is_broadcast;
+};
+
+struct ConvParams {
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  // TODO(starka): This was just "stride", so check that width+height is OK.
+  int8 stride_width;
+  int8 stride_height;
+  int8 dilation_width_factor;
+  int8 dilation_height_factor;
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+};
+
+struct DepthToSpaceParams {
+  int32 block_size;
+};
+
+struct DepthwiseParams {
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  int8 stride;
+  int8 depth_multiplier;
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+};
+
+struct FakeQuantParams {
+  MinMax minmax;
+  int32 num_bits;
+};
+
+struct FullyConnectedParams {
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+  FullyConnectedWeightsFormat weights_format;
+};
+
+struct GatherParams {
+  int8 input_rank;
+  int16 axis;
+};
+
+struct L2NormalizationParams {
+  // uint8 inference params.
+  int32 input_zero_point;
+};
+
+struct LocalResponseNormalizationParams {
+  int32 range;
+  double bias;
+  double alpha;
+  double beta;
+};
+
+struct LogisticParams {
+  // uint8 inference params.
+  int32 input_zero_point;
+  int32 input_range_radius;
+  int32 input_multiplier;
+  int input_left_shift;
+};
+
+struct LstmCellParams {
+  int32 weights_zero_point;
+  int32 accum_multiplier;
+  int accum_shift;
+  int state_integer_bits;
+};
+
+struct MeanParams {
+  int8 axis_count;
+  int16 axis[4];
+};
+
+struct PadParams {
+  int8 left_padding_count;
+  int32 left_padding[4];
+  int8 right_padding_count;
+  int32 right_padding[4];
+};
+
+struct PoolParams {
+  FusedActivationFunctionType activation;
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  int stride_height;
+  int stride_width;
+  int filter_height;
+  int filter_width;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
+struct ReshapeParams {
+  int8 shape_count;
+  int32 shape[4];
+};
+
+struct ResizeBilinearParams {
+  bool align_corners;
+};
+
+struct SliceParams {
+  int8 begin_count;
+  int32 begin[4];
+  int8 size_count;
+  int32 size[4];
+};
+
+struct SoftmaxParams {
+  // beta is not really used (not a Tensorflow parameter) and not implemented
+  // for LogSoftmax.
+  double beta;
+  // uint8 inference params.  Used even when beta defaults to 1.0.
+  int32 input_beta_multiplier;
+  int32 input_beta_left_shift;
+  // Reverse scaling is only used by LogSoftmax.
+  int32 reverse_scaling_divisor;
+  int32 reverse_scaling_right_shift;
+  int diff_min;
+};
+
+struct SpaceToBatchParams {
+  // "Zero" padding for uint8 means padding with the output offset.
+  int32 output_offset;
+};
+
+struct SpaceToDepthParams {
+  int32 block_size;
+};
+
+struct SplitParams {
+  // Graphs that split into, say, 2000 nodes are encountered.  The indices in
+  // OperatorEdges are of type uint16.
+  uint16 num_split;
+};
+
+struct SqueezeParams {
+  int8 squeeze_dims_count;
+  int32 squeeze_dims[4];
+};
+
+struct StridedSliceParams {
+  int8 start_indices_count;
+  int16 start_indices[4];
+  int8 stop_indices_count;
+  int16 stop_indices[4];
+  int8 strides_count;
+  int16 strides[4];
+
+  int16 begin_mask;
+  int16 ellipsis_mask;
+  int16 end_mask;
+  int16 new_axis_mask;
+  int16 shrink_axis_mask;
+};
+
+struct TanhParams {
+  int32 input_zero_point;
+  int32 input_range_radius;
+  int32 input_multiplier;
+  int input_left_shift;
+};
+
 template <typename T>
 inline void SetActivationParams(T min, T max, ArithmeticParams* params);
 
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index ba251c451e..74dc3f25f9 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -37,7 +37,7 @@ namespace builtin {
 namespace lstm {
 
 struct OpData {
-  // Which kernel type to use. Full kernel (18 or 20 inputs) or basic kernel
+  // Which kernel type to use. Full kernel (20 inputs) or basic kernel
   // (5 inputs).
   TfLiteLSTMKernelType kernel_type;
 
@@ -47,7 +47,7 @@ struct OpData {
   int scratch_tensor_index;
 };
 
-// For full inputs kernel (18 or 20 inputs).
+// For full inputs kernel (20-inputs).
 namespace full {
 
 // Input Tensors of size {n_batch, n_input}
@@ -81,19 +81,13 @@ constexpr int kProjectionWeightsTensor = 16;  // Optional
 // Projection bias tensor of size {n_output}
 constexpr int kProjectionBiasTensor = 17;  // Optional
 
-// If the node has 20 inputs, the following 2 tensors are used as state tensors.
-// These are defined as variable tensors, and will be modified by this op.
+// These state tensors are defined as variable tensors, and will be modified by
+// this op.
 constexpr int kInputActivationStateTensor = 18;
 constexpr int kInputCellStateTensor = 19;
 
 // Output tensors.
-// * If the node has 18 inputs, these 2 tensors are used as state tensors.
-// * If the node has 20 inputs, these 2 tensors are ignored.
-// TODO(ycling): Make the 2 output state tensors optional, and propagate the
-// state to output tensors when the 2 tensors present.
-constexpr int kOutputStateTensor = 0;
-constexpr int kCellStateTensor = 1;
-constexpr int kOutputTensor = 2;
+constexpr int kOutputTensor = 0;
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   auto* op_data = new OpData();
@@ -258,30 +252,12 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
 
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 3);
-
-  // True if the node is using input variable state tensors. It means:
-  // * The state tensors are defined as inputs. In this case it would be the
-  //   19th and 20th input tensors.
-  // * Otherwise, the output tensors are used to store states.
-  bool use_input_variable_states;
-  if (node->inputs->size == 20) {
-    use_input_variable_states = true;
-    op_data->activation_state_tensor_index =
-        node->inputs->data[kInputActivationStateTensor];
-    op_data->cell_state_tensor_index =
-        node->inputs->data[kInputCellStateTensor];
-  } else if (node->inputs->size == 18) {
-    use_input_variable_states = false;
-    op_data->activation_state_tensor_index =
-        node->outputs->data[kOutputStateTensor];
-    op_data->cell_state_tensor_index = node->outputs->data[kCellStateTensor];
-  } else {
-    context->ReportError(
-        context, "The LSTM Full kernel expects 18 or 20 inputs. Got %d inputs",
-        node->inputs->size);
-    return kTfLiteError;
-  }
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 20);
+
+  op_data->activation_state_tensor_index =
+      node->inputs->data[kInputActivationStateTensor];
+  op_data->cell_state_tensor_index = node->inputs->data[kInputCellStateTensor];
 
   // Inferring batch size, number of outputs and number of cells from the
   // input tensors.
@@ -316,31 +292,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* cell_state =
       &context->tensors[op_data->cell_state_tensor_index];
 
-  if (use_input_variable_states) {
-    // Check the shape of input state tensors.
-    // These tensor may be 1D or 2D. It's fine as long as the total size is
-    // correct.
-    TF_LITE_ENSURE_EQ(context, NumElements(activation_state),
-                      n_batch * n_output);
-    TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell);
-  } else {
-    // If the state tensors are outputs, this function takes the
-    // responsibility to resize the state tensors.
-    TfLiteIntArray* activation_state_size = TfLiteIntArrayCreate(2);
-    activation_state_size->data[0] = n_batch;
-    activation_state_size->data[1] = n_output;
-    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, activation_state,
-                                                     activation_state_size));
-
-    TfLiteIntArray* cell_size = TfLiteIntArrayCreate(2);
-    cell_size->data[0] = n_batch;
-    cell_size->data[1] = n_cell;
-    TF_LITE_ENSURE_OK(context,
-                      context->ResizeTensor(context, cell_state, cell_size));
-    // Mark state tensors as persistent tensors.
-    activation_state->allocation_type = kTfLiteArenaRwPersistent;
-    cell_state->allocation_type = kTfLiteArenaRwPersistent;
-  }
+  // Check the shape of input state tensors.
+  // These tensor may be 1D or 2D. It's fine as long as the total size is
+  // correct.
+  TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output);
+  TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell);
 
   // Resize the output tensors.
   TfLiteIntArray* output_size = TfLiteIntArrayCreate(2);
diff --git a/tensorflow/contrib/lite/kernels/lstm_test.cc b/tensorflow/contrib/lite/kernels/lstm_test.cc
index 0266f5fe57..e7ddfceb45 100644
--- a/tensorflow/contrib/lite/kernels/lstm_test.cc
+++ b/tensorflow/contrib/lite/kernels/lstm_test.cc
@@ -106,14 +106,13 @@ class LSTMOpModel : public SingleOpModel {
     input_cell_state_ =
         AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true);
 
-    output_state_ = AddOutput(TensorType_FLOAT32);
-    cell_state_ = AddOutput(TensorType_FLOAT32);
     output_ = AddOutput(TensorType_FLOAT32);
 
     SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions,
                  CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
                                    cell_clip, proj_clip)
                      .Union());
+
     BuildInterpreter(input_shapes);
   }
 
@@ -185,22 +184,6 @@ class LSTMOpModel : public SingleOpModel {
     PopulateTensor(projection_bias_, f);
   }
 
-  void ResetOutputState() {
-    const int zero_buffer_size = n_cell_ * n_batch_;
-    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
-    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
-    PopulateTensor(output_state_, 0, zero_buffer.get(),
-                   zero_buffer.get() + zero_buffer_size);
-  }
-
-  void ResetCellState() {
-    const int zero_buffer_size = n_cell_ * n_batch_;
-    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
-    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
-    PopulateTensor(cell_state_, 0, zero_buffer.get(),
-                   zero_buffer.get() + zero_buffer_size);
-  }
-
   void SetInput(int offset, const float* begin, const float* end) {
     PopulateTensor(input_, offset, const_cast<float*>(begin),
                    const_cast<float*>(end));
@@ -469,10 +452,6 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
   lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
 
-  // Resetting cell_state and output_state
-  lstm.ResetCellState();
-  lstm.ResetOutputState();
-
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
@@ -529,10 +508,6 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
   lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
   lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
 
-  // Resetting cell_state and output_state
-  lstm.ResetCellState();
-  lstm.ResetOutputState();
-
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm,
                 /*tolerance=*/0.0157651);
 }
@@ -637,10 +612,6 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   lstm.SetCellToForgetWeights(cell_to_forget_weights_);
   lstm.SetCellToOutputWeights(cell_to_output_weights_);
 
-  // Resetting cell_state and output_state
-  lstm.ResetCellState();
-  lstm.ResetOutputState();
-
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
@@ -698,14 +669,10 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
   lstm.SetCellToForgetWeights(cell_to_forget_weights_);
   lstm.SetCellToOutputWeights(cell_to_output_weights_);
 
-  // Resetting cell_state and output_state
-  lstm.ResetCellState();
-  lstm.ResetOutputState();
-
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.03573);
 }
 
-class NoCifgPeepholeProjectionClippingLstmTest : public BaseLstmTest {
+class NoCifgPeepholeProjectionNoClippingLstmTest : public BaseLstmTest {
   void SetUp() override {
     input_to_input_weights_ = {
         0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
@@ -1304,7 +1271,7 @@ class NoCifgPeepholeProjectionClippingLstmTest : public BaseLstmTest {
   }
 };
 
-TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) {
+TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 20;
@@ -1362,14 +1329,10 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) {
 
   lstm.SetProjectionWeights(projection_weights_);
 
-  // Resetting cell_state and output_state
-  lstm.ResetCellState();
-  lstm.ResetOutputState();
-
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 20;
@@ -1428,10 +1391,6 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) {
 
   lstm.SetProjectionWeights(projection_weights_);
 
-  // Resetting cell_state and output_state
-  lstm.ResetCellState();
-  lstm.ResetOutputState();
-
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467);
 }
 
diff --git a/tensorflow/contrib/lite/kernels/mfcc.cc b/tensorflow/contrib/lite/kernels/mfcc.cc
index 3f5bc4d68a..306f676619 100644
--- a/tensorflow/contrib/lite/kernels/mfcc.cc
+++ b/tensorflow/contrib/lite/kernels/mfcc.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/internal/mfcc.h"
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
diff --git a/tensorflow/contrib/lite/kernels/mfcc_test.cc b/tensorflow/contrib/lite/kernels/mfcc_test.cc
index 0291ca8c1c..c9124adcaf 100644
--- a/tensorflow/contrib/lite/kernels/mfcc_test.cc
+++ b/tensorflow/contrib/lite/kernels/mfcc_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"
+#include "flatbuffers/flexbuffers.h"  // flatbuffers
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc
index 349f3e6726..561e39cfc6 100644
--- a/tensorflow/contrib/lite/kernels/mul.cc
+++ b/tensorflow/contrib/lite/kernels/mul.cc
@@ -93,7 +93,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         input1->params.scale * input2->params.scale / output->params.scale;
     QuantizeMultiplierSmallerThanOneExp(
         real_multiplier, &data->output_multiplier, &data->output_shift);
-    data->output_shift *= -1;
   }
 
   return context->ResizeTensor(context, output, output_size);
@@ -161,9 +160,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     // The quantized version of Mul doesn't support activations, so we
     // always use BroadcastMul.
     if (kernel_type == kReference) {
-      TF_LITE_MUL(reference_ops, BroadcastMul);
+      TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
     } else {
-      TF_LITE_MUL(optimized_ops, BroadcastMul);
+      TF_LITE_MUL(optimized_ops, BroadcastMul4DSlow);
     }
 #undef TF_LITE_MUL
   } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h
index 7568eaa88e..d66364c4d8 100644
--- a/tensorflow/contrib/lite/kernels/op_macros.h
+++ b/tensorflow/contrib/lite/kernels/op_macros.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
 
 #include <cstdio>
 
@@ -31,4 +31,4 @@ limitations under the License.
     if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \
   } while (0)
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
index 1c728a4733..90a915bb02 100644
--- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
+++ b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
@@ -101,8 +101,6 @@ class LSTMOpModel : public SingleOpModel {
     input_cell_state_ =
         AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true);
 
-    output_state_ = AddOutput(TensorType_FLOAT32);
-    cell_state_ = AddOutput(TensorType_FLOAT32);
     output_ = AddOutput(TensorType_FLOAT32);
 
     SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions,
@@ -180,22 +178,6 @@ class LSTMOpModel : public SingleOpModel {
     PopulateTensor(projection_bias_, f);
   }
 
-  void ResetOutputState() {
-    const int zero_buffer_size = n_cell_ * n_batch_;
-    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
-    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
-    PopulateTensor(output_state_, 0, zero_buffer.get(),
-                   zero_buffer.get() + zero_buffer_size);
-  }
-
-  void ResetCellState() {
-    const int zero_buffer_size = n_cell_ * n_batch_;
-    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
-    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
-    PopulateTensor(cell_state_, 0, zero_buffer.get(),
-                   zero_buffer.get() + zero_buffer_size);
-  }
-
   void SetInput(int offset, float* begin, float* end) {
     PopulateTensor(input_, offset, begin, end);
   }
@@ -238,8 +220,6 @@ class LSTMOpModel : public SingleOpModel {
   int input_cell_state_;
 
   int output_;
-  int output_state_;
-  int cell_state_;
 
   int n_batch_;
   int n_input_;
@@ -324,10 +304,6 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
   lstm.SetCellToOutputWeights(
       {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
 
-  // Resetting cell_state and output_state
-  lstm.ResetCellState();
-  lstm.ResetOutputState();
-
   // Verify the model by unpacking it.
   lstm.Verify();
 }
diff --git a/tensorflow/contrib/lite/kernels/pack.cc b/tensorflow/contrib/lite/kernels/pack.cc
index bb3416f6a6..cc326a7d51 100644
--- a/tensorflow/contrib/lite/kernels/pack.cc
+++ b/tensorflow/contrib/lite/kernels/pack.cc
@@ -27,24 +27,9 @@ namespace {
 
 constexpr int kOutputTensor = 0;
 
-// Op data for pack op.
-struct OpData {
-  int values_count;
-  int axis;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* data = new OpData;
-  data->axis = 0;
-  return data;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+  const TfLitePackParams* data =
+      reinterpret_cast<TfLitePackParams*>(node->builtin_data);
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), data->values_count);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -54,9 +39,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, NumDimensions(input0) >= data->axis);
   // TODO(renjieliu): Support negative axis.
   TF_LITE_ENSURE(context, data->axis >= 0);
-  if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32) {
+  if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32 &&
+      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16) {
     context->ReportError(context,
-                         "Currently pack only supports int32 and float32.");
+                         "Currently pack only supports "
+                         "float32/uint8/int16/int32.");
     return kTfLiteError;
   }
   // Make sure all inputs have the same shape and type.
@@ -82,6 +69,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   TF_LITE_ENSURE_EQ(context, output->type, input0->type);
 
+  // Guarantee input/output quantization params match as we do not support
+  // packing quantized tensors.
+  for (int i = 0; i < data->values_count; i++) {
+    const TfLiteTensor* input = GetInput(context, node, i);
+    TF_LITE_ENSURE_EQ(context, input->params.zero_point,
+                      output->params.zero_point);
+    TF_LITE_ENSURE_EQ(context, input->params.scale, output->params.scale);
+  }
+
   return context->ResizeTensor(context, output, output_shape);
 }
 
@@ -95,7 +91,8 @@ void PackImpl(TfLiteContext* context, TfLiteNode* node, TfLiteTensor* output,
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+  const TfLitePackParams* data =
+      reinterpret_cast<TfLitePackParams*>(node->builtin_data);
 
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   switch (output->type) {
@@ -103,13 +100,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       PackImpl<float>(context, node, output, data->values_count, data->axis);
       break;
     }
+    case kTfLiteUInt8: {
+      PackImpl<uint8_t>(context, node, output, data->values_count, data->axis);
+      break;
+    }
     case kTfLiteInt32: {
       PackImpl<int32_t>(context, node, output, data->values_count, data->axis);
       break;
     }
     default: {
       context->ReportError(context,
-                           "Currently pack only supports int32 and float32.");
+                           "Currently pack only supports "
+                           "float32/uint8/int32.");
       return kTfLiteError;
     }
   }
@@ -121,8 +123,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace pack
 
 TfLiteRegistration* Register_PACK() {
-  static TfLiteRegistration r = {pack::Init, pack::Free, pack::Prepare,
-                                 pack::Eval};
+  static TfLiteRegistration r = {nullptr, nullptr, pack::Prepare, pack::Eval};
   return &r;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/pack_test.cc b/tensorflow/contrib/lite/kernels/pack_test.cc
index 485a50ad3a..c70dbd2764 100644
--- a/tensorflow/contrib/lite/kernels/pack_test.cc
+++ b/tensorflow/contrib/lite/kernels/pack_test.cc
@@ -51,6 +51,7 @@ class PackOpModel : public SingleOpModel {
   int output_;
 };
 
+// float32 tests.
 TEST(PackOpTest, FloatThreeInputs) {
   PackOpModel<float> model({TensorType_FLOAT32, {2}}, 0, 3);
   model.SetInput(0, {1, 4});
@@ -81,7 +82,8 @@ TEST(PackOpTest, FloatMultilDimensions) {
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
-TEST(PackOpTest, IntThreeInputs) {
+// int32 tests.
+TEST(PackOpTest, Int32ThreeInputs) {
   PackOpModel<int32_t> model({TensorType_INT32, {2}}, 0, 3);
   model.SetInput(0, {1, 4});
   model.SetInput(1, {2, 5});
@@ -91,7 +93,7 @@ TEST(PackOpTest, IntThreeInputs) {
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
 }
 
-TEST(PackOpTest, IntThreeInputsDifferentAxis) {
+TEST(PackOpTest, Int32ThreeInputsDifferentAxis) {
   PackOpModel<int32_t> model({TensorType_INT32, {2}}, 1, 3);
   model.SetInput(0, {1, 4});
   model.SetInput(1, {2, 5});
@@ -101,7 +103,7 @@ TEST(PackOpTest, IntThreeInputsDifferentAxis) {
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
 
-TEST(PackOpTest, IntMultilDimensions) {
+TEST(PackOpTest, Int32MultilDimensions) {
   PackOpModel<int32_t> model({TensorType_INT32, {2, 3}}, 1, 2);
   model.SetInput(0, {1, 2, 3, 4, 5, 6});
   model.SetInput(1, {7, 8, 9, 10, 11, 12});
@@ -110,6 +112,38 @@ TEST(PackOpTest, IntMultilDimensions) {
   EXPECT_THAT(model.GetOutput(),
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
+
+// uint8
+TEST(PackOpTest, Uint8ThreeInputs) {
+  PackOpModel<uint8_t> model({TensorType_UINT8, {2}}, 0, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(PackOpTest, Uint8ThreeInputsDifferentAxis) {
+  PackOpModel<uint8_t> model({TensorType_UINT8, {2}}, 1, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(PackOpTest, Uint8MultilDimensions) {
+  PackOpModel<uint8_t> model({TensorType_UINT8, {2, 3}}, 1, 2);
+  model.SetInput(0, {1, 2, 3, 4, 5, 6});
+  model.SetInput(1, {7, 8, 9, 10, 11, 12});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/reduce.cc b/tensorflow/contrib/lite/kernels/reduce.cc
index e99f67c725..839b48cb83 100644
--- a/tensorflow/contrib/lite/kernels/reduce.cc
+++ b/tensorflow/contrib/lite/kernels/reduce.cc
@@ -256,11 +256,27 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int64_t, int64_t));
         break;
       case kTfLiteUInt8:
-        TF_LITE_ENSURE_EQ(context, op_context.input->params.scale,
-                          op_context.output->params.scale);
-        TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point,
-                          op_context.output->params.zero_point);
-        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int));
+        if (op_context.input->params.zero_point ==
+                op_context.output->params.zero_point &&
+            op_context.input->params.scale == op_context.output->params.scale) {
+          TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int));
+        } else {
+          TF_LITE_ENSURE(
+              context,
+              reference_ops::Mean<>(
+                  GetTensorData<uint8_t>(op_context.input),
+                  op_context.input->params.zero_point,
+                  op_context.input->params.scale, op_context.input->dims->data,
+                  op_context.input->dims->size,
+                  GetTensorData<uint8_t>(op_context.output),
+                  op_context.output->params.zero_point,
+                  op_context.output->params.scale,
+                  op_context.output->dims->data, op_context.output->dims->size,
+                  GetTensorData<int>(op_context.axis), num_axis,
+                  op_context.params->keep_dims, GetTensorData<int>(temp_index),
+                  GetTensorData<int>(resolved_axis),
+                  GetTensorData<int>(temp_sum)));
+        }
         break;
       default:
         return kTfLiteError;
@@ -412,6 +428,54 @@ TfLiteStatus EvalMax(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
+template <KernelType kernel_type>
+TfLiteStatus EvalMin(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+  int64_t num_axis = NumElements(op_context.axis);
+  TfLiteTensor* temp_index = GetTemporary(context, node, /*index=*/0);
+  TfLiteTensor* resolved_axis = GetTemporary(context, node, /*index=*/1);
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeTempAxis(context, &op_context, resolved_axis));
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  }
+
+#define TF_LITE_MIN(kernel_type, data_type)                         \
+  kernel_type::ReduceMin<>(                                         \
+      GetTensorData<data_type>(op_context.input),                   \
+      op_context.input->dims->data, op_context.input->dims->size,   \
+      GetTensorData<data_type>(op_context.output),                  \
+      op_context.output->dims->data, op_context.output->dims->size, \
+      GetTensorData<int>(op_context.axis), num_axis,                \
+      op_context.params->keep_dims, GetTensorData<int>(temp_index), \
+      GetTensorData<int>(resolved_axis))
+
+  if (kernel_type == kReference) {
+    switch (op_context.input->type) {
+      case kTfLiteFloat32:
+        TF_LITE_ENSURE(context, TF_LITE_MIN(reference_ops, float));
+        break;
+      case kTfLiteInt32:
+        TF_LITE_ENSURE(context, TF_LITE_MIN(reference_ops, int));
+        break;
+      case kTfLiteInt64:
+        TF_LITE_ENSURE(context, TF_LITE_MIN(reference_ops, int64_t));
+        break;
+      case kTfLiteUInt8:
+        TF_LITE_ENSURE_EQ(context, op_context.input->params.scale,
+                          op_context.output->params.scale);
+        TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point,
+                          op_context.output->params.zero_point);
+        TF_LITE_ENSURE(context, TF_LITE_MIN(reference_ops, uint8_t));
+        break;
+      default:
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_MIN
+  return kTfLiteOk;
+}
 }  // namespace reduce
 
 TfLiteRegistration* Register_MEAN_REF() {
@@ -442,6 +506,13 @@ TfLiteRegistration* Register_REDUCE_MAX_REF() {
   return &r;
 }
 
+TfLiteRegistration* Register_REDUCE_MIN_REF() {
+  static TfLiteRegistration r = {reduce::Init, reduce::Free,
+                                 reduce::PrepareSimple,
+                                 reduce::EvalMin<reduce::kReference>};
+  return &r;
+}
+
 // TODO(kanlig): add optimized implementation of Mean.
 TfLiteRegistration* Register_MEAN() { return Register_MEAN_REF(); }
 TfLiteRegistration* Register_SUM() { return Register_SUM_REF(); }
@@ -449,6 +520,7 @@ TfLiteRegistration* Register_REDUCE_PROD() {
   return Register_REDUCE_PROD_REF();
 }
 TfLiteRegistration* Register_REDUCE_MAX() { return Register_REDUCE_MAX_REF(); }
+TfLiteRegistration* Register_REDUCE_MIN() { return Register_REDUCE_MIN_REF(); }
 
 }  // namespace builtin
 }  // namespace ops
diff --git a/tensorflow/contrib/lite/kernels/reduce_test.cc b/tensorflow/contrib/lite/kernels/reduce_test.cc
index 5d432d34ef..69a07f76b6 100644
--- a/tensorflow/contrib/lite/kernels/reduce_test.cc
+++ b/tensorflow/contrib/lite/kernels/reduce_test.cc
@@ -169,6 +169,35 @@ class MaxOpDynamicModel : public BaseOpModel {
   }
 };
 
+// Model for the tests case where axis is a const tensor.
+class MinOpConstModel : public BaseOpModel {
+ public:
+  MinOpConstModel(const TensorData& input, const TensorData& output,
+                  std::initializer_list<int> axis_shape,
+                  std::initializer_list<int> axis, bool keep_dims) {
+    input_ = AddInput(input);
+    axis_ = AddConstInput(TensorType_INT32, axis, axis_shape);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_REDUCE_MIN, BuiltinOptions_ReducerOptions,
+                 CreateReducerOptions(builder_, keep_dims).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+};
+
+// Model for the tests case where axis is a dynamic tensor.
+class MinOpDynamicModel : public BaseOpModel {
+ public:
+  MinOpDynamicModel(const TensorData& input, const TensorData& output,
+                    const TensorData& axis, bool keep_dims) {
+    input_ = AddInput(input);
+    axis_ = AddInput(axis);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_REDUCE_MIN, BuiltinOptions_ReducerOptions,
+                 CreateReducerOptions(builder_, keep_dims).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+};
+
 // for quantized Add, the error shouldn't exceed step
 float GetTolerance(int min, int max) { return (max - min) / 255.0; }
 
@@ -309,6 +338,33 @@ TEST(DynamicUint8MeanOpTest, KeepDims) {
       ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance)));
 }
 
+TEST(DynamicUint8MeanOpTest, QuantizedScalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {0.643};
+  MeanOpDynamicModel m({TensorType_UINT8, {}, 0.0, 1.0},
+                       {TensorType_UINT8, {}, -10.0, 12.0},
+                       {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({0.643}, kQuantizedTolerance)));
+}
+
+TEST(ConstUint8MeanOpTest, QuantizedKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 5.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MeanOpConstModel m({TensorType_UINT8, {3, 2}, 0.0, 1.0},
+                     {TensorType_UINT8, {3}, -5.0, 5.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
+}
+
 // Tests for reduce_sum
 
 TEST(ConstFloatSumOpTest, NotKeepDims) {
@@ -665,6 +721,147 @@ TEST(DynamicUint8MaxOpTest, Scalar) {
               ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
 }
 
+// Tests for reduce_min
+
+TEST(ConstFloatMinOpTest, NotKeepDims) {
+  std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                             9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                             17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MinOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}},
+                    {4}, {1, 0, -3, -3}, false);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({1, 2})));
+}
+
+TEST(ConstFloatMinOpTest, KeepDims) {
+  std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                             9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                             17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MinOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}},
+                    {2}, {0, 2}, true);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({1, 3, 5})));
+}
+
+TEST(DynamicFloatMinOpTest, NotKeepDims) {
+  std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                             9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                             17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MinOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}},
+                      {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}},
+                      false);
+  std::vector<int> axis = {1, 0, -3, -3};
+  m.SetAxis(axis);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({1, 2})));
+}
+
+TEST(DynamicFloatMinOpTest, KeepDims) {
+  std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                             9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                             17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MinOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}},
+                      {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, true);
+  std::vector<int> axis = {0, 2};
+  m.SetAxis(axis);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({1, 3, 5})));
+}
+
+TEST(DynamicFloatMinOpTest, Scale) {
+  std::vector<float> data = {9.527};
+  MinOpDynamicModel m({TensorType_FLOAT32, {1}}, {TensorType_FLOAT32, {1}},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.SetAxis(axis);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527})));
+}
+
+TEST(ConstUint8MinOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MinOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0},
+                    {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({0.294117, 0.2}, kQuantizedTolerance)));
+}
+
+TEST(ConstUint8MinOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MinOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0},
+                    {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({0.2, 0.3, 0.5}, kQuantizedTolerance)));
+}
+
+TEST(DynamicUint8MinOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::vector<float> data = {1.3, -4.8, -3.6, 0.24};
+  MinOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0},
+                      {TensorType_UINT8, {2}, -5.0, 2.0},
+                      {TensorType_INT32, {1}}, false);
+  std::vector<int> axis = {1};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({-4.807843, -3.6}, kQuantizedTolerance)));
+}
+
+TEST(DynamicUint8MinOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14, -0.14, 7.423, 0.879};
+  MinOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0},
+                      {TensorType_UINT8, {2}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear({7.427451, -0.164706}, kQuantizedTolerance)));
+}
+
+TEST(DynamicUint8MinOpTest, Scalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14};
+  MinOpDynamicModel m({TensorType_UINT8, {}, -10.0, 12.0},
+                      {TensorType_UINT8, {}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 8d2c108116..341fd14127 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/util.h"
 
 namespace tflite {
 namespace ops {
@@ -93,6 +94,7 @@ TfLiteRegistration* Register_NEG();
 TfLiteRegistration* Register_SUM();
 TfLiteRegistration* Register_REDUCE_PROD();
 TfLiteRegistration* Register_REDUCE_MAX();
+TfLiteRegistration* Register_REDUCE_MIN();
 TfLiteRegistration* Register_SELECT();
 TfLiteRegistration* Register_SLICE();
 TfLiteRegistration* Register_SIN();
@@ -111,6 +113,7 @@ TfLiteRegistration* Register_ONE_HOT();
 TfLiteRegistration* Register_LOGICAL_OR();
 TfLiteRegistration* Register_LOGICAL_AND();
 TfLiteRegistration* Register_LOGICAL_NOT();
+TfLiteRegistration* Register_UNPACK();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -127,9 +130,9 @@ const TfLiteRegistration* BuiltinOpResolver::FindOp(tflite::BuiltinOperator op,
 
 const TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op,
                                                     int version) const {
-  // Return the NULL Op for all ops whose name start with "Eager:", allowing
+  // Return the NULL Op for all ops whose name start with "Eager", allowing
   // the interpreter to delegate their execution.
-  if (string(op).find("Eager:") == 0) {
+  if (IsEagerOp(op)) {
     static TfLiteRegistration null_op{
         nullptr, nullptr, &UnsupportedTensorFlowOp,
         nullptr, nullptr, BuiltinOperator_CUSTOM,
@@ -218,6 +221,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SUM, Register_SUM());
   AddBuiltin(BuiltinOperator_REDUCE_PROD, Register_REDUCE_PROD());
   AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX());
+  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN());
   AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS());
   AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
   AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL());
@@ -232,6 +236,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
   AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
   AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT());
+  AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
index 6d4912ce3a..6ba7959752 100644
--- a/tensorflow/contrib/lite/kernels/svdf.cc
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -40,19 +40,22 @@ namespace {
 struct OpData {
   int scratch_tensor_index;
   bool float_weights_time_initialized;
+
+  int activation_state_tensor_index;
 };
 
 static inline void ApplyTimeWeightsBiasAndActivation(
     int batch_size, int memory_size, int num_filters, int num_units, int rank,
     const TfLiteTensor* weights_time, const TfLiteTensor* bias,
-    TfLiteFusedActivation activation, TfLiteTensor* state,
+    TfLiteFusedActivation activation, TfLiteTensor* activation_state,
     TfLiteTensor* scratch, TfLiteTensor* output) {
   // Compute matmul(state, weights_time).
   // The right most column is used to save temporary output (with the size of
-  // num_filters). This is achieved by starting at state->data.f and having the
-  // stride equal to memory_size.
+  // num_filters). This is achieved by starting at activation_state->data.f,
+  // and having the stride equal to memory_size.
   for (int b = 0; b < batch_size; ++b) {
-    float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
+    float* state_ptr_batch =
+        activation_state->data.f + b * memory_size * num_filters;
     float* scratch_ptr_batch = scratch->data.f + b * num_filters;
     tensor_utils::BatchVectorBatchVectorDotProduct(
         weights_time->data.f, state_ptr_batch, memory_size, num_filters,
@@ -82,13 +85,14 @@ static inline void ApplyTimeWeightsBiasAndActivation(
                                           activation, output_ptr_batch);
   }
 
-  // Left shift the state to make room for next cycle's activation.
+  // Left shift the activation_state to make room for next cycle's activation.
   // TODO(alanchiao): explore collapsing this into a single loop.
   for (int b = 0; b < batch_size; ++b) {
-    float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
+    float* state_ptr_batch =
+        activation_state->data.f + b * memory_size * num_filters;
     for (int f = 0; f < num_filters; ++f) {
       tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
-                                    /*shift_value=*/0.0);
+                                    /*shift_value=*/0.0f);
       state_ptr_batch += memory_size;
     }
   }
@@ -96,12 +100,16 @@ static inline void ApplyTimeWeightsBiasAndActivation(
 
 }  // namespace
 
+// Input tensors.
 constexpr int kInputTensor = 0;
 constexpr int kWeightsFeatureTensor = 1;
 constexpr int kWeightsTimeTensor = 2;
 constexpr int kBiasTensor = 3;
-constexpr int kStateTensor = 0;
-constexpr int kOutputTensor = 1;
+// This is a variable tensor, and will be modified by this op.
+constexpr int kInputActivationStateTensor = 4;
+
+// Output tensor.
+constexpr int kOutputTensor = 0;
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   auto* op_data = new OpData();
@@ -121,8 +129,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int scratch_tensor_index = op_data->scratch_tensor_index;
 
   // Check we have all the inputs and outputs we need.
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
+  op_data->activation_state_tensor_index =
+      node->inputs->data[kInputActivationStateTensor];
 
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* weights_feature =
@@ -148,22 +158,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
   }
 
-  TfLiteTensor* state = GetOutput(context, node, kStateTensor);
+  TfLiteTensor* activation_state =
+      &context->tensors[op_data->activation_state_tensor_index];
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  // Resize state.
-  // For each batch, the state is a 2-D tensor: memory_size * num_filters
-  // The left most column is used to save current cycle activation.
-  // The right most column is used to save temporary output which will be
-  // reduced to num_units outputs.
-  TfLiteIntArray* state_size_array = TfLiteIntArrayCreate(2);
-  state_size_array->data[0] = batch_size;
-  state_size_array->data[1] = memory_size * num_filters;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, state, state_size_array));
-
-  // Mark state as a persistent tensor.
-  state->allocation_type = kTfLiteArenaRwPersistent;
+  // Check the shape of input state tensors.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(activation_state, 0), batch_size);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(activation_state, 1),
+                    memory_size * num_filters);
 
   // Resize output.
   TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
@@ -220,8 +223,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                                                        scaling_factors_size));
     }
 
-    // Used to store dequantized weights_time matrix for hybrid computation
-    // of matmul(state, weights_time), which occurs in floating point.
+    // Used to store dequantized weights_time matrix for hybrid computation of
+    // matmul(activation_state, weights_time), which occurs in floating point.
     node->temporaries->data[3] = scratch_tensor_index + 3;
     TfLiteTensor* float_weights_time = GetTemporary(context, node, /*index=*/3);
     float_weights_time->type = kTfLiteFloat32;
@@ -253,13 +256,13 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
   const int memory_size = weights_time->dims->data[1];
 
   // Clear the activation (state left most column).
-  // TODO(ghodrat): Add a test which initialize state with invalid values in
-  // left most column and make sure it passes.
+  // TODO(ghodrat): Add a test which initialize activation_state with invalid
+  // values in left most column and make sure it passes.
   for (int b = 0; b < batch_size; ++b) {
     float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
     for (int c = 0; c < num_filters; ++c) {
       float* state_ptr = state_ptr_batch + c * memory_size;
-      state_ptr[memory_size - 1] = 0.0;
+      state_ptr[memory_size - 1] = 0.0f;
     }
   }
 
@@ -307,7 +310,7 @@ TfLiteStatus EvalHybrid(
 
   // Clear the activation (state left most column).
   // TODO(ghodrat): Add a test which initialize state with invalid values in
-  // left most column and make sure it passes.
+  // the left most column and make sure it passes.
   for (int b = 0; b < batch_size; ++b) {
     float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
     for (int c = 0; c < num_filters; ++c) {
@@ -329,9 +332,10 @@ TfLiteStatus EvalHybrid(
     }
 
     // Compute conv1d(inputs, weights_feature).
-    // The state right most column is used to save current cycle activation.
-    // This is achieved by starting at state->data.f[memory_size - 1] and having
-    // the stride equal to memory_size.
+    // The rightmost column of state is used to save the current cycle
+    // activation.
+    // This is achieved by starting at state->data.f[memory_size - 1]
+    // and having the stride equal to memory_size.
     tensor_utils::MatrixBatchVectorMultiplyAccumulate(
         weights_feature_ptr, num_filters, input_size, quantized_input_ptr_batch,
         scaling_factors_ptr, batch_size, &state->data.f[memory_size - 1],
@@ -359,13 +363,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
 
-  TfLiteTensor* state = GetOutput(context, node, kStateTensor);
+  TfLiteTensor* activation_state =
+      &context->tensors[op_data->activation_state_tensor_index];
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   switch (weights_feature->type) {
     case kTfLiteFloat32: {
       return EvalFloat(context, node, input, weights_feature, weights_time,
-                       bias, params, scratch, state, output);
+                       bias, params, scratch, activation_state, output);
       break;
     }
     case kTfLiteUInt8: {
@@ -392,7 +397,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       }
       return EvalHybrid(context, node, input, weights_feature,
                         float_weights_time, bias, params, scratch,
-                        scaling_factors, input_quantized, state, output);
+                        scaling_factors, input_quantized, activation_state,
+                        output);
       break;
     }
     default:
diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc
index 5af3ff8500..6d60dc63f4 100644
--- a/tensorflow/contrib/lite/kernels/svdf_test.cc
+++ b/tensorflow/contrib/lite/kernels/svdf_test.cc
@@ -141,16 +141,20 @@ class BaseSVDFOpModel : public SingleOpModel {
     weights_feature_ = AddInput(weights_feature_type);
     weights_time_ = AddInput(weights_time_type);
     bias_ = AddNullInput();
-    state_ = AddOutput(TensorType_FLOAT32);
+    const int num_filters = units * rank;
+    activation_state_ = AddInput(
+        TensorData{TensorType_FLOAT32, {batches, memory_size * num_filters}},
+        /*is_variable=*/true);
     output_ = AddOutput(TensorType_FLOAT32);
     SetBuiltinOp(
         BuiltinOperator_SVDF, BuiltinOptions_SVDFOptions,
         CreateSVDFOptions(builder_, rank, ActivationFunctionType_NONE).Union());
     BuildInterpreter({
-        {batches_, input_size_},        // Input tensor
-        {units_ * rank, input_size_},   // weights_feature tensor
-        {units_ * rank, memory_size_},  // weights_time tensor
-        {units_}                        // bias tensor
+        {batches_, input_size_},              // input tensor
+        {units_ * rank, input_size_},         // weights_feature tensor
+        {units_ * rank, memory_size_},        // weights_time tensor
+        {units_},                             // bias tensor
+        {batches, memory_size * num_filters}  // activation_state tensor
     });
   }
 
@@ -169,15 +173,6 @@ class BaseSVDFOpModel : public SingleOpModel {
     PopulateTensor(input_, offset, begin, end);
   }
 
-  // Resets the state of SVDF op by filling it with 0's.
-  void ResetState() {
-    const int zero_buffer_size = rank_ * units_ * batches_ * memory_size_;
-    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
-    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
-    PopulateTensor(state_, 0, zero_buffer.get(),
-                   zero_buffer.get() + zero_buffer_size);
-  }
-
   // Extracts the output tensor from the SVDF op.
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 
@@ -190,7 +185,7 @@ class BaseSVDFOpModel : public SingleOpModel {
   int weights_feature_;
   int weights_time_;
   int bias_;
-  int state_;
+  int activation_state_;
   int output_;
 
   int batches_;
@@ -274,7 +269,6 @@ TEST_F(SVDFOpTest, BlackBoxTestRank1) {
        -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
        -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657});
 
-  svdf.ResetState();
   VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input),
                 &svdf);
 }
@@ -314,7 +308,6 @@ TEST_F(SVDFOpTest, BlackBoxTestRank2) {
        0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
        0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
 
-  svdf.ResetState();
   VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input),
                 &svdf);
 }
@@ -339,7 +332,6 @@ TEST_F(SVDFOpTest, BlackBoxTestHybridRank1) {
        -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
        -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657});
 
-  svdf.ResetState();
   VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input),
                 &svdf,
                 /*tolerance=*/0.002945);
@@ -380,7 +372,6 @@ TEST_F(SVDFOpTest, BlackBoxTestHybridRank2) {
        0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
        0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
 
-  svdf.ResetState();
   VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input),
                 &svdf,
                 /*tolerance=*/0.00625109);
diff --git a/tensorflow/contrib/lite/kernels/unpack.cc b/tensorflow/contrib/lite/kernels/unpack.cc
new file mode 100644
index 0000000000..4998f88b41
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unpack.cc
@@ -0,0 +1,130 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unpack {
+namespace {
+
+constexpr int kInputTensor = 0;
+
+// Op data for unpack op.
+struct OpData {
+  int num;
+  int axis;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->axis = 0;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), data->num);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
+  TF_LITE_ENSURE(context, NumDimensions(input) > 1);
+  TF_LITE_ENSURE(context, NumDimensions(input) > data->axis);
+  // TODO(renjieliu): Support negative axis.
+  TF_LITE_ENSURE(context, data->axis >= 0);
+  if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32) {
+    context->ReportError(context,
+                         "Currently pack only supports int32 and float32.");
+    return kTfLiteError;
+  }
+
+  const TfLiteIntArray* input_shape = input->dims;
+  // Num should be equal to the shape[axis].
+  // Resize outputs. rank will be R - 1.
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(NumDimensions(input) - 1);
+  int o = 0;
+  for (int index = 0; index < NumDimensions(input); ++index) {
+    if (index != data->axis) {
+      output_shape->data[o++] = input_shape->data[index];
+    }
+  }
+
+  TF_LITE_ENSURE_EQ(context, data->num, input_shape->data[data->axis]);
+  for (int i = 0; i < data->num; ++i) {
+    TfLiteIntArray* copied_output_shape = TfLiteIntArrayCopy(output_shape);
+    TfLiteTensor* output = GetOutput(context, node, i);
+    TF_LITE_ENSURE_EQ(context, output->type, input->type);
+    TF_LITE_ENSURE_OK(
+        context, context->ResizeTensor(context, output, copied_output_shape));
+  }
+
+  TfLiteIntArrayFree(output_shape);
+  return kTfLiteOk;
+}
+
+template <typename T>
+void UnpackImpl(TfLiteContext* context, TfLiteNode* node,
+                const TfLiteTensor* input, int output_count, int axis) {
+  VectorOfTensors<T> all_outputs(*context, *node->outputs);
+  reference_ops::Unpack<T>(axis, GetTensorData<T>(input), GetTensorDims(input),
+                           NumDimensions(input), output_count,
+                           all_outputs.data(), **all_outputs.dims());
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      UnpackImpl<float>(context, node, input, data->num, data->axis);
+      break;
+    }
+    case kTfLiteInt32: {
+      UnpackImpl<int32_t>(context, node, input, data->num, data->axis);
+      break;
+    }
+    default: {
+      context->ReportError(context,
+                           "Currently pack only supports int32 and float32.");
+      return kTfLiteError;
+    }
+  }
+
+  return kTfLiteOk;
+}
+}  // namespace
+}  // namespace unpack
+
+TfLiteRegistration* Register_UNPACK() {
+  static TfLiteRegistration r = {unpack::Init, unpack::Free, unpack::Prepare,
+                                 unpack::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/unpack_test.cc b/tensorflow/contrib/lite/kernels/unpack_test.cc
new file mode 100644
index 0000000000..4efc92a0fd
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unpack_test.cc
@@ -0,0 +1,225 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+template <typename T>
+class UnpackOpModel : public SingleOpModel {
+ public:
+  UnpackOpModel(const TensorData& input, int axis) {
+    CHECK_LE(axis, input.shape.size());
+    const int num_outputs = input.shape[axis];
+    input_ = AddInput(input);
+    for (int i = 0; i < num_outputs; ++i) {
+      outputs_.push_back(AddOutput(input.type));
+    }
+    SetBuiltinOp(BuiltinOperator_UNPACK, BuiltinOptions_UnpackOptions,
+                 CreatePackOptions(builder_, num_outputs, axis).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
+  }
+
+  std::vector<std::vector<T>> GetOutputDatas() {
+    std::vector<std::vector<T>> output_datas;
+    for (const int output : outputs_) {
+      std::cerr << "the output is " << output << std::endl;
+      output_datas.push_back(ExtractVector<T>(output));
+    }
+    return output_datas;
+  }
+
+  std::vector<std::vector<int>> GetOutputShapes() {
+    std::vector<std::vector<int>> output_shapes;
+    for (const int output : outputs_) {
+      output_shapes.push_back(GetTensorShape(output));
+    }
+    return output_shapes;
+  }
+
+ private:
+  int input_;
+  std::vector<int> outputs_;
+};
+
+// float32 tests.
+TEST(UnpackOpTest, FloatThreeOutputs) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, 0);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 3);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2));
+  EXPECT_THAT(output_shapes[2], ElementsAre(2));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 3);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2));
+  EXPECT_THAT(output_datas[1], ElementsAre(3, 4));
+  EXPECT_THAT(output_datas[2], ElementsAre(5, 6));
+}
+
+TEST(UnpackOpTest, FloatThreeOutputsAxisOne) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, 1);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(3));
+  EXPECT_THAT(output_shapes[1], ElementsAre(3));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
+}
+
+TEST(UnpackOpTest, FloatOneOutput) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {1, 6}}, 0);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 1);
+  EXPECT_THAT(output_shapes[0], ElementsAre(6));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 1);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2, 3, 4, 5, 6));
+}
+
+TEST(UnpackOpTest, FloatThreeDimensionsOutputs) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {2, 2, 2}}, 2);
+  model.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2, 2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2, 2));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5, 7));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6, 8));
+}
+
+// int32 tests.
+TEST(UnpackOpTest, IntThreeOutputs) {
+  UnpackOpModel<int32_t> model({TensorType_INT32, {3, 2}}, 0);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 3);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2));
+  EXPECT_THAT(output_shapes[2], ElementsAre(2));
+
+  // Check outputs values.
+  const std::vector<std::vector<int32_t>>& output_datas =
+      model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 3);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2));
+  EXPECT_THAT(output_datas[1], ElementsAre(3, 4));
+  EXPECT_THAT(output_datas[2], ElementsAre(5, 6));
+}
+
+TEST(UnpackOpTest, IntThreeOutputsAxisOne) {
+  UnpackOpModel<int32_t> model({TensorType_INT32, {3, 2}}, 1);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(3));
+  EXPECT_THAT(output_shapes[1], ElementsAre(3));
+
+  // Check outputs values.
+  const std::vector<std::vector<int32_t>>& output_datas =
+      model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
+}
+
+TEST(UnpackOpTest, IntOneOutput) {
+  UnpackOpModel<int32_t> model({TensorType_INT32, {1, 6}}, 0);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 1);
+  EXPECT_THAT(output_shapes[0], ElementsAre(6));
+
+  // Check outputs values.
+  const std::vector<std::vector<int32_t>>& output_datas =
+      model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 1);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2, 3, 4, 5, 6));
+}
+
+TEST(UnpackOpTest, IntThreeDimensionsOutputs) {
+  UnpackOpModel<int32_t> model({TensorType_INT32, {2, 2, 2}}, 2);
+  model.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2, 2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2, 2));
+
+  // Check outputs values.
+  const std::vector<std::vector<int32_t>>& output_datas =
+      model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5, 7));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6, 8));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
index b58ae26601..6195426d6d 100755
--- a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
+++ b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
@@ -14,6 +14,7 @@
 # limitations under the License.
 # ==============================================================================
 
+# TODO(ycling): Refactoring - Move this script into `tools/make`.
 set -e
 
 echo "Starting"
@@ -32,7 +33,7 @@ echo "Headers, populating: TensorFlow Lite"
 cd $TFLITE_DIR/../../..
 
 find tensorflow/contrib/lite -name '*.h' \
-    -not -path 'tensorflow/contrib/lite/downloads/*' \
+    -not -path 'tensorflow/contrib/lite/tools/*' \
     -not -path 'tensorflow/contrib/lite/examples/*' \
     -not -path 'tensorflow/contrib/lite/gen/*' \
     -not -path 'tensorflow/contrib/lite/toco/*' \
@@ -44,7 +45,7 @@ tar xf tmp.tar
 rm -f tmp.tar
 
 echo "Headers, populating: Flatbuffer"
-cd $TFLITE_DIR/downloads/flatbuffers/include/
+cd $TFLITE_DIR/tools/make/downloads/flatbuffers/include/
 find . -name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
 cd $FW_DIR_TFLITE_HDRS
 tar xf tmp.tar
@@ -57,7 +58,7 @@ cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tens
    $FW_DIR_TFLITE
 
 echo "Copying static libraries"
-cp $TFLITE_DIR/gen/lib/libtensorflow-lite.a \
+cp $TFLITE_DIR/tools/make/gen/lib/libtensorflow-lite.a \
    $FW_DIR_TFLITE/tensorflow_lite
 
 # This is required, otherwise they interfere with the documentation of the
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 9edf5ba38f..da3ed42e20 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -26,6 +26,9 @@ limitations under the License.
 #ifndef TFLITE_MCU
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
 #endif
+#if defined(TFLITE_EXTENDED)
+#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#endif
 #include "tensorflow/contrib/lite/version.h"
 
 namespace tflite {
@@ -619,6 +622,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     }
     case BuiltinOperator_MEAN:
     case BuiltinOperator_REDUCE_MAX:
+    case BuiltinOperator_REDUCE_MIN:
     case BuiltinOperator_REDUCE_PROD:
     case BuiltinOperator_SUM: {
       auto* params = MallocPOD<TfLiteReducerParams>();
@@ -741,6 +745,15 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = static_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNPACK: {
+      TfLiteUnpackParams* params = MallocPOD<TfLiteUnpackParams>();
+      if (auto* unpack_params = op->builtin_options_as_UnpackOptions()) {
+        params->num = unpack_params->num();
+        params->axis = unpack_params->axis();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
     case BuiltinOperator_BATCH_TO_SPACE_ND:
@@ -786,6 +799,8 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_LOGICAL_OR:
     case BuiltinOperator_LOGICAL_AND:
     case BuiltinOperator_LOGICAL_NOT:
+    case BuiltinOperator_FLOOR_DIV:
+    case BuiltinOperator_REDUCE_ANY:
       break;
   }
   return kTfLiteOk;
@@ -797,6 +812,10 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
     const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
     Interpreter* interpreter) {
   TfLiteStatus status = kTfLiteOk;
+
+  // Reduce the number of redundant allocations
+  interpreter->ReserveNodes(operators->Length());
+
   for (int i = 0; i < operators->Length(); ++i) {
     const auto* op = operators->Get(i);
     int index = op->opcode_index();
@@ -1040,6 +1059,14 @@ TfLiteStatus InterpreterBuilder::operator()(
   }
   (**interpreter).SetVariables(std::move(variables));
 
+#if defined(TFLITE_EXTENDED)
+  if (auto delegate = EagerDelegate::Create()) {
+    (**interpreter)
+        .ModifyGraphWithDelegate(std::move(delegate),
+                                 /*allow_dynamic_tensors=*/true);
+  }
+#endif
+
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/contrib/lite/models/speech_test.cc b/tensorflow/contrib/lite/models/speech_test.cc
index 206de1962d..8ecf0b6154 100644
--- a/tensorflow/contrib/lite/models/speech_test.cc
+++ b/tensorflow/contrib/lite/models/speech_test.cc
@@ -102,7 +102,7 @@ class SpeechTest : public ::testing::TestWithParam<int> {
   int GetMaxInvocations() { return GetParam(); }
 };
 
-TEST_P(SpeechTest, HotwordOkGoogleRank1Test) {
+TEST_P(SpeechTest, DISABLED_HotwordOkGoogleRank1Test) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData(
       "speech_hotword_model_rank1.tflite", "speech_hotword_model_in.csv",
@@ -114,7 +114,7 @@ TEST_P(SpeechTest, HotwordOkGoogleRank1Test) {
       << test_driver.GetErrorMessage();
 }
 
-TEST_P(SpeechTest, HotwordOkGoogleRank2Test) {
+TEST_P(SpeechTest, DISABLED_HotwordOkGoogleRank2Test) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData(
       "speech_hotword_model_rank2.tflite", "speech_hotword_model_in.csv",
@@ -126,7 +126,7 @@ TEST_P(SpeechTest, HotwordOkGoogleRank2Test) {
       << test_driver.GetErrorMessage();
 }
 
-TEST_P(SpeechTest, SpeakerIdOkGoogleTest) {
+TEST_P(SpeechTest, DISABLED_SpeakerIdOkGoogleTest) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData(
       "speech_speakerid_model.tflite", "speech_speakerid_model_in.csv",
@@ -139,7 +139,7 @@ TEST_P(SpeechTest, SpeakerIdOkGoogleTest) {
       << test_driver.GetErrorMessage();
 }
 
-TEST_P(SpeechTest, AsrAmTest) {
+TEST_P(SpeechTest, DISABLED_AsrAmTest) {
   std::stringstream os;
   ASSERT_TRUE(
       ConvertCsvData("speech_asr_am_model.tflite", "speech_asr_am_model_in.csv",
@@ -156,7 +156,7 @@ TEST_P(SpeechTest, AsrAmTest) {
 // through the interpreter and stored the sum of all the output, which was them
 // compared for correctness. In this test we are comparing all the intermediate
 // results.
-TEST_P(SpeechTest, AsrLmTest) {
+TEST_P(SpeechTest, DISABLED_AsrLmTest) {
   std::ifstream in_file;
   testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
   ASSERT_TRUE(Init("speech_asr_lm_model.test_spec", &test_driver, &in_file));
@@ -165,7 +165,7 @@ TEST_P(SpeechTest, AsrLmTest) {
       << test_driver.GetErrorMessage();
 }
 
-TEST_P(SpeechTest, EndpointerTest) {
+TEST_P(SpeechTest, DISABLED_EndpointerTest) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData(
       "speech_endpointer_model.tflite", "speech_endpointer_model_in.csv",
@@ -178,7 +178,7 @@ TEST_P(SpeechTest, EndpointerTest) {
       << test_driver.GetErrorMessage();
 }
 
-TEST_P(SpeechTest, TtsTest) {
+TEST_P(SpeechTest, DISABLED_TtsTest) {
   std::stringstream os;
   ASSERT_TRUE(ConvertCsvData("speech_tts_model.tflite",
                              "speech_tts_model_in.csv",
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 42b8163445..81dd459223 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef NN_API_SHIM_H0
-#define NN_API_SHIM_H0
+#ifndef TENSORFLOW_CONTRIB_LITE_NNAPI_NEURALNETWORKSSHIM_H_
+#define TENSORFLOW_CONTRIB_LITE_NNAPI_NEURALNETWORKSSHIM_H_
 
 #include <dlfcn.h>
 #include <stdint.h>
@@ -970,4 +970,4 @@ inline void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
 
 /**/
 
-#endif  // NN_API_SHIM_H0
+#endif  // TENSORFLOW_CONTRIB_LITE_NNAPI_NEURALNETWORKSSHIM_H_
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 45c92a8671..38f3e9881b 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -636,6 +636,7 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_NOT_EQUAL:
       case tflite::BuiltinOperator_SUM:
       case tflite::BuiltinOperator_REDUCE_MAX:
+      case tflite::BuiltinOperator_REDUCE_MIN:
       case tflite::BuiltinOperator_REDUCE_PROD:
       case tflite::BuiltinOperator_SQRT:
       case tflite::BuiltinOperator_RSQRT:
@@ -647,6 +648,9 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_ONE_HOT:
       case tflite::BuiltinOperator_LOGICAL_AND:
       case tflite::BuiltinOperator_LOGICAL_NOT:
+      case tflite::BuiltinOperator_UNPACK:
+      case tflite::BuiltinOperator_FLOOR_DIV:
+      case tflite::BuiltinOperator_REDUCE_ANY:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
diff --git a/tensorflow/contrib/lite/optional_debug_tools.h b/tensorflow/contrib/lite/optional_debug_tools.h
index 7fb4b8d8b7..82a6e114a6 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.h
+++ b/tensorflow/contrib/lite/optional_debug_tools.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Optional debugging functionality. For small sized binaries, these are not
 // needed.
-#ifndef TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
-#define TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_OPTIONAL_DEBUG_TOOLS_H_
+#define TENSORFLOW_CONTRIB_LITE_OPTIONAL_DEBUG_TOOLS_H_
 
 #include "tensorflow/contrib/lite/interpreter.h"
 
@@ -26,4 +26,4 @@ void PrintInterpreterState(Interpreter* interpreter);
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_OPTIONAL_DEBUG_TOOLS_H_
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 860aff9e7e..47f0c8e9a2 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -112,8 +112,11 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework",
         "//tensorflow/python:platform",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index 11d4bdbe82..12cc66dc55 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os as _os
+import platform as _platform
 import subprocess as _subprocess
 import tempfile as _tempfile
 
@@ -26,6 +27,7 @@ from tensorflow.contrib.lite.python import lite_constants
 from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
 from tensorflow.python.platform import resource_loader as _resource_loader
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.lazy_loader import LazyLoader
 
 
@@ -90,12 +92,13 @@ def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
         fp_output.name
     ]
     cmdline = " ".join(cmd)
+    is_windows = _platform.system() == "Windows"
     proc = _subprocess.Popen(
         cmdline,
         shell=True,
         stdout=_subprocess.PIPE,
         stderr=_subprocess.STDOUT,
-        close_fds=True)
+        close_fds=not is_windows)
     stdout, stderr = proc.communicate()
     exitcode = proc.returncode
     if exitcode == 0:
@@ -223,7 +226,8 @@ def build_toco_convert_protos(input_tensors,
   return model, toco
 
 
-def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
+def toco_convert_impl(input_data, input_tensors, output_tensors, *args,
+                      **kwargs):
   """"Convert a model using TOCO.
 
   Typically this function is used to convert from TensorFlow GraphDef to TFLite.
@@ -252,3 +256,30 @@ def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
                              toco_flags.SerializeToString(),
                              input_data.SerializeToString())
   return data
+
+
+@deprecation.deprecated(None, "Use `lite.TocoConverter` instead.")
+def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
+  """"Convert a model using TOCO.
+
+  Typically this function is used to convert from TensorFlow GraphDef to TFLite.
+  Conversion can be customized by providing arguments that are forwarded to
+  `build_toco_convert_protos` (see documentation for details).
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`),
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    *args: See `build_toco_convert_protos`,
+    **kwargs: See `build_toco_convert_protos`.
+
+  Returns:
+    The converted data. For example if TFLite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    Defined in `build_toco_convert_protos`.
+  """
+  return toco_convert_impl(input_data, input_tensors, output_tensors, *args,
+                           **kwargs)
diff --git a/tensorflow/contrib/lite/python/convert_test.py b/tensorflow/contrib/lite/python/convert_test.py
index dc21a9b669..bc05514cec 100644
--- a/tensorflow/contrib/lite/python/convert_test.py
+++ b/tensorflow/contrib/lite/python/convert_test.py
@@ -113,12 +113,13 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
       # and 1 final output).
       self.assertEqual(self._countIdentities(sess.graph_def.node), 4)
 
-      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
+          graph_def=sess.graph_def)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
-              output_nodes=[op_hint._tensor_name_base(output)]),
+              output_nodes=[op_hint._tensor_name_base(output.name)]),
           ["cool_activation", "Const", "Identity"])
 
   def testScaleAndBiasAndIdentity(self):
@@ -139,12 +140,13 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 6)
 
-      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
+          graph_def=sess.graph_def)
 
       self.assertCountEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
-              output_nodes=[op_hint._tensor_name_base(output)]),
+              output_nodes=[op_hint._tensor_name_base(output.name)]),
           ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
 
   def testTwoFunctions(self):
@@ -153,7 +155,7 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
     b = array_ops.constant([1.])
     def _double_values(x):
       custom = op_hint.OpHint("add_test")
-      x = custom.add_inputs(x)
+      x, = custom.add_inputs(x)
       output = math_ops.multiply(x, x)
       output, = custom.add_outputs(output)
       return output
@@ -164,13 +166,90 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
       # make sure one identity for each input (2) and output (2) => 2 + 2
       # +1 for the final output
       self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
-      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess)
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
+          graph_def=sess.graph_def)
       self.assertCountEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
-              output_nodes=[op_hint._tensor_name_base(output)]),
+              output_nodes=[op_hint._tensor_name_base(output.name)]),
           ["add_test", "Const", "Identity", "Add"])
 
+  def _get_input_index(self, x):
+    return x.op.node_def.attr[op_hint.OpHint.FUNCTION_INPUT_INDEX_ATTR].i
+
+  def _get_output_index(self, x):
+    return x.op.node_def.attr[op_hint.OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i
+
+  def _get_sort_index(self, x):
+    return x.op.node_def.attr[op_hint.OpHint.FUNCTION_SORT_INDEX_ATTR].i
+
+  def testTags(self):
+    """Test if multiple args with the same tag are grouped."""
+    a = array_ops.constant([1.])
+    b = array_ops.constant([2.])
+    c = array_ops.constant([3.])
+    d = array_ops.constant([4.])
+    custom = op_hint.OpHint("test_tag")
+    a = custom.add_input(a, tag="mytag",
+                         aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    b, = custom.add_inputs(b)
+    c = custom.add_input(c, tag="mytag",
+                         aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    d = custom.add_input(d, tag="mytag2",
+                         aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    res = math_ops.add(math_ops.mul(a, b), math_ops.mul(c, b))
+    custom.add_outputs([res])
+    with self.test_session():
+      self.assertEqual(self._get_input_index(a), 0)
+      self.assertEqual(self._get_sort_index(a), 0)
+      self.assertEqual(self._get_input_index(b), 1)
+      self.assertEqual(self._get_input_index(c), 0)
+      self.assertEqual(self._get_sort_index(c), 1)
+
+  def testOverrideIndex(self):
+    a = array_ops.constant([1.])
+    b = array_ops.constant([2.])
+    c = array_ops.constant([3.])
+    custom = op_hint.OpHint("test_override")
+    b = custom.add_input(b)  # should auto assign 0
+    a = custom.add_input(a, index_override=1)
+    c = custom.add_input(c)  # should auto assign 2
+    with self.test_session():
+      self.assertEqual(self._get_input_index(a), 1)
+      self.assertEqual(self._get_input_index(b), 0)
+      self.assertEqual(self._get_input_index(c), 2)
+
+  def testAggregate(self):
+    a = array_ops.constant([3., 4.])
+    b = array_ops.constant([5., 6.])
+    hint = op_hint.OpHint("agg")
+    a0, a1 = array_ops.unstack(a)
+    b0, b1 = array_ops.unstack(b)
+
+    a0 = hint.add_input(a0, tag="c", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    b0 = hint.add_input(b0, tag="n", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    a1 = hint.add_input(a1, tag="c", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    b1 = hint.add_input(b1, tag="n", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+
+    c0 = math_ops.add(a0, b0, name="addleft")
+    c1 = math_ops.add(a1, b1, name="addright")
+    c0 = hint.add_output(
+        c0, tag="out", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+    c1 = hint.add_output(
+        c1, tag="out", aggregate=op_hint.OpHint.AGGREGATE_STACK)
+
+    curr = array_ops.stack([c0, c1])
+    output = array_ops.identity(curr, name="FINAL_OUTPUT")
+    with self.test_session() as sess:
+      stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
+          graph_def=sess.graph_def)
+      print(stubbed_graphdef)
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef,
+              output_nodes=[op_hint._tensor_name_base(output.name)]),
+          ["agg", "Const", "Identity"])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 5ec52035ad..2313bfa3b6 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -41,7 +41,8 @@ from google.protobuf.message import DecodeError
 from tensorflow.contrib.lite.python import lite_constants as constants
 from tensorflow.contrib.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.convert import tensor_name as _tensor_name
-from tensorflow.contrib.lite.python.convert import toco_convert
+from tensorflow.contrib.lite.python.convert import toco_convert  # pylint: disable=unused-import
+from tensorflow.contrib.lite.python.convert import toco_convert_impl as _toco_convert_impl
 from tensorflow.contrib.lite.python.convert import toco_convert_protos  # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.convert_saved_model import freeze_saved_model as _freeze_saved_model
 from tensorflow.contrib.lite.python.convert_saved_model import get_tensors_from_tensor_names as _get_tensors_from_tensor_names
@@ -110,6 +111,7 @@ class TocoConverter(object):
 
   Example usage:
 
+    ```python
     # Converting a GraphDef from session.
     converter = lite.TocoConverter.from_session(sess, in_tensors, out_tensors)
     tflite_model = converter.convert()
@@ -124,6 +126,11 @@ class TocoConverter(object):
     # Converting a SavedModel.
     converter = lite.TocoConverter.from_saved_model(saved_model_dir)
     tflite_model = converter.convert()
+
+    # Converting a tf.keras model.
+    converter = lite.TocoConverter.from_keras_model_file(keras_model)
+    tflite_model = converter.convert()
+    ```
   """
 
   def __init__(self, graph_def, input_tensors, output_tensors):
@@ -354,7 +361,7 @@ class TocoConverter(object):
       quantized_stats = None
 
     # Converts model.
-    result = toco_convert(
+    result = _toco_convert_impl(
         input_data=self._graph_def,
         input_tensors=self._input_tensors,
         output_tensors=self._output_tensors,
diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/contrib/lite/python/op_hint.py
index 7908689ce4..8c920132e5 100644
--- a/tensorflow/contrib/lite/python/op_hint.py
+++ b/tensorflow/contrib/lite/python/op_hint.py
@@ -25,9 +25,9 @@ Example:
   def tflite_cool_activation(input):
     # A cool activation function.
     custom = tf.contrib.lite.OpHint("cool_activation")
-    input = custom.add_inputs(input)
+    input, = custom.add_inputs(input)
     output = tf.sigmoid(input) * input
-    custom.add_outputs(output)
+    output, = custom.add_outputs(output)
     return output
 
   image = tf.placeholder(tf.float32, (1, 16, 16, 1))
@@ -64,18 +64,27 @@ ops don't actually exist in the normal TensorFlow runtime, but will be
 understood by toco later.
 """
 
+# TODO(aselle): Make this use generic graph transformations.
+# TODO(aselle): _tensor_name_base should be called _tensor_name_to_op_name.
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import collections as _collections
-import itertools as _itertools
+import copy as _copy
 import uuid as _uuid
+import six as _six
 
-from tensorflow.contrib import framework as _framework
 from tensorflow.core.framework import attr_value_pb2 as _attr_value_pb2
+from tensorflow.core.framework import graph_pb2 as _graph_pb2
+from tensorflow.core.framework import node_def_pb2 as _node_def_pb2
 from tensorflow.python.framework import ops as _ops
+# TODO(aselle): publicize these apis if we continue to use these.
+from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
+from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
 from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.util import compat as _compat
 from tensorflow.python.util.all_util import remove_undocumented
 
 
@@ -97,11 +106,174 @@ class OpHint(object):
   constructs, this mechanism can be retired and changed to use python defun's.
   """
 
-  # Attr constants that are used for representation in the GraphDef
+  # Attr constants that are used for representation in the GraphDef. These
+  # will be used on every Identity op that is involved in a total OpHint.
+
+  # Name of the OpHint function (cosmetic).
   FUNCTION_NAME_ATTR = "_tflite_function_name"
+  # UUID of the function (each OpHint gets a new uuid).
   FUNCTION_UUID_ATTR = "_tflite_function_uuid"
+  # The index index of the input (or nothing if it is an output).
   FUNCTION_INPUT_INDEX_ATTR = "_tflite_function_input_index"
+  # The output index of the output (or nothing if it is an input).
   FUNCTION_OUTPUT_INDEX_ATTR = "_tflite_function_output_index"
+  # An index that orders aggregate arguments. Aggregate arguments are ones
+  # that are separate but will be fused horizontally. For example a static LSTM
+  # has a lstm cell for each time step. Each one has a separate opHint, but a
+  # fused SequentialLSTM will treat this as a single tensor.
+  FUNCTION_SORT_INDEX_ATTR = "_tflite_function_sort_index"
+  # The way in which multiple parts of the aggregate argument will be joined
+  # into a fused operand. Valid options are OpHint.AGGREGATE_FIRST,
+  # OpHint.AGGREGATE_LAST, OpHint.AGGREGATE_STACK.
+  FUNCTION_AGGREGATE_ATTR = "_tflite_function_aggregate"
+  # On fused OpHint stub, the order of inputs that the final LSTM call will
+  # have. What this means is that the TensorFlow order might be
+  # "foo", "bar", "stuff" and you might want the TF lite op order to be
+  # "stuff", "foo", "bar", -1 (where -1 is unused). So you would set this
+  # attribute to [2, 0, 1, -1].
+  TFLITE_INPUT_INDICES = "_tflite_input_indices"
+
+  # Types of aggregations
+  #  stack: stacks all ophints with matching tags. i.e. for a static rnn.
+  #   specifically, this is good for an input or output to a static rnn cell.
+  AGGREGATE_STACK = _compat.as_bytes("stack")
+  # first: only takes the first output (one with lowest sort index)
+  # of matching tags. This is good for the input state to an RNN.
+  AGGREGATE_FIRST = _compat.as_bytes("first")
+  # aggregation last takes only the last tag (one with highest sort index).
+  # This is good for an output value on the last stack item of a
+  # static rnn.
+  AGGREGATE_LAST = _compat.as_bytes("last")
+
+  class OpHintArgumentTracker(object):
+    """Conceptually tracks indices of arguments of "OpHint functions".
+
+    The inputs and arguments of these functions both use an instance
+    of the class so they can have independent numbering."""
+
+    def __init__(self, function_name, unique_function_id, node_name_prefix,
+                 attr_name):
+      """Initialize ophint argument.
+
+      Args:
+        function_name: Name of the function that this tracks arguments for.
+        unique_function_id: UUID of function that this tracks arguments for.
+        node_name_prefix: How identities that are created are named.
+        attr_name: Name of attribute to use to store the index for this hint.
+          i.e. FUNCTION_INPUT_INDEX or FUNCTION_OUTPUT_INDEX
+      """
+
+      # The global index is the argument index of the op. This is in contrast
+      # to the sort index which is the sequence number of a particular instance
+      # of a given global index. For example, you may have called add hint
+      # twice with the tag "foo". Then the global index will be 0 for both
+      # and the sort index will be 0 for the first added and 1 for the second.
+      self._function_name = function_name
+      self._unique_function_id = unique_function_id
+      self._next_global_index = 0  # The absolute global index
+      self._used_global_indices = set()
+      self._tag_to_global_index = {}  # The argument index a given tag maps to
+      self._tag_to_next_sort_index = {}  # The current index for each tag
+      self._node_name_prefix = node_name_prefix
+      self._attr_name = attr_name
+
+    def _get_new_global_index(self, index_override):
+      """Return the next unused argument index in order or use an override.
+
+      Args:
+        index_override: An index to use instead of the next available or None
+          to use the next available.
+
+      Returns:
+        A valid global_index to use for the next hint argument.
+
+      Raises:
+        ValueError: If the index_override is already used by another hint.
+      """
+      if index_override is None:
+        global_index = self._next_global_index
+      else:
+        if index_override in self._used_global_indices:
+          raise ValueError("Index %d was already used by another call to add")
+        global_index = index_override
+      # Make next_global_index valid
+      self._used_global_indices.add(global_index)
+      while self._next_global_index in self._used_global_indices:
+        self._next_global_index += 1
+      return global_index
+
+    def add(self, arg, tag=None, name=None, aggregate=None,
+            index_override=None):
+      """Return a wrapped tensor of an input tensor as an argument.
+
+      Args:
+        arg: A TensorFlow tensor that should be considered an argument.
+        tag: String tag to identify arguments that should be packed.
+        name: Name of argument. This is included in the Identity hint op names.
+        aggregate: Strategy to aggregate.
+        Acceptable values are OpHint.AGGREGATE_FIRST, OpHint.AGGREGATE_LAST,
+          and OpHint.AGGREGATE_STACK.
+          Note, aggregate is only valid if tag is specified.
+        index_override: Specify what input/output index should this be in the
+          final stub. i.e. add(arg0, index=1); add(arg1, index=0) wil make the
+          final stub be as stub_func(inputs[arg1, arg0], outputs=[]) rather than
+          the default call order based ordering.
+
+      Returns:
+        A tensor representing the wrapped argument.
+
+      Raises:
+        ValueError: When indices are not consistent.
+      """
+
+      # Find the appropriate index
+      if tag is None:
+        if aggregate is not None:
+          raise ValueError("You must specify `tag` if using aggregate.")
+        global_index = self._get_new_global_index(index_override)
+        sort_index = None
+      else:
+        if aggregate is None:
+          raise ValueError("You must specify `aggregate` if using tag.")
+        if tag not in self._tag_to_global_index:
+          self._tag_to_global_index[tag] = (
+              self._get_new_global_index(index_override))
+          self._tag_to_next_sort_index[tag] = 0
+        elif (index_override and
+              index_override != self._tag_to_global_index[tag]):
+          raise ValueError(
+              "Tag %r was called with two indices %r and %r" %
+              (tag, index_override, self._tag_to_global_index[tag]))
+        global_index = self._tag_to_global_index[tag]
+        sort_index = self._tag_to_next_sort_index[tag]
+        self._tag_to_next_sort_index[tag] += 1
+
+      uuid = self._unique_function_id
+      name = "%s-%s-%s-%r-%r-%s" % (self._node_name_prefix, self._function_name,
+                                    uuid, global_index, sort_index, name)
+      identity_op = _array_ops.identity(arg, name=name)
+
+      # pylint: disable=protected-access
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_NAME_ATTR,
+          _attr_value_pb2.AttrValue(
+              s=_compat.as_bytes(self._function_name)))
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_UUID_ATTR,
+          _attr_value_pb2.AttrValue(
+              s=_compat.as_bytes(self._unique_function_id)))
+      identity_op.op._set_attr(
+          self._attr_name, _attr_value_pb2.AttrValue(i=global_index))
+      if sort_index is not None:
+        identity_op.op._set_attr(
+            OpHint.FUNCTION_SORT_INDEX_ATTR,
+            _attr_value_pb2.AttrValue(i=sort_index))
+      if aggregate is not None:
+        identity_op.op._set_attr(
+            OpHint.FUNCTION_AGGREGATE_ATTR,
+            _attr_value_pb2.AttrValue(s=_compat.as_bytes((aggregate))))
+      # pylint: enable=protected-access
+      return identity_op
 
   def __init__(self, function_name, **kwargs):
     """Create a OpHint.
@@ -112,10 +284,14 @@ class OpHint(object):
     """
     self._function_name = function_name
     self._unique_function_id = _uuid.uuid1().hex  # TODO(aselle): Unique enough?
-    self._curr_input_index = 0
-    self._curr_output_index = 0
     self._attrs_to_store_later = kwargs
     self._stored_attrs = False
+    self._inputs = OpHint.OpHintArgumentTracker(
+        self._function_name, self._unique_function_id, "InputHint",
+        OpHint.FUNCTION_INPUT_INDEX_ATTR)
+    self._outputs = OpHint.OpHintArgumentTracker(
+        self._function_name, self._unique_function_id, "OutputHint",
+        OpHint.FUNCTION_OUTPUT_INDEX_ATTR)
 
   def _setattr(self, dest_op, name, value):
     tensor_value = _ops.convert_to_tensor(value)
@@ -124,68 +300,278 @@ class OpHint(object):
         tensor=tensor_value.op.node_def.attr["value"].tensor))
     # pylint: enable=protected-access
 
-  def add_inputs(self, *args):
+  def add_input(self, *args, **kwargs):
+    """Add a wrapped input argument to the hint.
+
+    Args:
+      *args: The input tensor.
+      **kwargs:
+        "name" label
+        "tag" a tag to group multiple arguments that will be aggregated. I.e.
+          a string like 'cool_input'. Basically multiple inputs can be added
+          to the same hint for parallel operations that will eventually be
+          combined. An example would be static_rnn which creates multiple copies
+          of state or inputs.
+        "aggregate" aggregation strategy that is valid only for tag non None.
+          Acceptable values are OpHint.AGGREGATE_FIRST, OpHint.AGGREGATE_LAST,
+          and OpHint.AGGREGATE_STACK.
+        "index_override" The global index to use. This corresponds to the
+          argument order in the final stub that will be generated.
+    Returns:
+      The wrapped input tensor.
+    """
+    return self._inputs.add(*args, **kwargs)
+
+  def add_output(self, *args, **kwargs):
+    """Add a wrapped output argument to the hint.
+
+    Args:
+      *args: The output tensor.
+      **kwargs:
+        "name" label
+        "tag" a tag to group multiple arguments that will be aggregated. I.e.
+          a string like 'cool_input'. Basically multiple inputs can be added
+          to the same hint for parallel operations that will eventually be
+          combined. An example would be static_rnn which creates multiple copies
+          of state or inputs.
+        "aggregate" aggregation strategy that is valid only for tag non None.
+          Acceptable values are OpHint.AGGREGATE_FIRST, OpHint.AGGREGATE_LAST,
+          and OpHint.AGGREGATE_STACK.
+        "index_override" The global index to use. This corresponds to the
+          argument order in the final stub that will be generated.
+    Returns:
+      The wrapped output tensor.
+    """
+    return self._outputs.add(*args, **kwargs)
+
+  def add_inputs(self, *args, **kwargs):
     """Add a sequence of inputs to the function invocation.
 
     Args:
       *args: List of inputs to be converted (should be Tf.Tensor).
+      **kwargs: This allows 'names' which should be a list of names.
     Returns:
       Wrapped inputs (identity standins that have additional metadata). These
       are also are also tf.Tensor's.
     """
-
-    def augmented_identity(arg):
-      identity_op = _array_ops.identity(arg)
-      # pylint: disable=protected-access
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_NAME_ATTR,
-          _attr_value_pb2.AttrValue(s=self._function_name))
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_UUID_ATTR,
-          _attr_value_pb2.AttrValue(s=self._unique_function_id))
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_INPUT_INDEX_ATTR,
-          _attr_value_pb2.AttrValue(i=self._curr_input_index))
-      # pylint: enable=protected-access
-      self._curr_input_index += 1
-      return identity_op
-
-    return [augmented_identity(arg) for arg in args]
-
-  def add_outputs(self, *args):
+    if "names" in kwargs:
+      return [
+          self._inputs.add(arg, name=name)
+          for arg, name in zip(args, kwargs["names"])
+      ]
+    else:
+      return [self._inputs.add(arg) for arg in args]
+
+  def add_outputs(self, *args, **kwargs):
     """Add a sequence of outputs to the function invocation.
 
     Args:
       *args: List of outputs to be converted (should be tf.Tensor).
+      **kwargs: See
     Returns:
       Wrapped outputs (identity standins that have additional metadata). These
       are also tf.Tensor's.
     """
+    if "names" in kwargs:
+      return [
+          self._outputs.add(arg, name=name)
+          for arg, name in zip(args, kwargs["names"])
+      ]
+    else:
+      return [self._outputs.add(arg) for arg in args]
+
+
+class _LiteOperand(object):
+  """Abstract operand for a tflite hint function.
+
+  This is a base class that handles representing arguments to an OpHint.
+  It also is able to serialize operands to the stubbed graph_def.
+  Child classes are responsible for being able to
+  store information about the hint identity operators. They are also responsible
+  for knowing how to serialize to output graphdefs.
+
+  Typically this will be implemented by holding one or more identity nodes
+  that were previously discovered as hints.
+  """
+
+  def aggregate_and_return_name_for_input(self, out_graphdef):
+    """This adds the node(s) to out_graphdef and returns the input node name.
+
+    Args:
+      out_graphdef: A graphdef that is ready to have this input added.
+
+    Returns:
+      The the output that the stub should use as an input for this operand.
+
+    Raises:
+      RuntimeError: if the method is not implemented.
+    """
+    del out_graphdef
+    raise RuntimeError("Unimplemented abstract method.")
+
+  def aggregate_and_return_name_for_output(self, fused_op_name, output_index,
+                                           out_graphdef):
+    """Add node(s) to graph representing output operands and returns type.
+
+    Args:
+      fused_op_name: name of the fused op stub name.
+      output_index: Output index that we are currently processing from stub.
+      out_graphdef: The destination graphdef we are currently building up.
+
+    Returns:
+      The datatype of this identity.
+
+    Raises:
+      RuntimeError: if the method is not implemented.
+    """
+    del fused_op_name, output_index, out_graphdef
+    raise RuntimeError("Unimplemented abstract method.")
 
-    def augmented_identity(arg):
-      identity_op = _array_ops.identity(arg)
-      # pylint: disable=protected-access
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_NAME_ATTR,
-          _attr_value_pb2.AttrValue(s=self._function_name))
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_UUID_ATTR,
-          _attr_value_pb2.AttrValue(s=self._unique_function_id))
-      identity_op.op._set_attr(
-          OpHint.FUNCTION_OUTPUT_INDEX_ATTR,
-          _attr_value_pb2.AttrValue(i=self._curr_output_index))
-      # pylint: enable=protected-access
-      self._curr_output_index += 1
-      return identity_op
 
-    wrapped_outputs = [augmented_identity(arg) for arg in args]
+class _LiteSingleOperand(_LiteOperand):
+  """A simple operand that is non-aggregated (i.e. most hints)."""
 
-    if not self._stored_attrs:
-      for key, value in self._attrs_to_store_later.iteritems():
-        self._setattr(wrapped_outputs[0], "_tflite_attr_" + key, value)
-      self._stored_attrs = True
+  def __init__(self, node):
+    _LiteOperand.__init__(self)
+    self.node = node
+    self.name = _tensor_name_base(node.name)
 
-    return wrapped_outputs
+  def flatten(self):
+    return [self.name]
+
+  def aggregate_and_return_name_for_input(self, out_graphdef):
+    return self.name
+
+  def aggregate_and_return_name_for_output(self, fused_op_name, index,
+                                           out_graphdef):
+    output_node = _copy.deepcopy(self.node)
+    del output_node.input[:]
+    output_node.input.append(_tensorflow_output_name(fused_op_name, index))
+    out_graphdef.node.extend([output_node])
+    return self.node.attr["type"].i
+
+  def __str__(self):
+    return str(self.name)
+
+
+class _LiteAggregateOperand(_LiteOperand):
+  """An operand for a tflite hint function that is aggregated from many.
+
+  For example, an LSTM is a grid of operators that are all related. Inputs
+  going into them may need to be fused, so they should all be tracked as
+  related arguments.
+  """
+
+  def __init__(self, aggregation):
+    _LiteOperand.__init__(self)
+    self.aggregation = aggregation
+    self.names = {}
+    self.nodes = {}
+    self.flattened = None
+
+  def add(self, sort, node):
+    self.names[sort] = _tensor_name_base(node.name)
+    self.nodes[sort] = node
+
+  def flatten_nodes(self):
+    """Return a list of all the node protos in aggregation sorted order."""
+    if not self.flattened:
+      self.flattened = [None] * len(self.nodes)
+      for idx, node in _six.iteritems(self.nodes):
+        self.flattened[idx] = node
+      for n in self.nodes:
+        if n is None:
+          raise RuntimeError("Aggregate was missing argument.")
+      if self.aggregation == OpHint.AGGREGATE_FIRST:
+        self.flattened = self.flattened[:1]
+      elif self.aggregation == OpHint.AGGREGATE_LAST:
+        self.flattened = self.flattened[-1:]
+      elif self.aggregation == OpHint.AGGREGATE_STACK:
+        pass
+      else:
+        raise ValueError(
+            "Invalid aggregation type %r specified" % self.aggregation)
+    return self.flattened
+
+  def flatten(self):
+    """Return a list of all node names in aggregation sorted sorter."""
+    return [_tensor_name_base(x.name) for x in self.flatten_nodes()]
+
+  def aggregate_and_return_name_for_input(self, out_graphdef):
+    """This adds the nodes to out_graphdef and returns an aggregated output.
+
+    In particular, if you have 4 inputs to a hint stub, this will be the
+    node that you can use as an output. I.e. you have 4 timesteps from a
+    static rnn, then a fused UnidriecitonalLSTM will expect 1 input with
+    all 4 time steps. So here we make a pack and return the output name of
+    that pack.
+
+    Args:
+      out_graphdef: A graphdef that is ready to have this input added.
+
+    Returns:
+      The name of a pack that aggregates this node.
+    """
+    flattened = self.flatten_nodes()
+    if len(flattened) == 1:
+      return _tensor_name_base(flattened[0].name)
+    else:
+      new_node = _node_def_pb2.NodeDef()
+      new_node.op = "Pack"
+      new_node.name = "OpHintStack-%s" % flattened[0].name
+      new_node.attr["N"].i = len(flattened)
+      new_node.attr["T"].type = flattened[0].attr["T"].type
+      for discrete in flattened:
+        new_node.input.append(_tensor_name_base(discrete.name))
+      out_graphdef.node.extend([new_node])
+      return new_node.name
+
+  def aggregate_and_return_name_for_output(self, fused_op_name, output_index,
+                                           out_graphdef):
+    """This adds to `out_graphdef` all the unaggregated outputs.
+
+    I.e. we are outputting from a fused stub, but we need to make it compatible
+    with the unfused original graph so we insert an unpack. Ideally in a later
+    stage the unpack -> pack sequences will be removed.
+
+    Args:
+      fused_op_name: The name of the stub we are in the process of fusing.
+      output_index: The output output_index this object represents.
+      out_graphdef: The graphdef we are in the process of buildings
+
+    Returns:
+      The type of the aggregated output (so we can finish building the stub
+      op).
+    """
+    flattened = self.flatten_nodes()
+    if len(flattened) == 1:
+      temp_op = _LiteSingleOperand(flattened[0])
+      return temp_op.aggregate_and_return_name_for_output(
+          fused_op_name, output_index, out_graphdef)
+    else:
+      stack_node = _node_def_pb2.NodeDef()
+      stack_node.op = "Unpack"
+      stack_node.name = "OpHintUnstack-%s" % flattened[0].name
+      stack_node.attr["num"].i = len(flattened)
+      output_type = flattened[0].attr["T"].type
+      stack_node.attr["T"].type = output_type
+      stack_node.input.append(_tensorflow_output_name(
+          fused_op_name, output_index))
+      out_graphdef.node.extend([stack_node])
+
+      for idx, discrete in enumerate(flattened):
+        output_node = _copy.deepcopy(discrete)
+        del output_node.input[:]
+        output_node.input.append(_tensorflow_output_name(stack_node.name, idx))
+        out_graphdef.node.extend([output_node])
+
+      return output_type
+
+  def __str__(self):
+    s = "\t\t\tAGGREGATE %s\n" % self.aggregation
+    for sort, val in self.names.iteritems():
+      s += "\t\t\t%d: %s\n" % (sort, val)
+    return s
 
 
 class _LiteFuncCall(object):
@@ -212,46 +598,87 @@ class _LiteFuncCall(object):
     self.uuid = None
     self.params = {}
 
+  def flattened_inputs_and_outputs(self):
+    """Return a list of inputs and outputs in a flattened format.
+
+    Returns:
+      Tuple of (inputs, outputs). where input and output i a list of names.
+    """
+    def _flatten(input_or_output_dict):
+      flattened_items = []
+      for item in input_or_output_dict.values():
+        flattened_items.extend(item.flatten())
+      return flattened_items
+
+    return _flatten(self.inputs), _flatten(self.outputs)
+
   def __str__(self):
-    return "tflite function %s call %s\n\tinputs: %r\n\toutputs: %r" % (
-        self.function_name, self.uuid, self.inputs, self.outputs)
+    def format_args(items):
+      s = ""
+      for idx, item in items.iteritems():
+        s += ("\t\t%d:\n" % idx) + str(item)
+      return s
+
+    inputs_str = "\tInputs\n" + format_args(self.inputs)
+    outputs_str = "\tOutputs\n" + format_args(self.outputs)
 
+    return ("tflite function %s call %s\n\tinputs:\n\t\t%s\n\toutputs:\n\t\t%s"
+            % (self.function_name, self.uuid, inputs_str, outputs_str))
 
-def _find_all_hints_in_graph_def(session):
+
+def _find_all_hints_in_graph_def(graphdef):
   """Look at the current default graph and return a list of LiteFuncCall objs.
 
   Args:
-    session: A TensorFlow session that contains the graph to convert.
+    graphdef: A TensorFlow graph_def to look for LiteFuncCalls.
   Returns:
     a list of `LifeFuncCall` objects in the form
 
   """
   func_calls = _collections.defaultdict(_LiteFuncCall)
-  seen_ops = set()
-
-  for op in session.graph.get_operations():
-    for operand in _itertools.chain(op.inputs, op.outputs):
-      if operand in seen_ops:
-        continue
-      seen_ops.add(operand)
-      attr = operand.op.node_def.attr
-      uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
-      if OpHint.FUNCTION_UUID_ATTR not in attr:
-        continue
-      call_def = func_calls[uuid]
-      call_def.uuid = uuid
-      if OpHint.FUNCTION_UUID_ATTR in attr:
-        call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
-        if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr:
-          call_def.inputs[attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i] = operand
-        if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr:
-          call_def.outputs[attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i] = operand
-
-      for a in attr:
-        if a.startswith("_tflite_attr_"):
-          # TODO(aselle): Remember the attribute tensors so we can put them
-          # in collapse.
-          call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor
+
+  for node in graphdef.node:
+    attr = node.attr
+    # This is an op hint if it has a FUNCTION_UUID_ATTR, otherwise skip
+    uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
+    if (OpHint.FUNCTION_UUID_ATTR not in attr
+        or not attr[OpHint.FUNCTION_UUID_ATTR].s):
+      continue
+
+    # Start building function
+    call_def = func_calls[uuid]
+    call_def.uuid = uuid
+    call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
+    # Get sorting and aggregation information
+
+    sort = (attr[OpHint.FUNCTION_SORT_INDEX_ATTR].i
+            if OpHint.FUNCTION_SORT_INDEX_ATTR in attr else None)
+    if sort == -1: sort = None
+    aggregation = None
+    if OpHint.FUNCTION_AGGREGATE_ATTR in attr:
+      aggregation = attr[OpHint.FUNCTION_AGGREGATE_ATTR].s
+
+    # Add the input or output
+    def put_operand(stuff, index, sort, operand, aggregation):
+      """Add a given index into the function structure."""
+      if sort is None:
+        stuff[index] = _LiteSingleOperand(operand)
+      else:
+        if index not in stuff:
+          stuff[index] = _LiteAggregateOperand(aggregation)
+        stuff[index].add(sort, operand)
+
+    if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr:
+      put_operand(call_def.inputs, attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i,
+                  sort, node, aggregation)
+    if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr:
+      put_operand(call_def.outputs, attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i,
+                  sort, node, aggregation)
+
+    # Remember attributes
+    for a in attr:
+      if a.startswith("_tflite_attr_"):
+        call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor
 
   return func_calls
 
@@ -267,42 +694,305 @@ def _tensor_name_base(full_tensor_name):
   Returns:
     A name without any device assignment.
   """
-  return full_tensor_name.name.split(":")[0]
+  if full_tensor_name.startswith("^"):
+    return full_tensor_name[1:]
+  return full_tensor_name.split(":")[0]
+
+
+def _tensorflow_output_name(tensor_name, output_index):
+  return tensor_name if output_index == 0 else "%s:%d" % (tensor_name,
+                                                          output_index)
+
+
+# TODO(aselle): This should be converted to grappler in the future.
+def _check_subgraph_closed(n, reachable_by_input, input_nodes_set,
+                           name_to_input_name):
+  """Checks to make sure node only connects to predecessor graph through inputs.
+
+  Args:
+    n: Node to check
+    reachable_by_input: Nodes that are reachable by all inputs of subgraph
+    input_nodes_set: The set of nodes that are "inputs".
+    name_to_input_name: Maps from name to the list of inputs.
+
+  Raises:
+    TypeError: If the given node uses items past inputs directly.
+  """
+  next_to_visit = [n]
+  visited = set()
+  while next_to_visit:
+    current_node = next_to_visit.pop()
+    visited.add(current_node)
+    if (current_node in reachable_by_input
+        and current_node not in input_nodes_set):
+      raise TypeError(
+          "Node %s uses input %s not in input_nodes." % (n, current_node))
+    if current_node not in input_nodes_set:
+      next_to_visit += [
+          input_node for input_node in name_to_input_name[current_node]
+          if input_node not in visited
+      ]
+
+
+# TODO(aselle): This should be converted to grappler in the future.
+def _convert_single_op_hint_to_stub(call, graph_def):
+  """Given a graph_def, converts `call` into a stub and returns a new graph_def.
 
+  Args:
+    call: A single function call to be converted.
+    graph_def: A graph_def to use as input (that hass call obviously).
+  Returns:
+    A new transformed graph-def that has call as a stub (single op).
 
-def convert_op_hints_to_stubs(session):
+  Note: after this process, the graph_def can no longer be loaded into
+      the tensorflow runtime, so all future manipulations are done in graph_def
+      level.
+  """
+  name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(
+      graph_def)
+  input_names, output_names = call.flattened_inputs_and_outputs()
+
+  reachable_by_input = _bfs_for_reachable_nodes(input_names, name_to_input_name)
+  reachable_by_output = _bfs_for_reachable_nodes(output_names,
+                                                 name_to_input_name)
+  input_nodes_set = set(input_names)
+  output_nodes_set = set(output_names)
+  nodes_after_fuse = []
+  nodes_deleted_by_fuse = set()
+  # Classify each node. We want to keep everything reachable by input, but
+  # we don't know if things that are not reachable by output or input (things
+  # after fusing).
+  for node in graph_def.node:
+    n = _tensor_name_base(node.name)
+    if n in reachable_by_output:
+      if n not in reachable_by_input and n not in output_nodes_set:
+        # n is an internal node. Check to make sure it is really internal.
+        # TODO(aselle): this could be done more efficiently by flooding
+        # the graph first.
+        _check_subgraph_closed(n, reachable_by_input, input_nodes_set,
+                               name_to_input_name)
+        nodes_deleted_by_fuse.add(n)
+    elif n not in reachable_by_input:
+      # n is a node that after all the fusings, so keep it.
+      nodes_after_fuse.append(n)
+    else:
+      # n is a node that is randomly in the graph but not connected to
+      # the chain of dependencies.
+      pass
+
+  # Make a new graphdef with all the pre-input and input nodes
+  out = _graph_pb2.GraphDef()
+  reachable_by_input_sorted = sorted(
+      list(reachable_by_input), key=lambda n: name_to_seq_num[n])
+  for node in reachable_by_input_sorted:
+    out.node.extend([_copy.deepcopy(name_to_node[node])])
+
+  # Create any stacks to aggregate arguments into to a single input
+  # i.e. for static_rnn's.
+  # TODO(aselle): Check that the inputs are complete i.e. 0 to n-1
+  sorted_input_indices = list(call.inputs.keys())
+  sorted_input_indices.sort()
+  sorted_output_indices = list(call.outputs.keys())
+  sorted_output_indices.sort()
+  new_node = _node_def_pb2.NodeDef()
+  # Delegate to each operand to produce the proper new input for this stub node.
+  # In particular, an aggregate input will now be a Pack of some previously
+  # non-fused things.
+  for input_index in sorted_input_indices:
+    inputs = call.inputs[input_index]
+    new_node.input.append(inputs.aggregate_and_return_name_for_input(out))
+  new_node.attr[OpHint.TFLITE_INPUT_INDICES].list.i.extend(sorted_input_indices)
+
+  # Ceate the function
+  new_node.op = call.function_name
+  new_node.name = call.uuid
+  out.node.extend([new_node])
+
+  # Now call each output argument to give them a chance to make the proper
+  # output type and add it to our new_node.
+  output_dtypes = []
+  for output_index in sorted_output_indices:
+    output = call.outputs[output_index]
+    output_dtype = (
+        output.aggregate_and_return_name_for_output(new_node.name, output_index,
+                                                    out))
+    output_dtypes.append(output_dtype)
+  new_node.attr["_output_types"].list.type[:] = output_dtypes
+  # TODO(aselle): what is right here?
+  new_node.attr["_output_quantized"].b = False
+
+  # Add post output nodes that do not depend on the outputs
+  for n in nodes_after_fuse:
+    should_keep = True
+    for input_name in name_to_input_name[n]:
+      if input_name in nodes_deleted_by_fuse:
+        should_keep = False
+    if should_keep:
+      out.node.extend([_copy.deepcopy(name_to_node[n])])
+
+  # Misc. graph_def data that needs copying.
+  out.library.CopyFrom(graph_def.library)
+  out.versions.CopyFrom(graph_def.versions)
+
+  return out
+
+
+# TODO(aselle): This should be converted to grappler in the future.
+def _remove_one_redundant_stack_unstack(in_graph_def):
+  """Removes a stack->unstack pattern from in_graph_def in a returned graph.
+
+  Args:
+    in_graph_def: Graph def to use as input.
+  Returns:
+    Simplified tuple (graph_def, changed_something) where changed_something
+    is true if anything was done.
+  """
+  name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(
+      in_graph_def)
+  del name_to_seq_num
+
+  # TODO(aselle): Make this not hardcoded.
+  do_generic_pack_unpack = True
+
+  out = _graph_pb2.GraphDef()
+  out.library.CopyFrom(in_graph_def.library)
+  out.versions.CopyFrom(in_graph_def.versions)
+  for n in in_graph_def.node:
+    node_name = _tensor_name_base(n.name)
+    if not node_name.startswith("OpHintStack") and not n.op.startswith("Pack"):
+      continue
+    next_to_visit = [node_name]
+    visited = set()
+
+    unpack_nodes = set()
+    pack_node = node_name
+
+    # Find a pattern of unstack connected to a stack (with identities
+    # in between.
+    matches_pattern = True
+    is_hint_created_stack = False
+    while next_to_visit:
+      current_node_name = next_to_visit[0]
+      visited.add(current_node_name)
+      del next_to_visit[0]
+      node = name_to_node[current_node_name]
+      is_op_hint_stack = node.name.startswith("OpHintStack")
+      is_op_hint_unstack = node.name.startswith("OpHintUnstack")
+      if (node.op == "Identity" or is_op_hint_stack
+          or (do_generic_pack_unpack and node.op == "Pack")):
+        is_hint_created_stack |= is_op_hint_stack
+        next_to_visit += [
+            input_node for input_node in name_to_input_name[current_node_name]
+            if input_node not in visited
+        ]
+      elif (is_op_hint_unstack
+            or (do_generic_pack_unpack and node.op == "Unpack")):
+        unpack_nodes.add(node.name)
+        is_hint_created_stack &= is_op_hint_unstack
+      else:
+        matches_pattern = False
+        break
+      visited.add(node.name)
+
+    if matches_pattern and len(unpack_nodes) == 1:
+      pack_node = node_name
+
+      # Check to see if anyone depends on the intermediate identity or the
+      # Unstacked form
+      no_external_dependency = True
+      for other_n in in_graph_def.node:
+        if other_n.name in visited: continue
+        for input_tensor in name_to_input_name[other_n.name]:
+          input_op = _tensor_name_base(input_tensor)
+          if input_op in visited and input_op != pack_node:
+            no_external_dependency = False
+      # Proceed with the substitution if the stack/unstack pair was created
+      # through hints, or that it was not, but nobody is consuming things
+      # between the stack and unstack.
+      if is_hint_created_stack or no_external_dependency:
+        end = unpack_nodes.pop()
+        end_input = name_to_node[end].input[0]
+        # All nodes that depend on the final stack need to be redone to use
+        for other_n in in_graph_def.node:
+          node_name = _tensor_name_base(other_n.name)
+          if node_name not in visited:
+            new_node = _copy.deepcopy(other_n)
+            new_node.input[:] = [
+                (end_input if stripped == pack_node else
+                 non_stripped) for stripped, non_stripped in zip(
+                     name_to_input_name[node_name], new_node.input[:])
+            ]
+            out.node.extend([new_node])
+        return out, True
+  return in_graph_def, False
+
+
+def _remove_redundant_stack_unstack(graph_def):
+  curr = graph_def
+  del graph_def
+  changed_stuff = True
+  while changed_stuff:
+    curr, changed_stuff = _remove_one_redundant_stack_unstack(curr)
+  return curr
+
+
+def _convert_op_hints_to_stubs_helper(
+    graph_def, write_callback=lambda sess, graph_def: None):
+  """Converts a graph_def to a new graph_def where all op hints are stubbed.
+
+  Args:
+    graph_def: A graph def that we should convert.
+    write_callback: A function pointer that can be used to write intermediate
+      steps of graph transformation (optional).
+  Returns:
+    A new stubbed graph_def.
+  """
+
+  hints = _find_all_hints_in_graph_def(graph_def)
+  curr_graph_def = graph_def
+  del graph_def  # prevent using graph_def again (common source of error)
+  for hint in _six.itervalues(hints):
+    curr_graph_def = _convert_single_op_hint_to_stub(
+        hint, curr_graph_def)
+    write_callback(curr_graph_def, "initial")
+  # The stubbing process can create stacks/unstacks in the case of LSTMs
+  # remove them.
+  curr_graph_def = _remove_redundant_stack_unstack(curr_graph_def)
+  return curr_graph_def
+
+
+def convert_op_hints_to_stubs(session=None,
+                              graph_def=None,
+                              write_callback=lambda graph_def, comments: None):
   """Converts a graphdef with LiteOp hints into stub operations.
 
   This is used to prepare for toco conversion of complex intrinsic usages.
+  Note: only one of session or graph_def should be used, not both.
 
   Args:
     session: A TensorFlow session that contains the graph to convert.
+    graph_def: A graph def that we should convert.
+    write_callback: A function pointer that can be used to write intermediate
+      steps of graph transformation (optional).
   Returns:
     A new graphdef with all ops contained in OpHints being replaced by
     a single op call with the right parameters.
+  Raises:
+    ValueError: If both session and graph_def are provided.
   """
-  hints = _find_all_hints_in_graph_def(session)
-  current_graph_def = session.graph_def
-  for call in hints.values():
-    input_names = [None] * len(call.inputs)
-    output_names = [None] * len(call.outputs)
-    output_dtypes = [None] * len(call.outputs)
-    output_quantized = False
-    for input_index, tensor in call.inputs.items():
-      input_names[input_index] = _tensor_name_base(tensor)
-    for output_index, tensor in call.outputs.items():
-      output_names[output_index] = _tensor_name_base(tensor)
-      output_dtypes[output_index] = tensor.dtype.as_datatype_enum
-    # TODO(aselle): Support quantized flag properly
-    current_graph_def = _framework.fuse_op(
-        current_graph_def, input_names, output_names, output_dtypes,
-        output_quantized, call.uuid, call.function_name)
-    for node in current_graph_def.node:
-      if node.name == call.uuid:
-        for param, tensor in call.params.items():
-          node.attr[param].tensor.CopyFrom(tensor)
-  return current_graph_def
-
-
-_allowed_symbols = ["OpHint", "convert_op_hints_to_stubs"]
+
+  if session is not None and graph_def is not None:
+    raise ValueError("Provide only one of session and graph_def.")
+
+  if session is not None:
+    return _convert_op_hints_to_stubs_helper(session.graph_def, write_callback)
+  elif graph_def is not None:
+    return _convert_op_hints_to_stubs_helper(graph_def, write_callback)
+  else:
+    raise ValueError("Must specify session or graph_def as input.")
+
+
+_allowed_symbols = [
+    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new"
+]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index a76cc39635..7d7a4ba94a 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -47,6 +47,9 @@ def _get_toco_converter(flags):
 
   Returns:
     TocoConverter object.
+
+  Raises:
+    ValueError: Invalid flags.
   """
   # Parse input and output arrays.
   input_arrays = _parse_array(flags.input_arrays)
@@ -77,6 +80,9 @@ def _get_toco_converter(flags):
   elif flags.keras_model_file:
     converter_fn = lite.TocoConverter.from_keras_model_file
     converter_kwargs["model_file"] = flags.keras_model_file
+  else:
+    raise ValueError("--graph_def_file, --saved_model_dir, or "
+                     "--keras_model_file must be specified.")
 
   return converter_fn(**converter_kwargs)
 
diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
index b616e449e6..28a7e50003 100644
--- a/tensorflow/contrib/lite/schema/BUILD
+++ b/tensorflow/contrib/lite/schema/BUILD
@@ -48,7 +48,7 @@ exports_files([
     "schema_v3.fbs",
 ])
 
-load("//third_party/flatbuffers:build_defs.bzl", "flatbuffer_cc_library")
+load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
 
 # Generic schema for inference on device.
 flatbuffer_cc_library(
diff --git a/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc b/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
index cd46a06f7d..11057203a8 100644
--- a/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
+++ b/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include <fstream>
 #include <gtest/gtest.h>
-#include "flatbuffers/flatc.h"
+#include "flatbuffers/flatc.h"  // flatbuffers
 #include "tensorflow/core/platform/platform.h"
 
 #ifdef PLATFORM_GOOGLE
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 14f88b4c00..cf66403ec9 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -169,6 +169,10 @@ enum BuiltinOperator : byte {
   ONE_HOT = 85,
   LOGICAL_AND = 86,
   LOGICAL_NOT = 87,
+  UNPACK = 88,
+  REDUCE_MIN = 89,
+  FLOOR_DIV = 90,
+  REDUCE_ANY = 91,
 }
 
 // Options for the builtin operators.
@@ -236,6 +240,8 @@ union BuiltinOptions {
   OneHotOptions,
   LogicalAndOptions,
   LogicalNotOptions,
+  UnpackOptions,
+  FloorDivOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -565,6 +571,14 @@ table LogicalAndOptions {
 table LogicalNotOptions {
 }
 
+table UnpackOptions {
+  num:int;
+  axis:int;
+}
+
+table FloorDivOptions {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
@@ -631,9 +645,9 @@ table SubGraph {
 }
 
 // Table of raw data buffers (used for constant tensors). Referenced by tensors
-// by index.
+// by index. The generous alignment accommodates mmap-friendly data structures.
 table Buffer {
-  data:[ubyte];
+  data:[ubyte] (force_align: 16);
 }
 
 table Model {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 3efa153e2c..6d9630d75e 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -220,6 +220,12 @@ struct LogicalAndOptionsT;
 struct LogicalNotOptions;
 struct LogicalNotOptionsT;
 
+struct UnpackOptions;
+struct UnpackOptionsT;
+
+struct FloorDivOptions;
+struct FloorDivOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -373,11 +379,15 @@ enum BuiltinOperator {
   BuiltinOperator_ONE_HOT = 85,
   BuiltinOperator_LOGICAL_AND = 86,
   BuiltinOperator_LOGICAL_NOT = 87,
+  BuiltinOperator_UNPACK = 88,
+  BuiltinOperator_REDUCE_MIN = 89,
+  BuiltinOperator_FLOOR_DIV = 90,
+  BuiltinOperator_REDUCE_ANY = 91,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_LOGICAL_NOT
+  BuiltinOperator_MAX = BuiltinOperator_REDUCE_ANY
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[87] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[91] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -465,7 +475,11 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[87] {
     BuiltinOperator_LOGICAL_OR,
     BuiltinOperator_ONE_HOT,
     BuiltinOperator_LOGICAL_AND,
-    BuiltinOperator_LOGICAL_NOT
+    BuiltinOperator_LOGICAL_NOT,
+    BuiltinOperator_UNPACK,
+    BuiltinOperator_REDUCE_MIN,
+    BuiltinOperator_FLOOR_DIV,
+    BuiltinOperator_REDUCE_ANY
   };
   return values;
 }
@@ -560,6 +574,10 @@ inline const char **EnumNamesBuiltinOperator() {
     "ONE_HOT",
     "LOGICAL_AND",
     "LOGICAL_NOT",
+    "UNPACK",
+    "REDUCE_MIN",
+    "FLOOR_DIV",
+    "REDUCE_ANY",
     nullptr
   };
   return names;
@@ -635,11 +653,13 @@ enum BuiltinOptions {
   BuiltinOptions_OneHotOptions = 61,
   BuiltinOptions_LogicalAndOptions = 62,
   BuiltinOptions_LogicalNotOptions = 63,
+  BuiltinOptions_UnpackOptions = 64,
+  BuiltinOptions_FloorDivOptions = 65,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_LogicalNotOptions
+  BuiltinOptions_MAX = BuiltinOptions_FloorDivOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[64] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[66] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -704,7 +724,9 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[64] {
     BuiltinOptions_LogicalOrOptions,
     BuiltinOptions_OneHotOptions,
     BuiltinOptions_LogicalAndOptions,
-    BuiltinOptions_LogicalNotOptions
+    BuiltinOptions_LogicalNotOptions,
+    BuiltinOptions_UnpackOptions,
+    BuiltinOptions_FloorDivOptions
   };
   return values;
 }
@@ -775,6 +797,8 @@ inline const char **EnumNamesBuiltinOptions() {
     "OneHotOptions",
     "LogicalAndOptions",
     "LogicalNotOptions",
+    "UnpackOptions",
+    "FloorDivOptions",
     nullptr
   };
   return names;
@@ -1041,6 +1065,14 @@ template<> struct BuiltinOptionsTraits<LogicalNotOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_LogicalNotOptions;
 };
 
+template<> struct BuiltinOptionsTraits<UnpackOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UnpackOptions;
+};
+
+template<> struct BuiltinOptionsTraits<FloorDivOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_FloorDivOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1576,6 +1608,22 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_LogicalNotOptions ?
       reinterpret_cast<const LogicalNotOptionsT *>(value) : nullptr;
   }
+  UnpackOptionsT *AsUnpackOptions() {
+    return type == BuiltinOptions_UnpackOptions ?
+      reinterpret_cast<UnpackOptionsT *>(value) : nullptr;
+  }
+  const UnpackOptionsT *AsUnpackOptions() const {
+    return type == BuiltinOptions_UnpackOptions ?
+      reinterpret_cast<const UnpackOptionsT *>(value) : nullptr;
+  }
+  FloorDivOptionsT *AsFloorDivOptions() {
+    return type == BuiltinOptions_FloorDivOptions ?
+      reinterpret_cast<FloorDivOptionsT *>(value) : nullptr;
+  }
+  const FloorDivOptionsT *AsFloorDivOptions() const {
+    return type == BuiltinOptions_FloorDivOptions ?
+      reinterpret_cast<const FloorDivOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -5649,6 +5697,112 @@ inline flatbuffers::Offset<LogicalNotOptions> CreateLogicalNotOptions(
 
 flatbuffers::Offset<LogicalNotOptions> CreateLogicalNotOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct UnpackOptionsT : public flatbuffers::NativeTable {
+  typedef UnpackOptions TableType;
+  int32_t num;
+  int32_t axis;
+  UnpackOptionsT()
+      : num(0),
+        axis(0) {
+  }
+};
+
+struct UnpackOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef UnpackOptionsT NativeTableType;
+  enum {
+    VT_NUM = 4,
+    VT_AXIS = 6
+  };
+  int32_t num() const {
+    return GetField<int32_t>(VT_NUM, 0);
+  }
+  int32_t axis() const {
+    return GetField<int32_t>(VT_AXIS, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_NUM) &&
+           VerifyField<int32_t>(verifier, VT_AXIS) &&
+           verifier.EndTable();
+  }
+  UnpackOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(UnpackOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<UnpackOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const UnpackOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct UnpackOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_num(int32_t num) {
+    fbb_.AddElement<int32_t>(UnpackOptions::VT_NUM, num, 0);
+  }
+  void add_axis(int32_t axis) {
+    fbb_.AddElement<int32_t>(UnpackOptions::VT_AXIS, axis, 0);
+  }
+  explicit UnpackOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  UnpackOptionsBuilder &operator=(const UnpackOptionsBuilder &);
+  flatbuffers::Offset<UnpackOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<UnpackOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<UnpackOptions> CreateUnpackOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    int32_t num = 0,
+    int32_t axis = 0) {
+  UnpackOptionsBuilder builder_(_fbb);
+  builder_.add_axis(axis);
+  builder_.add_num(num);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<UnpackOptions> CreateUnpackOptions(flatbuffers::FlatBufferBuilder &_fbb, const UnpackOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct FloorDivOptionsT : public flatbuffers::NativeTable {
+  typedef FloorDivOptions TableType;
+  FloorDivOptionsT() {
+  }
+};
+
+struct FloorDivOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef FloorDivOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  FloorDivOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(FloorDivOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<FloorDivOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const FloorDivOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct FloorDivOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit FloorDivOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  FloorDivOptionsBuilder &operator=(const FloorDivOptionsBuilder &);
+  flatbuffers::Offset<FloorDivOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<FloorDivOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<FloorDivOptions> CreateFloorDivOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  FloorDivOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<FloorDivOptions> CreateFloorDivOptions(flatbuffers::FlatBufferBuilder &_fbb, const FloorDivOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -5971,6 +6125,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const LogicalNotOptions *builtin_options_as_LogicalNotOptions() const {
     return builtin_options_type() == BuiltinOptions_LogicalNotOptions ? static_cast<const LogicalNotOptions *>(builtin_options()) : nullptr;
   }
+  const UnpackOptions *builtin_options_as_UnpackOptions() const {
+    return builtin_options_type() == BuiltinOptions_UnpackOptions ? static_cast<const UnpackOptions *>(builtin_options()) : nullptr;
+  }
+  const FloorDivOptions *builtin_options_as_FloorDivOptions() const {
+    return builtin_options_type() == BuiltinOptions_FloorDivOptions ? static_cast<const FloorDivOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -6254,6 +6414,14 @@ template<> inline const LogicalNotOptions *Operator::builtin_options_as<LogicalN
   return builtin_options_as_LogicalNotOptions();
 }
 
+template<> inline const UnpackOptions *Operator::builtin_options_as<UnpackOptions>() const {
+  return builtin_options_as_UnpackOptions();
+}
+
+template<> inline const FloorDivOptions *Operator::builtin_options_as<FloorDivOptions>() const {
+  return builtin_options_as_FloorDivOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -8441,6 +8609,58 @@ inline flatbuffers::Offset<LogicalNotOptions> CreateLogicalNotOptions(flatbuffer
       _fbb);
 }
 
+inline UnpackOptionsT *UnpackOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new UnpackOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void UnpackOptions::UnPackTo(UnpackOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = num(); _o->num = _e; };
+  { auto _e = axis(); _o->axis = _e; };
+}
+
+inline flatbuffers::Offset<UnpackOptions> UnpackOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const UnpackOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUnpackOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<UnpackOptions> CreateUnpackOptions(flatbuffers::FlatBufferBuilder &_fbb, const UnpackOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const UnpackOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _num = _o->num;
+  auto _axis = _o->axis;
+  return tflite::CreateUnpackOptions(
+      _fbb,
+      _num,
+      _axis);
+}
+
+inline FloorDivOptionsT *FloorDivOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new FloorDivOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void FloorDivOptions::UnPackTo(FloorDivOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<FloorDivOptions> FloorDivOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const FloorDivOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateFloorDivOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<FloorDivOptions> CreateFloorDivOptions(flatbuffers::FlatBufferBuilder &_fbb, const FloorDivOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const FloorDivOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateFloorDivOptions(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -8882,6 +9102,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const LogicalNotOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_UnpackOptions: {
+      auto ptr = reinterpret_cast<const UnpackOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_FloorDivOptions: {
+      auto ptr = reinterpret_cast<const FloorDivOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -9152,6 +9380,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const LogicalNotOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_UnpackOptions: {
+      auto ptr = reinterpret_cast<const UnpackOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_FloorDivOptions: {
+      auto ptr = reinterpret_cast<const FloorDivOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -9410,6 +9646,14 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const LogicalNotOptionsT *>(value);
       return CreateLogicalNotOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_UnpackOptions: {
+      auto ptr = reinterpret_cast<const UnpackOptionsT *>(value);
+      return CreateUnpackOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_FloorDivOptions: {
+      auto ptr = reinterpret_cast<const FloorDivOptionsT *>(value);
+      return CreateFloorDivOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -9668,6 +9912,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new LogicalNotOptionsT(*reinterpret_cast<LogicalNotOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_UnpackOptions: {
+      value = new UnpackOptionsT(*reinterpret_cast<UnpackOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_FloorDivOptions: {
+      value = new FloorDivOptionsT(*reinterpret_cast<FloorDivOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -9990,6 +10242,16 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_UnpackOptions: {
+      auto ptr = reinterpret_cast<UnpackOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_FloorDivOptions: {
+      auto ptr = reinterpret_cast<FloorDivOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/string.h b/tensorflow/contrib/lite/string.h
index 7f8f4e851e..af3fadfcb3 100644
--- a/tensorflow/contrib/lite/string.h
+++ b/tensorflow/contrib/lite/string.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // Abstract string. We don't want even absl at this level.
-#ifndef _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
-#define _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_STRING_H_
+#define TENSORFLOW_CONTRIB_LITE_STRING_H_
 
 #include <string>
 
@@ -26,4 +26,4 @@ using std::string;
 
 }  // namespace tflite
 
-#endif  // _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_STRING_H_
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index a788d41ba7..89912fd116 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -162,11 +162,12 @@ cc_library(
         ":test_runner",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/delegates/eager:delegate",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "tflite_driver_test",
     size = "small",
     srcs = ["tflite_driver_test.cc"],
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 52ef0d5b86..a329bb3a25 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -926,6 +926,11 @@ def make_reduce_max_tests(zip_path):
   return make_reduce_tests(tf.reduce_max)(zip_path)
 
 
+def make_reduce_min_tests(zip_path):
+  """Make a set of tests to do min."""
+  return make_reduce_tests(tf.reduce_min)(zip_path)
+
+
 def make_exp_tests(zip_path):
   """Make a set of tests to do exp."""
 
@@ -1255,6 +1260,140 @@ def make_conv_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+# Note: This is a regression test for a bug (b/112436267) that Toco incorrectly
+# fuses weights when multiple Conv2D/FULLY_CONNECTED ops share the same constant
+# weight tensor.
+def make_conv_with_shared_weights_tests(zip_path):
+  """Make a test where 2 Conv ops shared the same constant weight tensor."""
+
+  test_parameters = [{
+      "input_shape": [[1, 10, 10, 3]],
+      "filter_shape": [[3, 3]],
+      "strides": [[1, 1, 1, 1]],
+      "dilations": [[1, 1, 1, 1]],
+      "padding": ["SAME"],
+      "data_format": ["NHWC"],
+      "channel_multiplier": [1],
+  }]
+
+  def get_tensor_shapes(parameters):
+    input_shape = parameters["input_shape"]
+    filter_size = parameters["filter_shape"]
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]
+    ]
+    return [input_shape, filter_shape]
+
+  def build_graph(parameters):
+    """Build a conv graph given `parameters`."""
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=input_shape)
+
+    # Construct a constant weights tensor which will be used by both Conv2D.
+    filter_tensor = tf.constant(
+        create_tensor_data(np.float32, filter_shape), dtype=tf.float32)
+    input_tensors = [input_tensor]
+
+    # Construct 2 Conv2D operations which use exactly the same input and
+    # weights.
+    result1 = tf.nn.conv2d(
+        input_tensor,
+        filter_tensor,
+        strides=parameters["strides"],
+        dilations=parameters["dilations"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    result2 = tf.nn.conv2d(
+        input_tensor,
+        filter_tensor,
+        strides=parameters["strides"],
+        dilations=parameters["dilations"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    # Add MUL ops after Conv2D ops. These MUL ops should be fused into the
+    # weights of Conv2D.
+    result1 = result1 * 2
+    result2 = result2 * 3
+    # Add the 2 results up.
+    out = result1 + result2
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    # Build list of input values either containing 1 tensor (input) or 2 tensors
+    # (input, filter) based on whether filter is constant or variable input.
+    input_shape, unused_filter_shape = get_tensor_shapes(parameters)
+    values = [create_tensor_data(np.float32, input_shape)]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+# Note: This is a regression test for a bug (b/112303004) that Toco incorrectly
+# transforms Conv into DepthwiseConv when two Conv ops share the same constant
+# weight tensor.
+def make_conv_to_depthwiseconv_with_shared_weights_tests(zip_path):
+  """Make a test where 2 Conv ops shared the same constant weight tensor."""
+
+  test_parameters = [{
+      "input_shape": [[1, 10, 10, 1]],
+      "filter_shape": [[3, 3]],
+      "strides": [[1, 1, 1, 1]],
+      "dilations": [[1, 1, 1, 1]],
+      "padding": ["SAME"],
+      "data_format": ["NHWC"],
+      "channel_multiplier": [3],
+  }]
+
+  def get_tensor_shapes(parameters):
+    input_shape = parameters["input_shape"]
+    filter_size = parameters["filter_shape"]
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]
+    ]
+    return [input_shape, filter_shape]
+
+  def build_graph(parameters):
+    """Build a conv graph given `parameters`."""
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=input_shape)
+
+    # Construct a constant weights tensor which will be used by both Conv2D.
+    filter_tensor = tf.constant(
+        create_tensor_data(np.float32, filter_shape), dtype=tf.float32)
+    input_tensors = [input_tensor]
+
+    # Construct 2 Conv2D operations which use exactly the same input and
+    # weights.
+    result1 = tf.nn.conv2d(
+        input_tensor,
+        filter_tensor,
+        strides=parameters["strides"],
+        dilations=parameters["dilations"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    result2 = tf.nn.conv2d(
+        input_tensor,
+        filter_tensor,
+        strides=parameters["strides"],
+        dilations=parameters["dilations"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    # Add the 2 results up.
+    out = result1 + result2
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    # Build list of input values either containing 1 tensor (input) or 2 tensors
+    # (input, filter) based on whether filter is constant or variable input.
+    input_shape, unused_filter_shape = get_tensor_shapes(parameters)
+    values = [create_tensor_data(np.float32, input_shape)]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_depthwiseconv_tests(zip_path):
   """Make a set of tests to do convolution."""
 
@@ -2239,7 +2378,7 @@ def make_lstm_tests(zip_path):
           "time_step_size": [1],
           "input_vec_size": [3],
           "num_cells": [4],
-          "split_tflite_lstm_inputs": [True, False],
+          "split_tflite_lstm_inputs": [False],
       },
   ]
 
@@ -3010,6 +3149,36 @@ def make_pack_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_unpack_tests(zip_path):
+  """Make a set of tests to do unstack."""
+
+  test_parameters = [{
+      "base_shape": [[3, 4, 3], [3, 4], [5, 6, 7, 8]],
+      "axis": [0, 1, 2, 3],
+  }]
+
+  def get_valid_axis(parameters):
+    """Return a tweaked version of 'axis'."""
+    axis = parameters["axis"]
+    shape = parameters["base_shape"][:]
+    while axis > len(shape) - 1:
+      axis -= 1
+    return axis
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name=("input"), shape=parameters["base_shape"])
+    outs = tf.unstack(input_tensor, axis=get_valid_axis(parameters))
+    return [input_tensor], outs
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(np.float32, shape=parameters["base_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def _make_logical_tests(op):
   """Make a set of tests to do logical operations."""
 
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
index f29c188e6c..62cbeccd33 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -114,7 +114,13 @@ bool GenerateTestSpecFromTensorflowModel(
     // different set.
     std::vector<string> input_values =
         GenerateInputValues(input_layer, input_layer_type, input_layer_shape);
-    if (input_values.empty()) return false;
+    if (input_values.empty()) {
+      std::cerr << "Unable to generate input values for the TensorFlow model. "
+                   "Make sure the correct values are defined for "
+                   "input_layer, input_layer_type, and input_layer_shape."
+                << std::endl;
+      return false;
+    }
 
     // Run TensorFlow.
     for (int j = 0; j < input_values.size(); j++) {
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.h b/tensorflow/contrib/lite/testing/parse_testdata.h
index d94361d735..26ee825866 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.h
+++ b/tensorflow/contrib/lite/testing/parse_testdata.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
-#define TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_PARSE_TESTDATA_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_PARSE_TESTDATA_H_
 
 #include <vector>
 #include "tensorflow/contrib/lite/interpreter.h"
@@ -72,4 +72,4 @@ bool ParseAndRunTests(std::istream* input, TestRunner* test_runner,
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_PARSE_TESTDATA_H_
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index ec435ca60d..30381ba028 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -179,7 +179,9 @@ void TfDriver::Invoke() {
   auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
                               output_names_, {}, &output_tensors_);
   if (!status.ok()) {
-    Invalidate("Failed to run input data on graph");
+    Invalidate(
+        "Failed to run input data on graph. Make sure the correct value is "
+        "defined for the input and output arrays.");
   }
 }
 
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
index 695c2a3de6..3874bc31d7 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -33,6 +33,7 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
     string input_layer_shape;
     string output_layer;
     int32_t num_runs_per_pass = 100;
+    string delegate;
   } values;
 
   std::vector<tensorflow::Flag> flags = {
@@ -42,18 +43,21 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
                        "Path of tensorflow lite model."),
       tensorflow::Flag("input_layer", &values.input_layer,
                        "Names of input tensors, separated by comma. Example: "
-                       "input_1,input_2"),
+                       "input_1,input_2."),
       tensorflow::Flag("input_layer_type", &values.input_layer_type,
                        "Data types of input tensors, separated by comma. "
-                       "Example: float,int"),
+                       "Example: float,int."),
       tensorflow::Flag(
           "input_layer_shape", &values.input_layer_shape,
-          "Shapes of input tensors, separated by colon. Example: 1,3,4,1:2"),
+          "Shapes of input tensors, separated by colon. Example: 1,3,4,1:2."),
       tensorflow::Flag("output_layer", &values.output_layer,
-                       "Names of output tensors, separated by comma. Example "
-                       "output_1,output_2"),
+                       "Names of output tensors, separated by comma. Example: "
+                       "output_1,output_2."),
       tensorflow::Flag("num_runs_per_pass", &values.num_runs_per_pass,
-                       "Number of full runs in each pass."),
+                       "[optional] Number of full runs in each pass."),
+      tensorflow::Flag("delegate", &values.delegate,
+                       "[optional] Delegate to use for executing ops. Must be "
+                       "`{\"\", EAGER}`"),
   };
 
   bool no_inputs = *argc == 1;
@@ -61,6 +65,14 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
   if (!success || no_inputs || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
     fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
     return {};
+  } else if (values.tensorflow_model.empty() || values.tflite_model.empty() ||
+             values.input_layer.empty() || values.input_layer_type.empty() ||
+             values.input_layer_shape.empty() || values.output_layer.empty()) {
+    fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return {};
+  } else if (!(values.delegate == "" || values.delegate == "EAGER")) {
+    fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return {};
   }
 
   return {values.tensorflow_model,
@@ -69,7 +81,8 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
           Split<string>(values.input_layer_type, ","),
           Split<string>(values.input_layer_shape, ":"),
           Split<string>(values.output_layer, ","),
-          values.num_runs_per_pass};
+          values.num_runs_per_pass,
+          values.delegate};
 }
 
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.cc b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
index 19f34c0a51..c6ca796ac2 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
@@ -33,7 +33,7 @@ bool RunDiffTest(const DiffOptions& options, int num_invocations) {
           options.input_layer_shape, options.output_layer)) {
     return false;
   }
-  TfLiteDriver tflite_driver(/*use_nnapi=*/true);
+  TfLiteDriver tflite_driver(/*use_nnapi=*/true, options.delegate);
   tflite_driver.LoadModel(options.tflite_model);
   return tflite::testing::ParseAndRunTests(&tflite_stream, &tflite_driver);
 }
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.h b/tensorflow/contrib/lite/testing/tflite_diff_util.h
index 4ab2f230fd..f67992139f 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.h
@@ -44,6 +44,9 @@ struct DiffOptions {
   // each of the passes. The first pass has a single inference, while the
   // second pass does multiple inferences back to back.
   int num_runs_per_pass;
+  // Path to the delegate library to be loaded in order to execute ops. Must be
+  // `{"", EAGER}`.
+  string delegate;
 };
 
 // Run a single TensorFLow Lite diff test with a given options.
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 4d08fb5458..1836eb53b9 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <iostream>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
 #include "tensorflow/contrib/lite/testing/split.h"
 
 namespace tflite {
@@ -135,7 +136,13 @@ class TfLiteDriver::Expectation {
   size_t num_elements_;
 };
 
-TfLiteDriver::TfLiteDriver(bool use_nnapi) : use_nnapi_(use_nnapi) {}
+TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
+    : use_nnapi_(use_nnapi) {
+  if (delegate_name == "EAGER") {
+    delegate_ = EagerDelegate::Create();
+  }
+}
+
 TfLiteDriver::~TfLiteDriver() {}
 
 void TfLiteDriver::AllocateTensors() {
@@ -165,6 +172,15 @@ void TfLiteDriver::LoadModel(const string& bin_file_path) {
   }
   interpreter_->UseNNAPI(use_nnapi_);
 
+  if (delegate_) {
+    if (interpreter_->ModifyGraphWithDelegate(delegate_.get(),
+                                              /*allow_dynamic_tensors=*/true) !=
+        kTfLiteOk) {
+      Invalidate("Unable to the build graph using the delegate");
+      return;
+    }
+  }
+
   must_allocate_tensors_ = true;
 }
 
@@ -286,28 +302,6 @@ bool TfLiteDriver::CheckResults() {
 
 void TfLiteDriver::ResetLSTMStateTensors() {
   interpreter_->ResetVariableTensorsToZero();
-
-  // Below is a workaround for initializing state tensors for LSTM.
-  // TODO(ycling): Remove the code below after nobody is using the 18-inputs
-  // definition.
-  for (auto node_index : interpreter_->execution_plan()) {
-    const auto& node_and_reg = interpreter_->node_and_registration(node_index);
-    const auto& node = node_and_reg->first;
-    const auto& registration = node_and_reg->second;
-
-    if (registration.builtin_code == tflite::BuiltinOperator_LSTM) {
-      const auto* params =
-          reinterpret_cast<const TfLiteLSTMParams*>(node.builtin_data);
-      if (params->kernel_type == kTfLiteLSTMFullKernel &&
-          node.inputs->size == 18 && node.outputs->size >= 2) {
-        // The first 2 outputs of LSTM are state tensors.
-        for (int i = 0; i < 2; ++i) {
-          int node_index = node.outputs->data[i];
-          ResetTensor(node_index);
-        }
-      }
-    }
-  }
 }
 
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.h b/tensorflow/contrib/lite/testing/tflite_driver.h
index 5493ba3631..aed35f877d 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.h
+++ b/tensorflow/contrib/lite/testing/tflite_driver.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <map>
 
+#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
@@ -28,7 +29,7 @@ namespace testing {
 // A test runner that feeds inputs into TF Lite and verifies its outputs.
 class TfLiteDriver : public TestRunner {
  public:
-  explicit TfLiteDriver(bool use_nnapi);
+  explicit TfLiteDriver(bool use_nnapi, const string& delegate = "");
   ~TfLiteDriver() override;
 
   void LoadModel(const string& bin_file_path) override;
@@ -52,6 +53,7 @@ class TfLiteDriver : public TestRunner {
 
   class Expectation;
 
+  std::unique_ptr<EagerDelegate> delegate_;
   bool use_nnapi_ = false;
   std::unique_ptr<FlatBufferModel> model_;
   std::unique_ptr<Interpreter> interpreter_;
diff --git a/tensorflow/contrib/lite/testing/tokenize.h b/tensorflow/contrib/lite/testing/tokenize.h
index 7ed8eb96b7..8195391851 100644
--- a/tensorflow/contrib/lite/testing/tokenize.h
+++ b/tensorflow/contrib/lite/testing/tokenize.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZE_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZE_H_
 
 #include <istream>
 #include <string>
@@ -39,4 +39,4 @@ void Tokenize(std::istream* input, TokenProcessor* processor);
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZE_H_
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 1f3ea2e1c7..18c904c6d4 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -106,6 +106,17 @@ class Allocator {
 
   // Core allocation routine.
   void Allocate(std::size_t size, Alloc* result) {
+    if (size == 0) {
+      // zero-sized arrays get a dummy alloc of (0, 0) that does not
+      // need to be kept in the books (no need to insert that into
+      // live_allocs_).
+      // Note: zero-sized arrays shouldn't exist, but handling that case
+      // here allows such pathological cases to get a cleaner error message
+      // later instead of generating spurious allocator failures.
+      result->start = 0;
+      result->end = 0;
+      return;
+    }
     // Naive algorithm: pick the first gap between live allocations,
     // that is wide enough for the new array.
     std::size_t pos = 0;
@@ -128,6 +139,11 @@ class Allocator {
   }
 
   void Deallocate(const Alloc& a) {
+    // Special-case dummy allocs for zero-sized arrays.
+    if (a.start == 0 && a.end == 0) {
+      // Nothing needs to be done, these aren't kept in the books.
+      return;
+    }
     auto iter = std::lower_bound(live_allocs_.begin(), live_allocs_.end(), a);
     CHECK(iter != live_allocs_.end());
     CHECK(*iter == a);
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 02671f0408..94602445c2 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -1967,6 +1967,20 @@ void ConvertCTCBeamSearchDecoderOperator(
   (*op->mutable_attr())["merge_repeated"].set_b(src_op.merge_repeated);
 }
 
+void ConvertUnpackOperator(const Model& model, const UnpackOperator& src_op,
+                           const char* op_name, GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* unpack_op = tensorflow_graph->add_node();
+  unpack_op->set_op(op_name);
+  unpack_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *unpack_op->add_input() = src_op.inputs[0];
+  const tensorflow::DataType data_type =
+      GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*unpack_op->mutable_attr())["T"].set_type(data_type);
+  (*unpack_op->mutable_attr())["num"].set_i(src_op.num);
+  (*unpack_op->mutable_attr())["axis"].set_i(src_op.axis);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -2118,7 +2132,7 @@ void ConvertOperator(const Model& model, const Operator& src_op,
                           tensorflow_graph, "Prod");
   } else if (src_op.type == OperatorType::kReduceMin) {
     ConvertReduceOperator(model,
-                          static_cast<const TensorFlowMaxOperator&>(src_op),
+                          static_cast<const TensorFlowMinOperator&>(src_op),
                           tensorflow_graph, "Min");
   } else if (src_op.type == OperatorType::kReduceMax) {
     ConvertReduceOperator(model,
@@ -2228,6 +2242,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertCTCBeamSearchDecoderOperator(
         model, static_cast<const CTCBeamSearchDecoderOperator&>(src_op),
         "CTCBeamSearchDecoder", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kUnpack) {
+    ConvertUnpackOperator(model, static_cast<const UnpackOperator&>(src_op),
+                          "Unpack", tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index 1ea83abf8e..e88839be5d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -48,7 +48,17 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
     // dimension.
     return false;
   }
-  auto& weights_array = model->GetArray(conv_op->inputs[1]);
+
+  const auto& weights_name = conv_op->inputs[1];
+  if (CountOpsWithInput(*model, weights_name) > 1) {
+    // TODO(yunluli): Come up with a way to do the weights shuffling only once.
+    AddMessageF(
+        "Not changing %s to DepthwiseConv because the weights is consumed by "
+        "another op.",
+        LogName(*conv_op));
+    return false;
+  }
+  auto& weights_array = model->GetArray(weights_name);
   if (!weights_array.buffer) {
     // Yield until the weights are resolved as a constant array.
     return false;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
index 76c6be00d4..b324631579 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
@@ -274,8 +274,14 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  const auto& weights = model->GetArray(preceding_op->inputs[1]);
-  const auto& bias = model->GetArray(preceding_op->inputs[2]);
+  const auto& weights_name = preceding_op->inputs[1];
+  const auto& bias_name = preceding_op->inputs[2];
+  const auto& weights = model->GetArray(weights_name);
+  const auto& bias = model->GetArray(bias_name);
+  const int count_ops_consuming_bias = CountOpsWithInput(*model, bias_name);
+  const int count_ops_consuming_weights =
+      CountOpsWithInput(*model, weights_name);
+
   if (binary_op->type == OperatorType::kAdd ||
       binary_op->type == OperatorType::kSub) {
     if (!bias.buffer) {
@@ -285,6 +291,13 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
           LogName(*binary_op), LogName(*preceding_op));
       return false;
     }
+    if (count_ops_consuming_bias > 1) {
+      AddMessageF(
+          "Not fusing %s because the bias of the preceding %s is consumed by "
+          "another op",
+          LogName(*binary_op), LogName(*preceding_op));
+      return false;
+    }
   } else {
     if (!weights.buffer || !bias.buffer) {
       AddMessageF(
@@ -293,6 +306,13 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
           LogName(*binary_op), LogName(*preceding_op));
       return false;
     }
+    if (count_ops_consuming_weights > 1 || count_ops_consuming_bias > 1) {
+      AddMessageF(
+          "Not fusing %s because the weights or bias of the preceding %s is "
+          "consumed by another op",
+          LogName(*binary_op), LogName(*preceding_op));
+      return false;
+    }
   }
 
   int count_ops_consuming_output =
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index d26c3b2878..502de88f7c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -274,6 +274,19 @@ bool PropagateMinMaxAmongArrays(Model* model,
   return changed;
 }
 
+bool HardcodeMinMaxForReshape(Model* model, Operator* op) {
+  Array& input = model->GetArray(op->inputs[0]);
+  Array& output = model->GetArray(op->outputs[0]);
+
+  // If input and output both exist or do not exist, do nothing.
+  if ((!input.minmax && !output.minmax) || (input.minmax && output.minmax)) {
+    return false;
+  }
+
+  // Otherwise propagate info amongst the input and output array.
+  return PropagateMinMaxAmongArrays(model, {op->inputs[0], op->outputs[0]});
+}
+
 bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
   CHECK_EQ(op->inputs.size(), LstmCellOperator::NUM_INPUTS);
   CHECK_EQ(op->outputs.size(), LstmCellOperator::NUM_OUTPUTS);
@@ -370,7 +383,6 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
     case OperatorType::kSlice:
     case OperatorType::kStridedSlice:
     case OperatorType::kSqueeze:
-    case OperatorType::kReshape:
     case OperatorType::kExpandDims:
     case OperatorType::kPad:
     case OperatorType::kGather:
@@ -416,6 +428,10 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
       changed = HardcodeMinMaxForLstmCell(model, op);
       break;
 
+    case OperatorType::kReshape:
+      changed = HardcodeMinMaxForReshape(model, op);
+      break;
+
     default:
       break;
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index c8310161cb..323eefcd3a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -227,6 +227,15 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
           ArrayDataType::kFloat;
       break;
     }
+    case OperatorType::kUnpack: {
+      CHECK_EQ(op->inputs.size(), 1);
+      const int output_size = op->outputs.size();
+      for (int i = 0; i < output_size; ++i) {
+        model->GetArray(op->outputs[i]).data_type =
+            model->GetArray(op->inputs[0]).data_type;
+      }
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 91e290439a..fa2be961f5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1629,6 +1629,32 @@ void ProcessOneHotOperator(Model* model, OneHotOperator* op) {
   }
 }
 
+void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
+  CHECK_EQ(op->inputs.size(), 1);
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+
+  const std::vector<int>& input_dims = input_array.shape().dims();
+  std::vector<int> output_dims;
+
+  output_dims.reserve(input_dims.size() - 1);
+  for (int i = 0; i < input_dims.size(); ++i) {
+    if (i != op->axis) {
+      output_dims.push_back(input_dims[i]);
+    }
+  }
+  for (const string& output_name : op->outputs) {
+    auto& output_array = model->GetArray(output_name);
+    if (output_array.has_shape()) {
+      return;
+    }
+    *output_array.mutable_shape()->mutable_dims() = output_dims;
+  }
+}
+
 }  // namespace
 
 bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
@@ -1880,6 +1906,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kOneHot:
       ProcessOneHotOperator(model, static_cast<OneHotOperator*>(op));
       break;
+    case OperatorType::kUnpack:
+      ProcessUnpackOperator(model, static_cast<UnpackOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index d395d7a6a0..f5f2f77460 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -117,6 +117,7 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
                                 &quantized_max);
   if (fakequant_op->narrow_range) {
     quantized_min++;
+    output_array.narrow_range = true;
   }
 
   // It is important for matching accuracy between TF training and TFLite
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
index 41562ab393..a6f665b5f0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
@@ -100,13 +100,7 @@ bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
 
   AddMessageF("Resolving constant reshape of %s", LogName(*op));
 
-  if (input_array.minmax) {
-    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
-  }
-  if (input_array.quantization_params) {
-    output_array.GetOrCreateQuantizationParams() =
-        input_array.GetQuantizationParams();
-  }
+  CopyMinMaxAndQuantizationRelatedFields(input_array, &output_array);
 
   // Erase input arrays if no longer used.
   for (const auto& input : op->inputs) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
index 0b0d070714..5cfa1a5582 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
@@ -128,15 +128,7 @@ bool ResolveConstantTile::Run(Model* model, std::size_t op_index) {
         multiples_array.data_type == ArrayDataType::kInt64)
       << "Only int32/int64 indices are supported";
 
-  // Copy min/max info if present. The ranges of the selected values may be
-  // a subset of the original range but we want to ensure the quantization
-  // params stay the same.
-  if (input_array.minmax) {
-    const auto& input_minmax = input_array.GetMinMax();
-    auto& output_minmax = output_array.GetOrCreateMinMax();
-    output_minmax.min = input_minmax.min;
-    output_minmax.max = input_minmax.max;
-  }
+  CopyMinMaxAndQuantizationRelatedFields(input_array, &output_array);
 
   CHECK(!output_array.buffer);
   switch (output_array.data_type) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
index 1fd20314b1..fe15dfa06f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -128,13 +128,7 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
   }
   const Array& input_array = model->GetArray(op->inputs[0]);
 
-  if (input_array.minmax) {
-    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
-  }
-  if (input_array.quantization_params) {
-    output_array.GetOrCreateQuantizationParams() =
-        input_array.GetQuantizationParams();
-  }
+  CopyMinMaxAndQuantizationRelatedFields(input_array, &output_array);
 
   if (op->perm.empty()) {
     // Yield until perm has been populated by ResolveTransposeAttributes.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index fe3882c28d..475415e481 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -246,8 +246,8 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
       }
       output_float_data[i] = outval;
     }
-  } else if (unary_op->type == OperatorType::kRelu6 &&
-             unary_op->type == OperatorType::kRelu1 &&
+  } else if (unary_op->type == OperatorType::kRelu6 ||
+             unary_op->type == OperatorType::kRelu1 ||
              unary_op->type == OperatorType::kRelu) {
     for (size_t i = 0; i < output_buffer_size; ++i) {
       const float value = (*input_float_data)[i];
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
index 5f0cece67a..fedf4441e2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -154,6 +154,7 @@ bool UnrollBatchMatMul::Run(Model* model, std::size_t op_index) {
   pack_op->inputs = pack_inputs;
   pack_op->outputs = {batch_op->outputs[0]};
   pack_op->axis = 0;
+  pack_op->values_count = pack_inputs.size();
   model->operators.emplace(tail_it, pack_op);
 
   // Remove the old batch matmul now that we've unrolled.
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index b7fffbce22..0e04ee4ccb 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1576,6 +1576,26 @@ tensorflow::Status ConvertPackOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertUnpackOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "Unpack");
+  auto op = absl::make_unique<UnpackOperator>();
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  QCHECK_EQ(num_inputs, 1);
+  op->inputs.push_back(node.input(0));
+  op->num = GetIntAttr(node, "num");
+  op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : 0;
+  op->dtype = ConvertDataType(toco::GetDataTypeAttr(node, "T"));
+
+  op->outputs.push_back(node.name());  // Implicit :0.
+  for (int i = 1; i < op->num; ++i) {
+    op->outputs.push_back(node.name() + ":" + std::to_string(i));
+  }
+  model->operators.emplace_back(std::move(op));
+  return tensorflow::Status::OK();
+}
+
 // Some TensorFlow ops only occur in graph cycles, representing
 // control flow. We do not currently support control flow, so we wouldn't
 // be able to fully support such graphs, including performing inference,
@@ -2020,6 +2040,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"TopK", ConvertTopKV2Operator},
       {"TopKV2", ConvertTopKV2Operator},
       {"Transpose", ConvertSimpleOperator<TransposeOperator, 2>},
+      {"Unpack", ConvertUnpackOperator},
   });
 }
 
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 412e14c4ad..3a909c3d8e 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -149,6 +149,7 @@ enum class OperatorType : uint8 {
   kLogicalNot,
   kLogicalOr,
   kCTCBeamSearchDecoder,
+  kUnpack,
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -1828,6 +1829,20 @@ struct LogicalOrOperator : Operator {
   LogicalOrOperator() : Operator(OperatorType::kLogicalOr) {}
 };
 
+// Unpack operator:
+//
+// Inputs:
+// Inputs[0]: required: A boolean input tensor.
+// Inputs[1]: required: reduction_indices.
+//
+// TensorFlow equivalent: tf.unstack.
+struct UnpackOperator : Operator {
+  UnpackOperator() : Operator(OperatorType::kUnpack) {}
+  int num;
+  int axis;
+  ArrayDataType dtype = ArrayDataType::kNone;
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.h b/tensorflow/contrib/lite/toco/python/toco_python_api.h
index 7e8ad9c1da..ee054bbed9 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.h
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
-#define _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
 
 #include <Python.h>
 #include <string>
@@ -33,4 +33,4 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
 
 }  // namespace toco
 
-#endif  // _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h
index 18ff73ac39..fda7743a27 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
 
 #include <string>
 #include <vector>
@@ -98,4 +98,4 @@ class ClusterFactoryInterface {
 
 }  // end namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h
index a15e480e70..b57bded305 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTERUTILS_H
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTERUTILS_H
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
 
 #include <string>
 
@@ -30,4 +30,4 @@ void Transpose2DTensor(const float* tensor, int row, int col,
 
 }  // end namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTERUTILS_H
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h
index 7d33dd1885..3334552afb 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
 
 #include <string>
 #include <unordered_map>
@@ -60,4 +60,4 @@ std::unique_ptr<tensorflow::GraphDef> MaybeReplaceCompositeSubgraph(
 
 }  // end namespace toco
 
-#endif  // CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h
index c4c6c34117..383fd99dff 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
 
 #include <string>
 #include <vector>
@@ -79,4 +79,4 @@ class SvdfClusterFactory : public ClusterFactoryInterface {
 
 }  // end namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 75808f2b69..e9383098cc 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -787,6 +787,25 @@ class ReduceMax
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class ReduceMin
+    : public BuiltinOperator<TensorFlowSumOperator, ::tflite::ReducerOptions,
+                             ::tflite::BuiltinOptions_ReducerOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateReducerOptions(*builder, op.keep_dims);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->keep_dims = options.keep_dims();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class ReduceProd
     : public BuiltinOperator<TensorFlowSumOperator, ::tflite::ReducerOptions,
                              ::tflite::BuiltinOptions_ReducerOptions> {
@@ -1091,6 +1110,24 @@ class CTCBeamSearchDecoder
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions,
+                                      ::tflite::BuiltinOptions_UnpackOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateUnpackOptions(*builder, op.num, op.axis);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->num = options.num();
+    op->axis = options.axis();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -1297,6 +1334,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
                                        OperatorType::kReduceProd));
   ops.push_back(MakeUnique<ReduceMax>(::tflite::BuiltinOperator_REDUCE_MAX,
                                       OperatorType::kReduceMax));
+  ops.push_back(MakeUnique<ReduceMin>(::tflite::BuiltinOperator_REDUCE_MIN,
+                                      OperatorType::kReduceMin));
   ops.push_back(
       MakeUnique<ResizeBilinear>(::tflite::BuiltinOperator_RESIZE_BILINEAR,
                                  OperatorType::kResizeBilinear));
@@ -1332,6 +1371,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       MakeUnique<Pack>(::tflite::BuiltinOperator_PACK, OperatorType::kPack));
   ops.push_back(MakeUnique<OneHot>(::tflite::BuiltinOperator_ONE_HOT,
                                    OperatorType::kOneHot));
+  ops.push_back(MakeUnique<Unpack>(::tflite::BuiltinOperator_UNPACK,
+                                   OperatorType::kUnpack));
 
   // Custom Operators.
   ops.push_back(
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index fc854461b4..bb0b457483 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -476,6 +476,16 @@ TEST_F(OperatorTest, BuiltinOneHot) {
   EXPECT_EQ(op.axis, output_toco_op->axis);
 }
 
+TEST_F(OperatorTest, BuiltinUnpack) {
+  UnpackOperator op;
+  op.num = 5;
+  op.axis = 2;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("UNPACK", OperatorType::kUnpack), op);
+  EXPECT_EQ(op.num, output_toco_op->num);
+  EXPECT_EQ(op.axis, output_toco_op->axis);
+}
+
 TEST_F(OperatorTest, CustomCTCBeamSearchDecoder) {
   CTCBeamSearchDecoderOperator op;
   op.beam_width = 3;
diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc
index 14168fa33f..204c0d101e 100644
--- a/tensorflow/contrib/lite/toco/toco_port.cc
+++ b/tensorflow/contrib/lite/toco/toco_port.cc
@@ -138,13 +138,15 @@ namespace port {
 #define close _close
 #define open _open
 #define read _read
-#define O_RDONLY _O_RDONLY
-#define O_CREAT _O_CREAT
-#define O_WRONLY _O_WRONLY
-// Windows does not support the same set of file permissions as other platforms.
+// Windows does not support the same set of file permissions as other platforms,
+// and also requires an explicit flag for binary file read/write support.
 constexpr int kFileCreateMode = _S_IREAD | _S_IWRITE;
+constexpr int kFileReadFlags = _O_RDONLY | _O_BINARY;
+constexpr int kFileWriteFlags = _O_WRONLY | _O_BINARY | _O_CREAT;
 #else
 constexpr int kFileCreateMode = 0664;
+constexpr int kFileReadFlags = O_RDONLY;
+constexpr int kFileWriteFlags = O_CREAT | O_WRONLY;
 #endif  // _WIN32
 
 static bool port_initialized = false;
@@ -197,7 +199,7 @@ tensorflow::Status GetContents(const string& path, string* output,
                                const file::Options& options) {
   output->clear();
 
-  int fd = open(path.c_str(), O_RDONLY);
+  int fd = open(path.c_str(), kFileReadFlags);
   if (fd == -1) {
     return tensorflow::errors::NotFound("can't open() for read");
   }
@@ -226,7 +228,7 @@ tensorflow::Status GetContents(const string& path, string* output,
 
 tensorflow::Status SetContents(const string& filename, const string& contents,
                                const file::Options& options) {
-  int fd = open(filename.c_str(), O_WRONLY | O_CREAT, kFileCreateMode);
+  int fd = open(filename.c_str(), kFileWriteFlags, kFileCreateMode);
   if (fd == -1) {
     return tensorflow::errors::Internal("can't open() for write");
   }
diff --git a/tensorflow/contrib/lite/toco/toco_types.h b/tensorflow/contrib/lite/toco/toco_types.h
index d72a3bd1f3..319f1066cd 100644
--- a/tensorflow/contrib/lite/toco/toco_types.h
+++ b/tensorflow/contrib/lite/toco/toco_types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TYPES_H_
 
 #include <string>
 #include "tensorflow/core/platform/platform.h"
@@ -42,4 +42,4 @@ using tensorflow::uint8;
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 2ad2719811..6ab93d9316 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -405,6 +405,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(LogicalNot)
     HANDLE_OPERATORTYPENAME_CASE(LogicalOr)
     HANDLE_OPERATORTYPENAME_CASE(CTCBeamSearchDecoder)
+    HANDLE_OPERATORTYPENAME_CASE(Unpack)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -2278,4 +2279,14 @@ void UndoWeightsShuffling(Model* model) {
   }
 }
 
+void CopyMinMaxAndQuantizationRelatedFields(const Array& src, Array* dst) {
+  if (src.minmax) {
+    dst->GetOrCreateMinMax() = src.GetMinMax();
+  }
+  if (src.quantization_params) {
+    dst->GetOrCreateQuantizationParams() = src.GetQuantizationParams();
+  }
+  dst->narrow_range = src.narrow_range;
+}
+
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index b99e6111fe..bdeb203024 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -348,6 +348,9 @@ tensorflow::Status NumElements(const std::vector<T>& shape, U* num_elements) {
 // so that the rest of toco doesn't need to know about shuffled weights.
 void UndoWeightsShuffling(Model* model);
 
+// Copies minmax, quantization_params, and narrow_range.
+void CopyMinMaxAndQuantizationRelatedFields(const Array& src, Array* dst);
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/BUILD b/tensorflow/contrib/lite/tools/accuracy/BUILD
new file mode 100644
index 0000000000..21941f5c8b
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/BUILD
@@ -0,0 +1,314 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
+
+common_linkopts = tflite_linkopts() + select({
+    "//conditions:default": [],
+    "//tensorflow:android": [
+        "-pie",
+        "-llog",
+    ],
+})
+
+cc_library(
+    name = "utils",
+    srcs = ["utils.cc"],
+    hdrs = ["utils.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "utils_test",
+    srcs = ["utils_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)",
+    ],
+    data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    deps = [
+        ":utils",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework_internal",
+                "//tensorflow/core:lib",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "run_tflite_model_op",
+    srcs = ["run_tflite_model_op.cc"],
+    copts = tflite_copts(),
+    deps = [
+        ":utils",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:tensorflow",
+                "//tensorflow/core:protos_all_cc",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:ops",
+            ],
+        },
+    ),
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "android_required_build_flags",
+    srcs = ["android_required_build_flags.cc"],
+    copts = tflite_copts(),
+)
+
+tf_cc_test(
+    name = "run_tflite_model_op_test",
+    srcs = ["run_tflite_model_op_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)",
+    ],
+    data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        ":run_tflite_model_op",
+        ":android_required_build_flags",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:framework_internal",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:protos_all_cc",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "stage",
+    hdrs = ["stage.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/cc:scope",
+    ],
+)
+
+cc_library(
+    name = "file_reader_stage",
+    srcs = ["file_reader_stage.cc"],
+    hdrs = ["file_reader_stage.h"],
+    deps = [
+        ":stage",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+    ],
+)
+
+tf_cc_test(
+    name = "file_reader_stage_test",
+    srcs = ["file_reader_stage_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    deps = [
+        ":file_reader_stage",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core/kernels:android_whole_file_read_ops",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "run_tflite_model_stage",
+    srcs = ["run_tflite_model_stage.cc"],
+    hdrs = ["run_tflite_model_stage.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":run_tflite_model_op",
+        ":stage",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+    ],
+)
+
+cc_library(
+    name = "accuracy_eval_stage",
+    hdrs = ["accuracy_eval_stage.h"],
+    copts = tflite_copts(),
+    deps = [
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "eval_pipeline",
+    srcs = ["eval_pipeline.cc"],
+    hdrs = ["eval_pipeline.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":accuracy_eval_stage",
+        ":stage",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "eval_pipeline_test",
+    srcs = ["eval_pipeline_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    deps = [
+        ":eval_pipeline",
+        "//tensorflow/cc:cc_ops",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "eval_pipeline_builder",
+    srcs = ["eval_pipeline_builder.cc"],
+    hdrs = ["eval_pipeline_builder.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":eval_pipeline",
+        ":accuracy_eval_stage",
+        ":stage",
+        "@com_google_absl//absl/memory",
+        "//tensorflow/cc:cc_ops",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "eval_pipeline_builder_test",
+    srcs = ["eval_pipeline_builder_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    deps = [
+        ":eval_pipeline_builder",
+        "//tensorflow/cc:cc_ops",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "csv_writer",
+    hdrs = ["csv_writer.h"],
+    copts = tflite_copts(),
+    deps = select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:lib",
+            ],
+        },
+    ),
+)
diff --git a/tensorflow/contrib/lite/tools/accuracy/README.md b/tensorflow/contrib/lite/tools/accuracy/README.md
new file mode 100644
index 0000000000..769ef201d2
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/README.md
@@ -0,0 +1,40 @@
+## TFLite accuracy library.
+
+This library provides evaluation pipelines that can be used to evaluate
+accuracy and other metrics of a model. The resulting binary can be run on
+a desktop or on a mobile device.
+
+## Usage
+The tool provides an evaluation pipeline with different stages. Each
+stage outputs a Tensorflow graph.
+A sample usage is shown below.
+
+```C++
+// First build the pipeline.
+EvalPipelineBuilder builder;
+std::unique_ptr<EvalPipeline> eval_pipeline;
+auto status = builder.WithInput("pipeline_input", DT_FLOAT)
+     .WithInputStage(&input_stage)
+     .WithRunModelStage(&run_model_stage)
+     .WithPreprocessingStage(&preprocess_stage)
+     .WithAccuracyEval(&eval)
+     .Build(scope, &eval_pipeline);
+TF_CHECK_OK(status);
+
+// Now run the pipeline with inputs and outputs.
+std::unique_ptr<Session> session(NewSession(SessionOptions()));
+TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
+Tensor input = ... read input for the model ...
+Tensor ground_truth = ... read ground truth for the model ...
+TF_CHECK_OK(eval_pipeline.Run(input1, ground_truth1));
+```
+For further examples, check the usage in [imagenet accuracy evaluation binary]
+(https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc)
+
+## Measuring accuracy of published models.
+
+### ILSVRC (Imagenet Large Scale Visual Recognition Contest) classification task
+For measuring accuracy for [ILSVRC 2012 image classification task]
+(http://www.image-net.org/challenges/LSVRC/2012/), the binary can be built
+using these
+[instructions.](ilsvrc/)
diff --git a/tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h b/tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h
new file mode 100644
index 0000000000..9cb843729a
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+namespace metrics {
+
+// Base class for evaluation stage that evaluates the accuracy of the model.
+// This stage calculates the accuracy metrics given the model outputs and
+// expected ground truth.
+class AccuracyEval {
+ public:
+  AccuracyEval() = default;
+  AccuracyEval(const AccuracyEval&) = delete;
+  AccuracyEval& operator=(const AccuracyEval&) = delete;
+
+  AccuracyEval(const AccuracyEval&&) = delete;
+  AccuracyEval& operator=(const AccuracyEval&&) = delete;
+
+  virtual ~AccuracyEval() = default;
+
+  // Evaluates the accuracy of the model for given `model_outputs` and the
+  // `ground truth`.
+  // Derived classes can do additional book keeping, calculate aggregrate
+  // statistics etc for the given model.
+  virtual Status ComputeEval(const std::vector<Tensor>& model_outputs,
+                             const Tensor& ground_truth) = 0;
+};
+}  //  namespace metrics
+}  //  namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/android_required_build_flags.cc b/tensorflow/contrib/lite/tools/accuracy/android_required_build_flags.cc
new file mode 100644
index 0000000000..7fa8986716
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/android_required_build_flags.cc
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tensorflow on Android requires selective registration to be enabled in order
+// for certain types (e.g. DT_UINT8) to work.
+// Checks below ensure that for Android build, the right flags are passed to
+// the compiler.
+
+#if defined(__ANDROID__) && (!defined(__ANDROID_TYPES_FULL__) || \
+                             !defined(SUPPORT_SELECTIVE_REGISTRATION))
+#error \
+    "Binary needs custom kernel support. For enabling custom kernels on " \
+    "Android, please pass -D__ANDROID_TYPES_FULL__ && " \
+    "-DSUPPORT_SELECTIVE_REGISTRATION for including the kernel in the binary."
+#endif
diff --git a/tensorflow/contrib/lite/tools/accuracy/csv_writer.h b/tensorflow/contrib/lite/tools/accuracy/csv_writer.h
new file mode 100644
index 0000000000..806b0d9418
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/csv_writer.h
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
+
+#include <fstream>
+#include <vector>
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace metrics {
+// A simple CSV writer that writes values of same type for fixed number of
+// columns. This supports a very limited set of CSV spec and doesn't do any
+// escaping.
+// Usage:
+// std::ofstream * output_stream = ...
+// CSVWriter writer({"column1", "column2"}, output_stream);
+// writer.WriteRow({4, 5});
+// writer.Flush(); // flush results immediately.
+class CSVWriter {
+ public:
+  CSVWriter(const std::vector<string>& columns, std::ofstream* output_stream)
+      : num_columns_(columns.size()), output_stream_(output_stream) {
+    TF_CHECK_OK(WriteRow(columns, output_stream_));
+  }
+
+  template <typename T>
+  Status WriteRow(const std::vector<T>& values) {
+    if (values.size() != num_columns_) {
+      return errors::InvalidArgument("Invalid size for row:", values.size(),
+                                     " expected: ", num_columns_);
+    }
+    return WriteRow(values, output_stream_);
+  }
+
+  void Flush() { output_stream_->flush(); }
+
+  ~CSVWriter() { output_stream_->flush(); }
+
+ private:
+  template <typename T>
+  static Status WriteRow(const std::vector<T>& values,
+                         std::ofstream* output_stream) {
+    bool first = true;
+    for (const auto& v : values) {
+      if (!first) {
+        (*output_stream) << ", ";
+      } else {
+        first = false;
+      }
+      (*output_stream) << v;
+    }
+    (*output_stream) << "\n";
+    if (!output_stream->good()) {
+      return errors::Internal("Writing to stream failed.");
+    }
+    return Status::OK();
+  }
+  const size_t num_columns_;
+  std::ofstream* output_stream_;
+};
+}  // namespace metrics
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.cc b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.cc
new file mode 100644
index 0000000000..a03aba6a26
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h"
+
+namespace tensorflow {
+namespace metrics {
+
+Status EvalPipeline::AttachSession(std::unique_ptr<Session> session) {
+  session_ = std::move(session);
+  TF_RETURN_IF_ERROR(session_->Create(model_graph_));
+  return Status::OK();
+}
+
+Status EvalPipeline::Run(const Tensor& input, const Tensor& ground_truth) {
+  if (session_ == nullptr) {
+    return errors::Internal("No session is associated with the graph.");
+  }
+  std::vector<Tensor> outputs;
+  TF_RETURN_IF_ERROR(session_->Run({{params_.model_input_node_name, input}},
+                                   {params_.model_output_node_name}, {},
+                                   &outputs));
+  TF_RETURN_IF_ERROR(eval_->ComputeEval(outputs, ground_truth));
+  return Status::OK();
+}
+}  //  namespace metrics
+}  //  namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h
new file mode 100644
index 0000000000..c9cfc86613
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
+
+#include <string>
+
+#include "tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h"
+#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace metrics {
+
+// Pipeline for evaluating a model.
+// Runs the graph and passes the output of graph to
+// the provided instance of AccuracyEval.
+// Example usage:
+// AccuracyEval *eval;
+// GraphDef graph_def;
+// ... populate graph_def...
+//
+// EvalPipeline eval_pipeline(&graph_def,
+//    {.model_input_node_name = "model_input",
+//     .model_output_node_name = "model_output"},
+//     eval);
+//  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+//  TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
+//  Tensor input = ... read input for the model ...
+//  Tensor ground_truth = ... read ground truth for the model ...
+//  TF_CHECK_OK(eval_pipeline.Run(input, ground_truth));
+//
+class EvalPipeline {
+ public:
+  struct Params {
+    string model_input_node_name;
+    string model_output_node_name;
+  };
+
+  // Creates a new `EvalPipeline` object. The ownership of the `accuracy_eval`
+  // is retained by the caller. Lifetime of `accuracy_eval` instance should
+  // be longer than the lifetime of this instance of pipeline.
+  EvalPipeline(const GraphDef& graph, const Params& params,
+               AccuracyEval* accuracy_eval)
+      : model_graph_(graph),
+        params_(params),
+        eval_(accuracy_eval),
+        session_(nullptr) {}
+
+  EvalPipeline(const EvalPipeline&) = delete;
+  EvalPipeline& operator=(const EvalPipeline&) = delete;
+
+  EvalPipeline(const EvalPipeline&&) = delete;
+  EvalPipeline& operator=(const EvalPipeline&&) = delete;
+
+  // Attaches the given session to this instance of pipeline.
+  // The provided session object will be reused for subsequent calls to
+  // EvalPipeline::Run.
+  Status AttachSession(std::unique_ptr<Session> session);
+
+  // Runs the model by feeding `input` and then passes the output of the model
+  // along with provided `ground_truth` to the AccuracyEval instance by calling
+  // AccuracyEval::ComputeEval.
+  Status Run(const Tensor& input, const Tensor& ground_truth);
+
+ private:
+  GraphDef model_graph_;
+  Params params_;
+  AccuracyEval* eval_;
+  std::unique_ptr<Session> session_;
+};
+}  //  namespace metrics
+}  //  namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.cc b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.cc
new file mode 100644
index 0000000000..2e16437e15
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.cc
@@ -0,0 +1,100 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h"
+
+#include "absl/memory/memory.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace metrics {
+
+EvalPipelineBuilder& EvalPipelineBuilder::WithInputStage(Stage* input_stage) {
+  input_stage_ = input_stage;
+  return *this;
+}
+
+EvalPipelineBuilder& EvalPipelineBuilder::WithPreprocessingStage(
+    Stage* preprocessing_stage) {
+  preprocessing_stage_ = preprocessing_stage;
+  return *this;
+}
+
+EvalPipelineBuilder& EvalPipelineBuilder::WithRunModelStage(
+    Stage* run_model_stage) {
+  run_model_stage_ = run_model_stage;
+  return *this;
+}
+
+EvalPipelineBuilder& EvalPipelineBuilder::WithAccuracyEval(
+    AccuracyEval* accuracy_eval) {
+  accuracy_eval_ = accuracy_eval;
+  return *this;
+}
+
+EvalPipelineBuilder& EvalPipelineBuilder::WithInput(const string& input_name,
+                                                    DataType input_type) {
+  input_name_ = input_name;
+  input_type_ = input_type;
+  return *this;
+}
+
+Status EvalPipelineBuilder::Build(
+    const Scope& scope, std::unique_ptr<EvalPipeline>* eval_pipeline) {
+  if (input_stage_ == nullptr) {
+    return errors::InvalidArgument("Input stage is null.");
+  }
+  if (preprocessing_stage_ == nullptr) {
+    return errors::InvalidArgument("Preprocessing stage is null.");
+  }
+  if (run_model_stage_ == nullptr) {
+    return errors::InvalidArgument("Run model stage is null.");
+  }
+  if (accuracy_eval_ == nullptr) {
+    return errors::InvalidArgument("accuracy_eval is null.");
+  }
+  if (input_name_.empty()) {
+    return errors::InvalidArgument("input name is not set.");
+  }
+  if (input_type_ == DT_INVALID) {
+    return errors::InvalidArgument("input type is not set.");
+  }
+
+  auto input_placeholder =
+      ops::Placeholder(scope.WithOpName(input_name_), input_type_);
+  TF_RETURN_IF_ERROR(scope.status());
+
+  input_stage_->AddToGraph(scope, input_placeholder);
+  TF_RETURN_IF_ERROR(scope.status());
+
+  preprocessing_stage_->AddToGraph(scope, input_stage_->Output());
+  TF_RETURN_IF_ERROR(scope.status());
+
+  run_model_stage_->AddToGraph(scope, preprocessing_stage_->Output());
+  TF_RETURN_IF_ERROR(scope.status());
+
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(scope.ToGraphDef(&graph_def));
+  EvalPipeline::Params params;
+  params.model_input_node_name = input_name_;
+  params.model_output_node_name = run_model_stage_->output_name();
+  *eval_pipeline =
+      absl::make_unique<EvalPipeline>(graph_def, params, accuracy_eval_);
+
+  return Status::OK();
+}
+
+}  //  namespace metrics
+}  //  namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h
new file mode 100644
index 0000000000..692db022f8
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h
@@ -0,0 +1,99 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h"
+#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h"
+#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+
+namespace tensorflow {
+namespace metrics {
+
+// A builder to simplify construction of an `EvalPipeline` instance.
+// The `Build` method creates an |EvalPipeline| with the following structure:
+// |input| -> |input_stage|
+//               |--> |preprocessing_stage|
+//                         |--> |run_model_stage| ->  |accuracy_eval_stage|.
+// The stages are chained in the order shown above. Any missing stage results in
+// an error. The ownership of the stage object is retained by the caller. Stage
+// objects need to exist until the |Build| method is called.
+//
+// Currently only single inputs are supported.
+//
+// Example Usage:
+// EvalPipelineBuilder builder;
+// std::unique_ptr<EvalPipeline> eval_pipeline;
+// auto status = builder.WithInput("pipeline_input", DT_FLOAT)
+//      .WithInputStage(&input_stage)
+//      .WithRunModelStage(&run_model_stage)
+//      .WithPreprocessingStage(&preprocess_stage)
+//      .WithAccuracyEval(&eval)
+//      .Build(scope, &eval_pipeline);
+// TF_CHECK_OK(status);
+class EvalPipelineBuilder {
+ public:
+  EvalPipelineBuilder() = default;
+  EvalPipelineBuilder(const EvalPipelineBuilder&) = delete;
+  EvalPipeline& operator=(const EvalPipelineBuilder&) = delete;
+
+  EvalPipelineBuilder(const EvalPipelineBuilder&&) = delete;
+  EvalPipeline& operator=(const EvalPipelineBuilder&&) = delete;
+
+  // Sets the input stage for the pipeline.
+  // Input stage converts the input, say filename into appropriate format
+  // that can be consumed by the preprocessing stage.
+  EvalPipelineBuilder& WithInputStage(Stage* input_stage);
+
+  // Sets the preprocessing stage for the pipeline.
+  // Preprocessing stage converts the input into a format that can be used to
+  // run the model.
+  EvalPipelineBuilder& WithPreprocessingStage(Stage* preprocessing_stage);
+
+  // Sets the run model stage for the pipeline.
+  // This stage receives the preprocessing input and output of this stage is
+  // fed to the accuracy eval stage.
+  EvalPipelineBuilder& WithRunModelStage(Stage* run_model_stage);
+
+  // Sets the accuracy eval for the pipeline.
+  // Results of evaluating the pipeline are fed to the `accuracy_eval` instance.
+  EvalPipelineBuilder& WithAccuracyEval(AccuracyEval* accuracy_eval);
+
+  // Sets the name and type of input for the pipeline.
+  // TODO(shashishekhar): Support multiple inputs for the pipeline, use a vector
+  // here.
+  EvalPipelineBuilder& WithInput(const string& input_name, DataType input_type);
+
+  // Builds the pipeline and assigns the pipeline to `eval_pipeline`.
+  // If the pipeline creation fails `eval_pipeline` is untouched.
+  Status Build(const Scope& scope,
+               std::unique_ptr<EvalPipeline>* eval_pipeline);
+
+ private:
+  Stage* input_stage_ = nullptr;
+  Stage* preprocessing_stage_ = nullptr;
+  Stage* run_model_stage_ = nullptr;
+  AccuracyEval* accuracy_eval_ = nullptr;
+  string input_name_;
+  DataType input_type_ = DT_INVALID;
+};
+
+}  //  namespace metrics
+}  //  namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder_test.cc b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder_test.cc
new file mode 100644
index 0000000000..2d41929b79
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder_test.cc
@@ -0,0 +1,229 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h"
+#include <gtest/gtest.h>
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace metrics {
+namespace {
+
+class IdentityStage : public Stage {
+ public:
+  IdentityStage(const string& name, const string& output)
+      : name_(name), output_(output) {}
+
+  void AddToGraph(const Scope& scope, const Input& input) override {
+    called_count_++;
+    inputs_.push_back(input.node()->name());
+    stage_output_ = ops::Identity(scope.WithOpName(output_), input);
+  }
+
+  string name() const override { return name_; }
+  string output_name() const override { return output_; }
+
+  int times_called() const { return called_count_; }
+
+  const std::vector<string> input_params() { return inputs_; }
+
+ private:
+  string name_;
+  string output_;
+  int called_count_ = 0;
+  std::vector<string> inputs_;
+};
+
+class FailingStage : public Stage {
+ public:
+  FailingStage(const string& name, const string& output)
+      : name_(name), output_(output) {}
+
+  void AddToGraph(const Scope& scope, const Input& input) override {
+    called_count_++;
+    scope.UpdateStatus(errors::Internal("Stage failed:", name_));
+  }
+
+  string name() const override { return name_; }
+  string output_name() const override { return output_; }
+
+  int times_called() const { return called_count_; }
+
+ private:
+  string name_;
+  string output_;
+  int called_count_ = 0;
+};
+
+class SimpleAccuracyEval : public AccuracyEval {
+ public:
+  SimpleAccuracyEval() {}
+
+  Status ComputeEval(const std::vector<Tensor>& model_outputs,
+                     const Tensor& ground_truth) override {
+    return Status::OK();
+  }
+};
+
+TEST(EvalPipelineBuilder, MissingPipelineStages) {
+  IdentityStage input_stage("input_stage", "input_stage_out");
+  IdentityStage run_model_stage("run_model", "run_model_out");
+  IdentityStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
+  const string pipeline_input = "pipeline_input";
+
+  SimpleAccuracyEval eval;
+
+  Scope scope = Scope::NewRootScope();
+  std::unique_ptr<EvalPipeline> eval_pipeline;
+  EvalPipelineBuilder builder;
+  auto status =
+      builder.WithInputStage(&input_stage).Build(scope, &eval_pipeline);
+  EXPECT_FALSE(status.ok());
+  EXPECT_FALSE(eval_pipeline);
+
+  status =
+      builder.WithRunModelStage(&run_model_stage).Build(scope, &eval_pipeline);
+  EXPECT_FALSE(status.ok());
+  EXPECT_FALSE(eval_pipeline);
+
+  status = builder.WithPreprocessingStage(&preprocess_stage)
+               .Build(scope, &eval_pipeline);
+  EXPECT_FALSE(status.ok());
+  EXPECT_FALSE(eval_pipeline);
+
+  status =
+      builder.WithInput(pipeline_input, DT_FLOAT).Build(scope, &eval_pipeline);
+  EXPECT_FALSE(status.ok());
+  EXPECT_FALSE(eval_pipeline);
+
+  status = builder.WithAccuracyEval(&eval).Build(scope, &eval_pipeline);
+  TF_CHECK_OK(status);
+  EXPECT_TRUE(eval_pipeline);
+}
+
+TEST(EvalPipeline, InputStageFailure) {
+  FailingStage input_stage("input_stage", "input_stage_out");
+  IdentityStage run_model_stage("run_model", "run_model_out");
+  IdentityStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
+  const string pipeline_input = "pipeline_input";
+
+  SimpleAccuracyEval eval;
+
+  Scope scope = Scope::NewRootScope();
+  std::unique_ptr<EvalPipeline> eval_pipeline;
+  EvalPipelineBuilder builder;
+  auto status = builder.WithInputStage(&input_stage)
+                    .WithRunModelStage(&run_model_stage)
+                    .WithPreprocessingStage(&preprocess_stage)
+                    .WithInput(pipeline_input, DT_FLOAT)
+                    .WithAccuracyEval(&eval)
+                    .Build(scope, &eval_pipeline);
+
+  EXPECT_FALSE(scope.status().ok());
+  // None of the other stages would have been called.
+  EXPECT_EQ(1, input_stage.times_called());
+  EXPECT_EQ(0, preprocess_stage.times_called());
+  EXPECT_EQ(0, run_model_stage.times_called());
+}
+
+TEST(EvalPipeline, PreprocessingFailure) {
+  IdentityStage input_stage("input_stage", "input_stage_out");
+  FailingStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
+  IdentityStage run_model_stage("run_model", "run_model_out");
+  const string pipeline_input = "pipeline_input";
+
+  SimpleAccuracyEval eval;
+
+  Scope scope = Scope::NewRootScope();
+  std::unique_ptr<EvalPipeline> eval_pipeline;
+  EvalPipelineBuilder builder;
+  auto status = builder.WithInputStage(&input_stage)
+                    .WithRunModelStage(&run_model_stage)
+                    .WithPreprocessingStage(&preprocess_stage)
+                    .WithInput(pipeline_input, DT_FLOAT)
+                    .WithAccuracyEval(&eval)
+                    .Build(scope, &eval_pipeline);
+
+  EXPECT_FALSE(status.ok());
+  // None of the other stages would have been called.
+  EXPECT_EQ(1, input_stage.times_called());
+  EXPECT_EQ(1, preprocess_stage.times_called());
+  EXPECT_EQ(0, run_model_stage.times_called());
+}
+
+TEST(EvalPipeline, GraphEvalFailure) {
+  IdentityStage input_stage("input_stage", "input_stage_out");
+  IdentityStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
+  FailingStage run_model_stage("run_model", "run_model_out");
+  const string pipeline_input = "pipeline_input";
+
+  SimpleAccuracyEval eval;
+
+  Scope scope = Scope::NewRootScope();
+  std::unique_ptr<EvalPipeline> eval_pipeline;
+  EvalPipelineBuilder builder;
+  auto status = builder.WithInputStage(&input_stage)
+                    .WithRunModelStage(&run_model_stage)
+                    .WithPreprocessingStage(&preprocess_stage)
+                    .WithInput(pipeline_input, DT_FLOAT)
+                    .WithAccuracyEval(&eval)
+                    .Build(scope, &eval_pipeline);
+
+  EXPECT_FALSE(status.ok());
+  // None of the other stages would have been called.
+  EXPECT_EQ(1, input_stage.times_called());
+  EXPECT_EQ(1, preprocess_stage.times_called());
+  EXPECT_EQ(1, run_model_stage.times_called());
+}
+
+TEST(EvalPipeline, PipelineHasCorrectSequence) {
+  IdentityStage input_stage("input_stage", "input_stage_out");
+  IdentityStage preprocess_stage("preprocess_stage", "preprocess_stage_out");
+  IdentityStage run_model_stage("run_model", "run_model_out");
+  const string pipeline_input = "pipeline_input";
+
+  SimpleAccuracyEval eval;
+
+  Scope scope = Scope::NewRootScope();
+  std::unique_ptr<EvalPipeline> eval_pipeline;
+  EvalPipelineBuilder builder;
+  auto status = builder.WithInputStage(&input_stage)
+                    .WithRunModelStage(&run_model_stage)
+                    .WithPreprocessingStage(&preprocess_stage)
+                    .WithInput(pipeline_input, DT_FLOAT)
+                    .WithAccuracyEval(&eval)
+                    .Build(scope, &eval_pipeline);
+  TF_CHECK_OK(status);
+
+  ASSERT_EQ(1, input_stage.times_called());
+  ASSERT_EQ(1, run_model_stage.times_called());
+  ASSERT_EQ(1, preprocess_stage.times_called());
+
+  EXPECT_EQ(pipeline_input, input_stage.input_params()[0]);
+  EXPECT_EQ(input_stage.output_name(), preprocess_stage.input_params()[0]);
+  EXPECT_EQ(preprocess_stage.output_name(), run_model_stage.input_params()[0]);
+}
+
+}  // namespace
+
+}  // namespace metrics
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_test.cc b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_test.cc
new file mode 100644
index 0000000000..ea0f6e19df
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_test.cc
@@ -0,0 +1,133 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h"
+#include <gtest/gtest.h>
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace metrics {
+namespace {
+
+Tensor CreateFloatTensor(float value) {
+  Tensor tensor(DT_FLOAT, TensorShape({}));
+  tensor.scalar<float>()() = value;
+  return tensor;
+}
+
+class NoOpAccuracyEval : public AccuracyEval {
+ public:
+  explicit NoOpAccuracyEval(const Status& status_to_return)
+      : status_to_return_(status_to_return) {}
+
+  Status ComputeEval(const std::vector<Tensor>& model_outputs,
+                     const Tensor& ground_truth) override {
+    model_outputs_ = model_outputs;
+    ground_truth_ = ground_truth;
+    was_called_ = true;
+    return status_to_return_;
+  }
+
+  bool WasCalled() { return was_called_; }
+  std::vector<Tensor> model_outputs() { return model_outputs_; }
+  Tensor ground_truth() { return ground_truth_; }
+
+ private:
+  std::vector<Tensor> model_outputs_;
+  Tensor ground_truth_;
+  Status status_to_return_;
+  bool was_called_ = false;
+};
+
+TEST(EvalPipeline, AccuracyEvalIsCalled) {
+  Scope scope = Scope::NewRootScope();
+  // A graph that adds 1 to input.
+  auto input = ops::Placeholder(scope.WithOpName("input"), DT_FLOAT);
+  auto add_node = ops::Add(scope.WithOpName("output"), input, 1.0f);
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  EvalPipeline::Params params;
+  params.model_input_node_name = "input";
+  params.model_output_node_name = "output";
+  NoOpAccuracyEval accuracy_eval(Status::OK());
+
+  EvalPipeline eval_pipeline(graph_def, params, &accuracy_eval);
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
+  TF_CHECK_OK(eval_pipeline.Run(CreateFloatTensor(5), CreateFloatTensor(27)));
+
+  EXPECT_TRUE(accuracy_eval.WasCalled());
+  auto outputs = accuracy_eval.model_outputs();
+  ASSERT_EQ(1, outputs.size());
+  EXPECT_EQ(6.0f, outputs[0].scalar<float>()());
+  // Ground truth is unchanged.
+  EXPECT_EQ(27, accuracy_eval.ground_truth().scalar<float>()());
+}
+
+TEST(EvalPipeline, EvalIsNotCalledOnGraphRunFailure) {
+  Scope scope = Scope::NewRootScope();
+  // A graph that adds 1 to input.
+  auto input = ops::Placeholder(scope.WithOpName("input"), DT_FLOAT);
+  auto add_node = ops::Add(scope.WithOpName("output"), input, 1.0f);
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  EvalPipeline::Params params;
+  params.model_input_node_name = "input";
+  params.model_output_node_name = "output";
+  NoOpAccuracyEval accuracy_eval(Status::OK());
+
+  EvalPipeline eval_pipeline(graph_def, params, &accuracy_eval);
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
+
+  // Pass a string tensor instead of a float tensor.
+  Tensor string_tensor(DT_STRING, TensorShape{});
+  auto status = eval_pipeline.Run(string_tensor, CreateFloatTensor(27));
+  EXPECT_FALSE(accuracy_eval.WasCalled());
+  EXPECT_FALSE(status.ok());
+}
+
+TEST(EvalPipeline, AccuracyEvalFailureResultsInFailure) {
+  Scope scope = Scope::NewRootScope();
+  // A graph that adds 1 to input.
+  auto input = ops::Placeholder(scope.WithOpName("input"), DT_FLOAT);
+  auto add_node = ops::Add(scope.WithOpName("output"), input, 1.0f);
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  EvalPipeline::Params params;
+  params.model_input_node_name = "input";
+  params.model_output_node_name = "output";
+  NoOpAccuracyEval accuracy_eval(errors::Internal("accuracy_fail"));
+
+  EvalPipeline eval_pipeline(graph_def, params, &accuracy_eval);
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(eval_pipeline.AttachSession(std::move(session)));
+  auto status = eval_pipeline.Run(CreateFloatTensor(5), CreateFloatTensor(27));
+
+  EXPECT_TRUE(accuracy_eval.WasCalled());
+  EXPECT_FALSE(status.ok());
+}
+
+}  // namespace
+
+}  // namespace metrics
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/delegates/eager/constants.h b/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.cc
index 7ed6ab7552..61bed369f8 100644
--- a/tensorflow/contrib/lite/delegates/eager/constants.h
+++ b/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.cc
@@ -12,18 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_
 
-namespace tflite {
-namespace eager {
+#include "tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h"
 
-// The prefix of Eager op custom code.
-// This will be matched agains the `custom_code` field in `OperatorCode`
-// Flatbuffer Table.
-constexpr char kCustomCodePrefix[] = "Eager";
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 
-}  // namespace eager
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_
+namespace tensorflow {
+namespace metrics {
+void FileReaderStage::AddToGraph(const Scope& scope, const Input& input) {
+  if (!scope.ok()) return;
+  Scope s = scope.WithOpName(name());
+  this->stage_output_ = ops::ReadFile(s.WithOpName(output_name()), input);
+}
+}  //  namespace metrics
+}  //  namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h b/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h
new file mode 100644
index 0000000000..18db5837c1
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
+
+#include <string>
+
+#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+
+namespace tensorflow {
+namespace metrics {
+// A stage for reading a file into |string|.
+// Inputs: a string tensor: |file_name|.
+// Outputs: a string tensor: contents of |file_name|.
+class FileReaderStage : public Stage {
+ public:
+  string name() const override { return "stage_filereader"; }
+  string output_name() const override { return "stage_filereader_output"; }
+
+  void AddToGraph(const Scope& scope, const Input& input) override;
+};
+}  //  namespace metrics
+}  //  namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/file_reader_stage_test.cc b/tensorflow/contrib/lite/tools/accuracy/file_reader_stage_test.cc
new file mode 100644
index 0000000000..a75f99187d
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/file_reader_stage_test.cc
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdio>
+#include <fstream>
+#include <memory>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace metrics {
+namespace {
+
+class TempFile {
+ public:
+  TempFile() {
+    string file_path;
+    if (Env::Default()->LocalTempFilename(&file_path)) {
+      file_path_ = file_path;
+      created_ = true;
+    }
+  }
+
+  string filepath() { return file_path_; }
+  bool CreateFileWithContents(const std::string& contents) {
+    if (!created_) {
+      return false;
+    }
+    std::fstream file(file_path_, std::ios_base::out);
+    if (file) {
+      file << contents;
+    }
+    return file.good();
+  }
+
+  ~TempFile() {
+    if (created_) {
+      std::remove(file_path_.c_str());
+    }
+  }
+
+ private:
+  bool created_ = false;
+  string file_path_;
+};
+
+TEST(FileReaderStageTest, FileIsRead) {
+  TempFile file;
+  const string kFileContents = "Hello world.";
+  ASSERT_TRUE(file.CreateFileWithContents(kFileContents));
+  Scope scope = Scope::NewRootScope();
+  FileReaderStage reader_stage;
+  reader_stage.AddToGraph(scope, file.filepath());
+  TF_CHECK_OK(scope.status());
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+  std::vector<Tensor> outputs;
+  auto run_status =
+      session->Run({},                               /*inputs*/
+                   {reader_stage.output_name()}, {}, /*target node names */
+                   &outputs);
+  TF_CHECK_OK(run_status);
+  EXPECT_EQ(1, outputs.size());
+  string contents = outputs[0].scalar<string>()();
+  EXPECT_EQ(kFileContents, contents);
+}
+
+TEST(FileReaderStageTest, InvalidFile) {
+  Scope scope = Scope::NewRootScope();
+  FileReaderStage reader_stage;
+  reader_stage.AddToGraph(scope, string("non_existent_file"));
+  TF_CHECK_OK(scope.status());
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+  std::vector<Tensor> outputs;
+  auto run_status =
+      session->Run({},                               /*inputs*/
+                   {reader_stage.output_name()}, {}, /*target node names */
+                   &outputs);
+  EXPECT_FALSE(run_status.ok());
+}
+
+}  // namespace
+
+}  // namespace metrics
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD
new file mode 100644
index 0000000000..db4b688a45
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD
@@ -0,0 +1,171 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
+
+common_linkopts = tflite_linkopts() + select({
+    "//conditions:default": [],
+    "//tensorflow:android": [
+        "-pie",
+        "-llog",
+    ],
+})
+
+cc_library(
+    name = "inception_preprocessing",
+    srcs = ["inception_preprocessing.cc"],
+    hdrs = ["inception_preprocessing.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags",
+        "//tensorflow/contrib/lite/tools/accuracy:stage",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core/kernels:android_tensorflow_image_op",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:tensorflow",
+                "//tensorflow/core:protos_all_cc",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:ops",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "inception_preprocessing_test",
+    srcs = ["inception_preprocessing_test.cc"],
+    args = [
+        "--test_image=$(location :testdata/grace_hopper.jpg)",
+    ],
+    data = [":testdata/grace_hopper.jpg"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    deps = [
+        ":inception_preprocessing",
+        "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework_internal",
+                "//tensorflow/core:lib",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "imagenet_topk_eval",
+    srcs = ["imagenet_topk_eval.cc"],
+    hdrs = ["imagenet_topk_eval.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite/tools/accuracy:accuracy_eval_stage",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "imagenet_topk_eval_test",
+    srcs = ["imagenet_topk_eval_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    deps = [
+        ":imagenet_topk_eval",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "imagenet_model_evaluator",
+    srcs = ["imagenet_model_evaluator.cc"],
+    hdrs = ["imagenet_model_evaluator.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":imagenet_topk_eval",
+        ":inception_preprocessing",
+        "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags",
+        "//tensorflow/contrib/lite/tools/accuracy:eval_pipeline",
+        "//tensorflow/contrib/lite/tools/accuracy:eval_pipeline_builder",
+        "//tensorflow/contrib/lite/tools/accuracy:file_reader_stage",
+        "//tensorflow/contrib/lite/tools/accuracy:run_tflite_model_stage",
+        "//tensorflow/contrib/lite/tools/accuracy:utils",
+        "@com_google_absl//absl/memory",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core/kernels:android_whole_file_read_ops",
+                "//tensorflow/core/kernels:android_tensorflow_image_op",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:tensorflow",
+                "//tensorflow/core:framework_internal",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:core_cpu",
+            ],
+        },
+    ),
+)
+
+tf_cc_binary(
+    name = "imagenet_accuracy_eval",
+    srcs = ["imagenet_accuracy_eval.cc"],
+    copts = tflite_copts(),
+    linkopts = common_linkopts,
+    deps = [
+        ":imagenet_model_evaluator",
+        ":imagenet_topk_eval",
+        "@com_google_absl//absl/memory",
+        "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags",
+        "//tensorflow/contrib/lite/tools/accuracy:csv_writer",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:lib",
+                "//tensorflow/core:framework_internal",
+            ],
+        },
+    ),
+)
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md
new file mode 100644
index 0000000000..3c6a0d85b3
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md
@@ -0,0 +1,138 @@
+## Accuracy evaluation for ILSVRC 2012 (Imagenet Large Scale Visual Recognition Challenge) image classification task
+
+This binary can evaluate the accuracy of TFLite models trained for the [ILSVRC 2012 image classification task]
+(http://www.image-net.org/challenges/LSVRC/2012/).
+The binary takes the path to validation images and labels as inputs. It outputs the accuracy after running the TFLite model on the validation sets.
+
+To run the binary download the ILSVRC 2012 devkit [see instructions](#downloading-ilsvrc) and run the [`generate_validation_ground_truth` script](#ground-truth-label-generation) to generate the ground truth labels.
+
+## Parameters
+The binary takes the following parameters:
+
+*   `model_file` : `string` \
+    Path to the TFlite model file.
+
+*   `ground_truth_images_path`: `string` \
+    The path to the directory containing ground truth images.
+
+*   `ground_truth_labels`: `string` \
+    Path to ground truth labels file. This file should contain the same number of labels as    the number images in the ground truth directory. The labels are assumed to be in the
+    same order as the sorted filename of images. See [ground truth label generation](#ground-truth-label-generation)
+    section for more information about how to generate labels for images.
+
+*    `model_output_labels`: `string` \
+    Path to the file containing labels, that is used to interpret the output of
+    the model. E.g. in case of mobilenets, this is the path to
+    `mobilenet_labels.txt` where each label is in the same order as the output
+    1001 dimension tensor.
+
+*   `output_path`: `string` \
+    This is the path to the output file. The output is a CSV file that has top-10 accuracies in each row. Each line of output file is the cumulative accuracy after processing images in a sorted order. So first line is accuracy after processing the first image, second line is accuracy after procesing first two images. The last line of the file is accuracy after processing the entire validation set.
+
+and the following optional parameters:
+*   `num_images`: `int` (default=0) \
+    The number of images to process, if 0, all images in the directory are processed otherwise only num_images will be processed.
+
+## Downloading ILSVRC
+In order to use this tool to run evaluation on the full 50K ImageNet dataset,
+download the data set from http://image-net.org/request.
+
+## Ground truth label generation
+The ILSVRC 2012 devkit `validation_ground_truth.txt` contains IDs that correspond to synset of the image. 
+The accuracy binary however expects the ground truth labels to contain the actual name of 
+category instead of synset ids. A conversion script has been provided to convert the validation ground truth to
+category labels. The `validation_ground_truth.txt` can be converted by the following steps:
+
+```
+ILSVRC_2012_DEVKIT_DIR=[set to path to ILSVRC 2012 devkit]
+VALIDATION_LABELS=[set to  path to output]
+
+python generate_validation_labels -- \
+--ilsvrc_devkit_dir=${ILSVRC_2012_DEVKIT_DIR} \
+--validation_labels_output=${VALIDATION_LABELS}
+```
+
+## Running the binary
+
+### On Android
+
+(0) Refer to https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android for configuring NDK and SDK.
+
+(1) Build using the following command:
+
+```
+bazel build -c opt \
+  --config=android_arm \
+  --config=monolithic \
+  --cxxopt='--std=c++11' \
+  --copt=-D__ANDROID_TYPES_FULL__ \
+  --copt=-DSUPPORT_SELECTIVE_REGISTRATION \
+  //tensorflow/contrib/lite/tools/accuracy/ilsvrc:imagenet_accuracy_eval
+```
+
+(2) Connect your phone. Push the binary to your phone with adb push
+     (make the directory if required):
+
+```
+adb push bazel-bin/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval /data/local/tmp
+```
+
+(3) Make the binary executable.
+
+```
+adb shell chmod +x /data/local/tmp/imagenet_accuracy_eval
+```
+
+(4) Push the TFLite model  that you need to test. For example:
+
+```
+adb push mobilenet_quant_v1_224.tflite /data/local/tmp
+```
+
+(5) Push the imagenet images to device, make sure device has sufficient storage available before pushing the dataset:
+
+```
+adb shell mkdir /data/local/tmp/ilsvrc_images && \
+adb push ${IMAGENET_IMAGES_DIR} /data/local/tmp/ilsvrc_images
+```
+
+(6) Push the generated validation ground labels to device.
+
+```
+adb push ${VALIDATION_LABELS} /data/local/tmp/ilsvrc_validation_labels.txt
+```
+
+(7) Push the model labels text file to device.
+
+```
+adb push ${MODEL_LABELS_TXT} /data/local/tmp/model_output_labels.txt
+```
+
+(8) Run the binary.
+
+```
+adb shell /data/local/tmp/imagenet_accuracy_eval \
+  --model_file=/data/local/tmp/mobilenet_quant_v1_224.tflite \
+  --ground_truth_images_path=/data/local/tmp/ilsvrc_images \
+  --ground_truth_labels=/data/local/tmp/ilsvrc_validation_labels.txt \
+  --model_output_labels=/data/local/tmp/model_output_labels.txt \
+  --output_file_path=/data/local/tmp/accuracy_output.txt \
+  --num_images=0 # Run on all images.
+```
+
+###  On Desktop
+
+(1) Build and run using the following command:
+
+```
+bazel run -c opt \
+  --cxxopt='--std=c++11' \
+  -- \
+  //tensorflow/contrib/lite/tools/accuracy/ilsvrc:imagenet_accuracy_eval \
+  --model_file=mobilenet_quant_v1_224.tflite \
+  --ground_truth_images_path=${IMAGENET_IMAGES_DIR} \
+  --ground_truth_labels=${VALIDATION_LABELS} \
+  --model_output_labels=${MODEL_LABELS_TXT} \
+  --output_file_path=/tmp/accuracy_output.txt \
+  --num_images=0 # Run on all images.
+```
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
new file mode 100644
index 0000000000..f361341f7c
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
@@ -0,0 +1,148 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <iomanip>
+#include <memory>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/contrib/lite/tools/accuracy/csv_writer.h"
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h"
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace metrics {
+
+namespace {
+
+std::vector<double> GetAccuracies(
+    const ImagenetTopKAccuracy::AccuracyStats& accuracy_stats) {
+  std::vector<double> results;
+  results.reserve(accuracy_stats.number_of_images);
+  if (accuracy_stats.number_of_images > 0) {
+    for (int n : accuracy_stats.topk_counts) {
+      double accuracy = 0;
+      if (accuracy_stats.number_of_images > 0) {
+        accuracy = (n * 100.0) / accuracy_stats.number_of_images;
+      }
+      results.push_back(accuracy);
+    }
+  }
+  return results;
+}
+
+}  // namespace
+
+// Writes results to a CSV file.
+class ResultsWriter : public ImagenetModelEvaluator::Observer {
+ public:
+  explicit ResultsWriter(std::unique_ptr<CSVWriter> writer)
+      : writer_(std::move(writer)) {}
+
+  void OnEvaluationStart(int total_number_of_images) override {}
+
+  void OnSingleImageEvaluationComplete(
+      const ImagenetTopKAccuracy::AccuracyStats& stats,
+      const string& image) override;
+
+ private:
+  std::unique_ptr<CSVWriter> writer_;
+};
+
+void ResultsWriter::OnSingleImageEvaluationComplete(
+    const ImagenetTopKAccuracy::AccuracyStats& stats, const string& image) {
+  TF_CHECK_OK(writer_->WriteRow(GetAccuracies(stats)));
+  writer_->Flush();
+}
+
+// Logs results to standard output with `kLogDelayUs` microseconds.
+class ResultsLogger : public ImagenetModelEvaluator::Observer {
+ public:
+  void OnEvaluationStart(int total_number_of_images) override;
+
+  void OnSingleImageEvaluationComplete(
+      const ImagenetTopKAccuracy::AccuracyStats& stats,
+      const string& image) override;
+
+ private:
+  int total_num_images_ = 0;
+  uint64 last_logged_time_us_ = 0;
+  static constexpr int kLogDelayUs = 500 * 1000;
+};
+
+void ResultsLogger::OnEvaluationStart(int total_number_of_images) {
+  total_num_images_ = total_number_of_images;
+  LOG(ERROR) << "Starting model evaluation: " << total_num_images_;
+}
+
+void ResultsLogger::OnSingleImageEvaluationComplete(
+    const ImagenetTopKAccuracy::AccuracyStats& stats, const string& image) {
+  int num_evaluated = stats.number_of_images;
+
+  double current_percent = num_evaluated * 100.0 / total_num_images_;
+  auto now_us = Env::Default()->NowMicros();
+
+  if ((now_us - last_logged_time_us_) >= kLogDelayUs) {
+    last_logged_time_us_ = now_us;
+
+    LOG(ERROR) << "Evaluated " << num_evaluated << "/" << total_num_images_
+               << " images, " << std::setprecision(2) << std::fixed
+               << current_percent << "%";
+  }
+}
+
+int Main(int argc, char* argv[]) {
+  // TODO(shashishekhar): Make this binary configurable and model
+  // agnostic.
+  string output_file_path;
+  std::vector<Flag> flag_list = {
+      Flag("output_file_path", &output_file_path, "Path to output file."),
+  };
+  Flags::Parse(&argc, argv, flag_list);
+
+  std::unique_ptr<ImagenetModelEvaluator> evaluator;
+  CHECK(!output_file_path.empty()) << "Invalid output file path.";
+
+  TF_CHECK_OK(ImagenetModelEvaluator::Create(argc, argv, &evaluator));
+
+  std::ofstream output_stream(output_file_path, std::ios::out);
+  CHECK(output_stream) << "Unable to open output file path: '"
+                       << output_file_path << "'";
+
+  output_stream << std::setprecision(3) << std::fixed;
+  std::vector<string> columns;
+  columns.reserve(evaluator->params().num_ranks);
+  for (int i = 0; i < evaluator->params().num_ranks; i++) {
+    string column_name = "Top ";
+    tensorflow::strings::StrAppend(&column_name, i + 1);
+    columns.push_back(column_name);
+  }
+
+  ResultsWriter results_writer(
+      absl::make_unique<CSVWriter>(columns, &output_stream));
+  ResultsLogger logger;
+  evaluator->AddObserver(&results_writer);
+  evaluator->AddObserver(&logger);
+  TF_CHECK_OK(evaluator->EvaluateModel());
+  return 0;
+}
+
+}  // namespace metrics
+}  // namespace tensorflow
+
+int main(int argc, char* argv[]) {
+  return tensorflow::metrics::Main(argc, argv);
+}
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
new file mode 100644
index 0000000000..a88a4a0fce
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
@@ -0,0 +1,206 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h"
+
+#include <fstream>
+#include <iomanip>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h"
+#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h"
+#include "tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h"
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
+#include "tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h"
+#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace {
+using tensorflow::string;
+
+string StripTrailingSlashes(const string& path) {
+  int end = path.size();
+  while (end > 0 && path[end - 1] == '/') {
+    end--;
+  }
+  return path.substr(0, end);
+}
+
+tensorflow::Tensor CreateStringTensor(const string& value) {
+  tensorflow::Tensor tensor(tensorflow::DT_STRING, tensorflow::TensorShape({}));
+  tensor.scalar<string>()() = value;
+  return tensor;
+}
+
+template <typename T>
+std::vector<T> GetFirstN(const std::vector<T>& v, int n) {
+  if (n >= v.size()) return v;
+  std::vector<T> result(v.begin(), v.begin() + n);
+  return result;
+}
+
+// File pattern for imagenet files.
+const char* const kImagenetFilePattern = "*.[jJ][pP][eE][gG]";
+
+}  // namespace
+
+namespace tensorflow {
+namespace metrics {
+
+/*static*/ Status ImagenetModelEvaluator::Create(
+    int argc, char* argv[],
+    std::unique_ptr<ImagenetModelEvaluator>* model_evaluator) {
+  Params params;
+  const std::vector<Flag> flag_list = {
+      Flag("model_output_labels", &params.model_output_labels_path,
+           "Path to labels that correspond to output of model."
+           " E.g. in case of mobilenet, this is the path to label "
+           "file where each label is in the same order as the output"
+           " of the model."),
+      Flag("ground_truth_images_path", &params.ground_truth_images_path,
+           "Path to ground truth images."),
+      Flag("ground_truth_labels", &params.ground_truth_labels_path,
+           "Path to ground truth labels."),
+      Flag("num_images", &params.number_of_images,
+           "Number of examples to evaluate, pass 0 for all "
+           "examples. Default: 100"),
+      tensorflow::Flag("model_file", &params.model_file_path,
+                       "Path to test tflite model file."),
+  };
+  const bool parse_result = Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result)
+    return errors::InvalidArgument("Invalid command line flags");
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      Env::Default()->IsDirectory(params.ground_truth_images_path),
+      "Invalid ground truth data path.");
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      Env::Default()->FileExists(params.ground_truth_labels_path),
+      "Invalid ground truth labels path.");
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      Env::Default()->FileExists(params.model_output_labels_path),
+      "Invalid model output labels path.");
+
+  if (params.number_of_images < 0) {
+    return errors::InvalidArgument("Invalid: num_examples");
+  }
+
+  utils::ModelInfo model_info;
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      utils::GetTFliteModelInfo(params.model_file_path, &model_info),
+      "Invalid TFLite model.");
+
+  *model_evaluator =
+      absl::make_unique<ImagenetModelEvaluator>(model_info, params);
+  return Status::OK();
+}
+
+Status ImagenetModelEvaluator::EvaluateModel() {
+  if (model_info_.input_shapes.size() != 1) {
+    return errors::InvalidArgument("Invalid input shape");
+  }
+
+  const TensorShape& input_shape = model_info_.input_shapes[0];
+  // Input should be of the shape {1, height, width, 3}
+  if (input_shape.dims() != 4 || input_shape.dim_size(3) != 3) {
+    return errors::InvalidArgument("Invalid input shape for the model.");
+  }
+
+  const int image_height = input_shape.dim_size(1);
+  const int image_width = input_shape.dim_size(2);
+  const bool is_quantized = (model_info_.input_types[0] == DT_UINT8);
+
+  RunTFLiteModelStage::Params tfl_model_params;
+  tfl_model_params.model_file_path = params_.model_file_path;
+  if (is_quantized) {
+    tfl_model_params.input_type = {DT_UINT8};
+    tfl_model_params.output_type = {DT_UINT8};
+  } else {
+    tfl_model_params.input_type = {DT_FLOAT};
+    tfl_model_params.output_type = {DT_FLOAT};
+  }
+
+  Scope root = Scope::NewRootScope();
+  FileReaderStage reader;
+  InceptionPreprocessingStage inc(image_height, image_width, is_quantized);
+  RunTFLiteModelStage tfl_model_stage(tfl_model_params);
+  EvalPipelineBuilder builder;
+  std::vector<string> model_labels;
+  TF_RETURN_IF_ERROR(
+      utils::ReadFileLines(params_.model_output_labels_path, &model_labels));
+  if (model_labels.size() != 1001) {
+    return errors::InvalidArgument("Invalid number of labels: ",
+                                   model_labels.size());
+  }
+
+  ImagenetTopKAccuracy eval(model_labels, params_.num_ranks);
+  std::unique_ptr<EvalPipeline> eval_pipeline;
+
+  auto build_status = builder.WithInputStage(&reader)
+                          .WithPreprocessingStage(&inc)
+                          .WithRunModelStage(&tfl_model_stage)
+                          .WithAccuracyEval(&eval)
+                          .WithInput("input_file", DT_STRING)
+                          .Build(root, &eval_pipeline);
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(build_status,
+                                  "Failure while building eval pipeline.");
+
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+
+  TF_RETURN_IF_ERROR(eval_pipeline->AttachSession(std::move(session)));
+  string data_path =
+      StripTrailingSlashes(params_.ground_truth_images_path) + "/";
+
+  const string imagenet_file_pattern = data_path + kImagenetFilePattern;
+  std::vector<string> image_files;
+  TF_CHECK_OK(
+      Env::Default()->GetMatchingPaths(imagenet_file_pattern, &image_files));
+  std::vector<string> image_labels;
+  TF_CHECK_OK(
+      utils::ReadFileLines(params_.ground_truth_labels_path, &image_labels));
+  CHECK_EQ(image_files.size(), image_labels.size());
+
+  // Process files in filename sorted order.
+  std::sort(image_files.begin(), image_files.end());
+  if (params_.number_of_images > 0) {
+    image_files = GetFirstN(image_files, params_.number_of_images);
+    image_labels = GetFirstN(image_labels, params_.number_of_images);
+  }
+
+  for (Observer* observer : observers_) {
+    observer->OnEvaluationStart(image_files.size());
+  }
+
+  for (int i = 0; i < image_files.size(); i++) {
+    TF_CHECK_OK(eval_pipeline->Run(CreateStringTensor(image_files[i]),
+                                   CreateStringTensor(image_labels[i])));
+    auto stats = eval.GetTopKAccuracySoFar();
+
+    for (Observer* observer : observers_) {
+      observer->OnSingleImageEvaluationComplete(stats, image_files[i]);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace metrics
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
new file mode 100644
index 0000000000..5f42b2a50e
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
@@ -0,0 +1,113 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace metrics {
+
+// Evaluates models accuracy for ILSVRC dataset.
+//
+// Generates the top-1, top-k accuracy counts where k is
+// controlled by |num_ranks|.
+// Usage:
+// ModelInfo model_info = ..
+// ImagenetModelEvaluator::Params params;
+// .. set params to image, label, output label and model file path..
+// SomeObserver observer;
+// ImagenetModelEvaluator evaluator(model_info, params);
+// evaluator.AddObserver(&observer);
+// TF_CHECK_OK(evaluator.EvaluateModel());
+class ImagenetModelEvaluator {
+ public:
+  struct Params {
+    // Path to ground truth images.
+    string ground_truth_images_path;
+
+    // Path to labels file for ground truth image.
+    // This file should be generated with the scripts.
+    string ground_truth_labels_path;
+
+    // This is word labels generated by the model. The category
+    // indices of output probabilities generated by the model maybe different
+    // from the indices in the imagenet dataset.
+    string model_output_labels_path;
+
+    // Path to the model file.
+    string model_file_path;
+
+    // The maximum number of images to calculate accuracy.
+    // 0 means all images, a positive number means only the specified
+    // number of images.
+    int number_of_images = 0;
+
+    // Number of ranks, top K.
+    int num_ranks = 10;
+  };
+
+  // An evaluation observer.
+  class Observer {
+   public:
+    Observer() = default;
+    Observer(const Observer&) = delete;
+    Observer& operator=(const Observer&) = delete;
+
+    Observer(const Observer&&) = delete;
+    Observer& operator=(const Observer&&) = delete;
+
+    // Called on start of evaluation.
+    virtual void OnEvaluationStart(int total_number_of_images) = 0;
+
+    // Called when evaluation was complete for `image`.
+    virtual void OnSingleImageEvaluationComplete(
+        const ImagenetTopKAccuracy::AccuracyStats& stats,
+        const string& image) = 0;
+
+    virtual ~Observer() = default;
+  };
+
+  ImagenetModelEvaluator(const utils::ModelInfo& model_info,
+                         const Params& params)
+      : model_info_(model_info), params_(params) {}
+
+  // Factory method to create the evaluator by parsing command line arguments.
+  static Status Create(int argc, char* argv[],
+                       std::unique_ptr<ImagenetModelEvaluator>* evaluator);
+
+  // Adds an observer that can observe evaluation events..
+  void AddObserver(Observer* observer) { observers_.push_back(observer); }
+
+  const Params& params() { return params_; }
+
+  // Evaluates the provided model over the dataset.
+  Status EvaluateModel();
+
+ private:
+  std::vector<Observer*> observers_;
+  const utils::ModelInfo model_info_;
+  const Params params_;
+};
+
+}  // namespace metrics
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
new file mode 100644
index 0000000000..d46075d234
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
@@ -0,0 +1,107 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+
+#include <numeric>
+
+namespace {
+constexpr int kNumCategories = 1001;
+std::vector<int> GetTopK(const std::vector<float>& values, int k) {
+  CHECK_LE(k, values.size());
+  std::vector<int> indices(values.size());
+
+  std::iota(indices.begin(), indices.end(), 0);
+  std::sort(indices.begin(), indices.end(),
+            [&values](int a, int b) { return values[a] > values[b]; });
+
+  indices.resize(k);
+  return indices;
+}
+}  // namespace
+
+namespace tensorflow {
+namespace metrics {
+ImagenetTopKAccuracy::ImagenetTopKAccuracy(
+    const std::vector<string>& ground_truth_labels, int k)
+    : ground_truth_labels_(ground_truth_labels),
+      k_(k),
+      accuracy_counts_(k_, 0),
+      num_samples_(0) {
+  CHECK_EQ(kNumCategories, ground_truth_labels.size());
+}
+
+Status ImagenetTopKAccuracy::ComputeEval(
+    const std::vector<Tensor>& model_outputs, const Tensor& ground_truth) {
+  if (model_outputs.size() != 1) {
+    return errors::InvalidArgument("Invalid model output: ",
+                                   model_outputs.size());
+  }
+  const Tensor& output = model_outputs[0];
+  if (!output.shape().IsSameSize({1, kNumCategories})) {
+    return errors::InvalidArgument("Invalid shape of model output: ",
+                                   output.shape().DebugString());
+  }
+  if (ground_truth.dtype() != DT_STRING && ground_truth.dims() != 0) {
+    return errors::InvalidArgument("Invalid ground truth type: ",
+                                   ground_truth.DebugString());
+  }
+  string ground_truth_label = ground_truth.scalar<string>()();
+
+  std::vector<float> probabilities;
+  probabilities.reserve(kNumCategories);
+  if (output.dtype() == DT_FLOAT) {
+    auto probs = output.flat<float>();
+    for (size_t i = 0; i < probs.size(); i++) {
+      probabilities.push_back(probs(i));
+    }
+  } else {
+    auto probs = output.flat<uint8>();
+    for (size_t i = 0; i < probs.size(); i++) {
+      probabilities.push_back(probs(i));
+    }
+  }
+
+  CHECK_EQ(kNumCategories, probabilities.size());
+  std::vector<int> topK = GetTopK(probabilities, k_);
+  int ground_truth_index = GroundTruthIndex(ground_truth_label);
+  for (size_t i = 0; i < topK.size(); ++i) {
+    if (ground_truth_index == topK[i]) {
+      for (size_t j = i; j < topK.size(); j++) {
+        accuracy_counts_[j] += 1;
+      }
+      break;
+    }
+  }
+  num_samples_++;
+  return Status::OK();
+}
+
+const ImagenetTopKAccuracy::AccuracyStats
+ImagenetTopKAccuracy::GetTopKAccuracySoFar() const {
+  AccuracyStats stats;
+  stats.number_of_images = num_samples_;
+  stats.topk_counts = accuracy_counts_;
+  return stats;
+}
+
+int ImagenetTopKAccuracy::GroundTruthIndex(const string& label) const {
+  auto index = std::find(ground_truth_labels_.cbegin(),
+                         ground_truth_labels_.cend(), label);
+  CHECK(index != ground_truth_labels_.end()) << "Invalid label: " << label;
+  return std::distance(ground_truth_labels_.cbegin(), index);
+}
+}  //  namespace metrics
+}  //  namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
new file mode 100644
index 0000000000..5a575ff244
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+namespace metrics {
+// An |AccuracyEval| stage that calculates the top K error rate for model
+// evaluations on imagenet like datasets.
+// Inputs: A {1, 1001} shaped tensor that contains the probabilities for objects
+// predicted by the model.
+// Ground truth: A |string| label for the image.
+// From the input object probabilities, the stage computes the predicted labels
+// and finds the top K error rates by comparing the predictions with ground
+// truths.
+class ImagenetTopKAccuracy : public AccuracyEval {
+ public:
+  // Accuracy statistics.
+  struct AccuracyStats {
+    // Number of images evaluated.
+    int number_of_images;
+    // A vector of size |k| that contains the number of images
+    // that have correct labels in top K.
+    // E.g. topk_counts[0] contains number of images for which
+    // model returned the correct label as the first result.
+    // Similarly topk_counts[4] contains the number of images for which
+    // model returned the correct label in top 5 results.
+    // This can be used to compute the top K error-rate for the model.
+    std::vector<int> topk_counts;
+  };
+
+  // Creates a new instance of |ImagenetTopKAccuracy| with the given
+  // |ground_truth_labels| and |k|.
+  // Args:
+  // |ground_truth_labels| : an ordered vector of labels for images. This is
+  // used to compute the index for the predicted labels and ground_truth label.
+  ImagenetTopKAccuracy(const std::vector<string>& ground_truth_labels, int k);
+
+  // Computes accuracy for a given  image. The |model_outputs| should
+  // be a vector containing exactly one Tensor of shape: {1, 1001} where each
+  // item is a probability of the predicted object representing the image as
+  // output by the model.
+  // Uses |ground_truth_labels| to compute the index of |model_outputs| and
+  // |ground_truth| and computes the top K error rate.
+  Status ComputeEval(const std::vector<Tensor>& model_outputs,
+                     const Tensor& ground_truth) override;
+
+  // Gets the topK accuracy for images that have been evaluated till now.
+  const AccuracyStats GetTopKAccuracySoFar() const;
+
+ private:
+  int GroundTruthIndex(const string& label) const;
+  std::vector<string> ground_truth_labels_;
+  const int k_;
+  std::vector<int> accuracy_counts_;
+  int num_samples_;
+};
+}  //  namespace metrics
+}  //  namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
new file mode 100644
index 0000000000..ff332af5c5
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
@@ -0,0 +1,151 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include <gtest/gtest.h>
+
+namespace tensorflow {
+namespace metrics {
+namespace {
+
+const int kNumCategories = 1001;
+
+Tensor CreateStringTensor(const string& value) {
+  Tensor tensor(DT_STRING, TensorShape({}));
+  tensor.scalar<string>()() = value;
+  return tensor;
+}
+
+Tensor CreateOutputTensor() {
+  Tensor tensor(DT_FLOAT, TensorShape({1, kNumCategories}));
+  for (int i = 0; i < kNumCategories; i++) {
+    tensor.flat<float>()(i) = 0;
+  }
+  return tensor;
+}
+
+std::vector<string> CreateGroundTruth() {
+  std::vector<string> ground_truth;
+  ground_truth.reserve(kNumCategories);
+  for (int i = 0; i < kNumCategories; i++) {
+    string category;
+    strings::StrAppend(&category, i);
+    ground_truth.push_back(category);
+  }
+  return ground_truth;
+}
+
+TEST(ImagenetTopKAccuracy, AllCorrect) {
+  ImagenetTopKAccuracy acc_top_5(CreateGroundTruth(), 5);
+  auto accuracies = acc_top_5.GetTopKAccuracySoFar();
+  EXPECT_EQ(0, accuracies.number_of_images);
+  EXPECT_EQ(5, accuracies.topk_counts.size());
+
+  for (int i : accuracies.topk_counts) {
+    EXPECT_EQ(0, i);
+  }
+  // First image was correctly identified as "0".
+  Tensor tensor = CreateOutputTensor();
+  tensor.flat<float>()(0) = 0.8;
+
+  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("0")));
+  accuracies = acc_top_5.GetTopKAccuracySoFar();
+  EXPECT_EQ(1, accuracies.number_of_images);
+
+  for (int i : accuracies.topk_counts) {
+    EXPECT_EQ(1, i);
+  }
+  tensor.flat<float>()(1) = 0.9;
+  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("1")));
+  accuracies = acc_top_5.GetTopKAccuracySoFar();
+  EXPECT_EQ(2, accuracies.number_of_images);
+
+  for (int i : accuracies.topk_counts) {
+    EXPECT_EQ(2, i);
+  }
+}
+
+TEST(ImagenetTopKAccuracy, Top5) {
+  ImagenetTopKAccuracy acc_top_5(CreateGroundTruth(), 5);
+  auto accuracies = acc_top_5.GetTopKAccuracySoFar();
+  EXPECT_EQ(0, accuracies.number_of_images);
+  EXPECT_EQ(5, accuracies.topk_counts.size());
+
+  // For first image, with ground truth "0" probabilities were
+  // 0.5 for "0",
+  // "0.6" for 1,
+  // "0.7" for 2,
+  // "0.8" for 3,
+  // "0.9" for 4.
+  // remaining all zeroes.
+
+  // First image was correctly identified as "0".
+  Tensor tensor = CreateOutputTensor();
+  tensor.flat<float>()(0) = 0.5;
+  tensor.flat<float>()(1) = 0.6;
+  tensor.flat<float>()(2) = 0.7;
+  tensor.flat<float>()(3) = 0.8;
+  tensor.flat<float>()(4) = 0.9;
+
+  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("0")));
+  accuracies = acc_top_5.GetTopKAccuracySoFar();
+  EXPECT_EQ(1, accuracies.number_of_images);
+  EXPECT_EQ(1, accuracies.topk_counts[4]);
+
+  for (int i = 0; i < 4; i++) {
+    EXPECT_EQ(0, accuracies.topk_counts[i]);
+  }
+
+  // Now for "1" only last two buckets are going to be affected.
+  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("1")));
+  accuracies = acc_top_5.GetTopKAccuracySoFar();
+  EXPECT_EQ(2, accuracies.number_of_images);
+  EXPECT_EQ(1, accuracies.topk_counts[3]);
+  EXPECT_EQ(2, accuracies.topk_counts[4]);
+  for (int i = 0; i < 3; i++) {
+    EXPECT_EQ(0, accuracies.topk_counts[i]);
+  }
+
+  // All buckets will be affected.
+  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("4")));
+  accuracies = acc_top_5.GetTopKAccuracySoFar();
+  EXPECT_EQ(3, accuracies.number_of_images);
+  EXPECT_EQ(1, accuracies.topk_counts[0]);
+  EXPECT_EQ(1, accuracies.topk_counts[1]);
+  EXPECT_EQ(1, accuracies.topk_counts[2]);
+  EXPECT_EQ(2, accuracies.topk_counts[3]);
+  EXPECT_EQ(3, accuracies.topk_counts[4]);
+
+  // No buckets will be affected
+  TF_CHECK_OK(acc_top_5.ComputeEval({tensor}, CreateStringTensor("10")));
+  accuracies = acc_top_5.GetTopKAccuracySoFar();
+  EXPECT_EQ(4, accuracies.number_of_images);
+  EXPECT_EQ(1, accuracies.topk_counts[0]);
+  EXPECT_EQ(1, accuracies.topk_counts[1]);
+  EXPECT_EQ(1, accuracies.topk_counts[2]);
+  EXPECT_EQ(2, accuracies.topk_counts[3]);
+  EXPECT_EQ(3, accuracies.topk_counts[4]);
+}
+
+}  // namespace
+
+}  // namespace metrics
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
new file mode 100644
index 0000000000..7512b39c32
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
+
+#include <memory>
+
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace metrics {
+
+namespace {
+void CentralCropImage(const Scope& s, const tensorflow::Output& decoded_image,
+                      double crop_fraction, tensorflow::Output* cropped_image) {
+  auto image_dims = ops::Slice(s, ops::Shape(s, decoded_image), {0}, {2});
+  auto height_width = ops::Cast(s, image_dims, DT_DOUBLE);
+  auto cropped_begin = ops::Div(
+      s, ops::Sub(s, height_width, ops::Mul(s, height_width, crop_fraction)),
+      2.0);
+  auto bbox_begin = ops::Cast(s, cropped_begin, DT_INT32);
+  auto bbox_size = ops::Sub(s, image_dims, ops::Mul(s, bbox_begin, 2));
+  auto slice_begin = ops::Concat(s, {bbox_begin, Input({0})}, 0);
+  auto slice_size = ops::Concat(s, {bbox_size, {-1}}, 0);
+  *cropped_image = ops::Slice(s, decoded_image, slice_begin, slice_size);
+}
+
+}  // namespace
+
+void InceptionPreprocessingStage::AddToGraph(const Scope& scope,
+                                             const Input& input) {
+  if (!scope.ok()) return;
+  Scope s = scope.WithOpName(name());
+  ops::DecodeJpeg::Attrs attrs;
+  attrs.channels_ = 3;
+  auto decoded_jpeg = ops::DecodeJpeg(s, input, attrs);
+  tensorflow::Output cropped_image;
+  CentralCropImage(s, decoded_jpeg, params_.cropping_fraction, &cropped_image);
+  auto dims_expander = ops::ExpandDims(s, cropped_image, 0);
+  auto resized_image = ops::ResizeBilinear(
+      s, dims_expander,
+      ops::Const(s.WithOpName("size"), {image_height_, image_width_}));
+  if (is_quantized_) {
+    this->stage_output_ =
+        ops::Cast(s.WithOpName(output_name()), resized_image, DT_UINT8);
+  } else {
+    auto squeezed_image = ops::Squeeze(s, resized_image);
+    auto normalized_image =
+        ops::Div(s,
+                 ops::Sub(s, squeezed_image,
+                          {params_.input_means[0], params_.input_means[1],
+                           params_.input_means[2]}),
+                 {params_.scale});
+    this->stage_output_ =
+        ops::ExpandDims(s.WithOpName(output_name()), normalized_image, {0});
+  }
+}
+
+}  // namespace metrics
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
new file mode 100644
index 0000000000..15df719817
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
@@ -0,0 +1,75 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
+
+#include <utility>
+
+#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace metrics {
+
+// A stage that does inception preprocessing.
+// Inputs: A tensor containing bytes of a JPEG image.
+// Outputs: A tensor containing rescaled and preprocessed image that has
+// shape {1, image_height, image_width, 3}, where 3 is the number of channels.
+class InceptionPreprocessingStage : public Stage {
+ public:
+  struct Params {
+    std::vector<float> input_means;
+    float scale;
+    double cropping_fraction;
+  };
+
+  static Params DefaultParams() {
+    return {.input_means = {127.5, 127.5, 127.5},
+            .scale = 127.5,
+            .cropping_fraction = 0.875};
+  }
+
+  // Creates a new preprocessing stage object with provided |image_width|
+  // |image_height| as the size of output image.
+  // If |is_quantized| is set to true then |params| is ignored since quantized
+  // images don't go through any preprocessing.
+  InceptionPreprocessingStage(int image_width, int image_height,
+                              bool is_quantized,
+                              Params params = DefaultParams())
+      : image_width_(image_width),
+        image_height_(image_height),
+        is_quantized_(is_quantized),
+        params_(std::move(params)) {}
+
+  string name() const override { return "stage_inception_preprocess"; }
+  string output_name() const override {
+    return "stage_inception_preprocess_output";
+  }
+
+  void AddToGraph(const Scope& scope, const Input& input) override;
+
+ private:
+  int image_width_;
+  int image_height_;
+  bool is_quantized_;
+  Params params_;
+};
+
+}  // namespace metrics
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
new file mode 100644
index 0000000000..3587878ba3
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
@@ -0,0 +1,123 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <fstream>
+#include <string>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace {
+tensorflow::string* g_test_image_file = nullptr;
+}  // namespace
+
+namespace tensorflow {
+namespace metrics {
+
+namespace {
+
+using tensorflow::Status;
+using tensorflow::Tensor;
+
+Status GetContents(const string& filename, string* output) {
+  std::ifstream input(filename, std::ios::binary);
+  const int kBufferSize = 2048;
+  char buffer[kBufferSize];
+  while (true) {
+    input.read(buffer, kBufferSize);
+    output->append(buffer, input.gcount());
+    if (!input.good()) {
+      if (input.eof()) return Status::OK();
+      return Status(tensorflow::error::ABORTED, "Failed to read file.");
+    }
+  }
+}
+
+TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
+  ASSERT_TRUE(g_test_image_file != nullptr);
+  string image_contents;
+  string image_path = *g_test_image_file;
+  auto status = GetContents(image_path, &image_contents);
+  ASSERT_TRUE(status.ok()) << status.error_message();
+  const int width = 224;
+  const int height = 224;
+  const bool is_quantized = true;
+  InceptionPreprocessingStage preprocess_stage(width, height, is_quantized);
+  Scope scope = Scope::NewRootScope();
+  preprocess_stage.AddToGraph(scope, image_contents);
+  TF_CHECK_OK(scope.status());
+
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+  std::vector<Tensor> outputs;
+  auto run_status =
+      session->Run({},                                   /*inputs*/
+                   {preprocess_stage.output_name()}, {}, /*target node names */
+                   &outputs);
+  TF_CHECK_OK(run_status);
+  EXPECT_EQ(1, outputs.size());
+  EXPECT_EQ(DT_UINT8, outputs[0].dtype());
+  EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
+}
+
+TEST(InceptionPreprocessingTest, TestImagePreprocessFloat) {
+  ASSERT_TRUE(g_test_image_file != nullptr);
+  string image_contents;
+  string image_path = *g_test_image_file;
+  auto status = GetContents(image_path, &image_contents);
+  ASSERT_TRUE(status.ok()) << status.error_message();
+  const int width = 224;
+  const int height = 224;
+  const bool is_quantized = false;
+  InceptionPreprocessingStage preprocess_stage(width, height, is_quantized);
+  Scope scope = Scope::NewRootScope();
+  preprocess_stage.AddToGraph(scope, image_contents);
+  TF_CHECK_OK(scope.status());
+
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+  std::vector<Tensor> outputs;
+  auto run_status =
+      session->Run({},                                   /*inputs*/
+                   {preprocess_stage.output_name()}, {}, /*target node names */
+                   &outputs);
+  TF_CHECK_OK(run_status);
+  EXPECT_EQ(1, outputs.size());
+  EXPECT_EQ(DT_FLOAT, outputs[0].dtype());
+  EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
+}
+
+}  // namespace
+}  // namespace metrics
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  g_test_image_file = new tensorflow::string();
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_image", g_test_image_file,
+                       "Path to image file for test."),
+  };
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  CHECK(parse_result) << "Required test_model_file";
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg
new file mode 100644
index 0000000000..d2a427810f
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg
diff --git a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op.cc b/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op.cc
new file mode 100644
index 0000000000..da4258f1c1
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op.cc
@@ -0,0 +1,158 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
+#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+namespace {
+Status ValidateInputsMatch(const OpInputList& input_tensors,
+                           const tflite::Interpreter& interpreter) {
+  std::vector<int> tflite_tensor_indices = interpreter.inputs();
+  if (tflite_tensor_indices.size() != input_tensors.size()) {
+    return errors::InvalidArgument(
+        "size mismatch, interpreter size: ", tflite_tensor_indices.size(),
+        " actual: ", input_tensors.size());
+  }
+
+  for (int i = 0; i < input_tensors.size(); i++) {
+    const TfLiteTensor* tflite_tensor =
+        interpreter.tensor(tflite_tensor_indices[i]);
+    if (tflite_tensor == nullptr) {
+      return errors::InvalidArgument("Tensor is null at index: ", i);
+    }
+
+    const Tensor& tensor = input_tensors[i];
+    auto i_type = metrics::utils::GetTFDataType(tflite_tensor->type);
+    auto i_shape = metrics::utils::GetTFLiteTensorShape(*tflite_tensor);
+    if (i_type != tensor.dtype()) {
+      return errors::InvalidArgument("Data types mismatch for tensors: ", i,
+                                     " expected: ", i_type,
+                                     " got: ", tensor.dtype());
+    }
+
+    if (i_shape != tensor.shape()) {
+      return errors::InvalidArgument("Data shapes mismatch for tensors: ", i,
+                                     " expected: ", i_shape,
+                                     " got: ", tensor.shape());
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace
+
+class RunTFLiteModelOp : public OpKernel {
+ public:
+  explicit RunTFLiteModelOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    string model_file_path;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("model_file_path", &model_file_path));
+    model_ = tflite::FlatBufferModel::BuildFromFile(model_file_path.data());
+    OP_REQUIRES(ctx, model_,
+                errors::InvalidArgument(
+                    "Model loading failed. Invalid model file path: ",
+                    model_file_path));
+    tflite::ops::builtin::BuiltinOpResolver resolver;
+
+    tflite::InterpreterBuilder(*model_, resolver)(&interpreter_);
+    OP_REQUIRES(ctx, interpreter_,
+                errors::Internal("Interpreter creation failed."));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    OpInputList input_tensors;
+    OP_REQUIRES_OK(context, context->input_list("model_input", &input_tensors));
+
+    OP_REQUIRES_OK(context, ValidateInputsMatch(input_tensors, *interpreter_));
+    OpOutputList output_tensors;
+    OP_REQUIRES_OK(context,
+                   context->output_list("model_output", &output_tensors));
+    auto tfl_outputs = interpreter_->outputs();
+    OP_REQUIRES(context, output_tensors.size() == tfl_outputs.size(),
+                errors::InvalidArgument(
+                    "Invalid output size, expected: ", tfl_outputs.size(),
+                    " got: ", output_tensors.size()));
+    for (int i = 0; i < output_tensors.size(); i++) {
+      DataType tfl_type = metrics::utils::GetTFDataType(
+          interpreter_->tensor(tfl_outputs[i])->type);
+      DataType otype = output_tensors.expected_output_dtype(i);
+      OP_REQUIRES(
+          context, tfl_type == otype,
+          errors::InvalidArgument("Invalid data type for output at index: ", i,
+                                  " expected: ", tfl_type, " got: ", otype));
+    }
+
+    auto allocation_status = interpreter_->AllocateTensors();
+    OP_REQUIRES(context, allocation_status == kTfLiteOk,
+                errors::Internal("Unable to allocate tensors."));
+    for (int i = 0; i < input_tensors.size(); i++) {
+      const int tfl_index = interpreter_->inputs()[i];
+      TfLiteTensor* tflite_tensor = interpreter_->tensor(tfl_index);
+      auto tensor_bytes = input_tensors[i].tensor_data();
+      OP_REQUIRES(context, tflite_tensor->bytes == tensor_bytes.size(),
+                  errors::InvalidArgument(
+                      "Size mismatch, expected: ", tflite_tensor->bytes,
+                      " got: ", tensor_bytes.size()));
+      std::memcpy(tflite_tensor->data.raw, tensor_bytes.data(),
+                  tensor_bytes.size());
+    }
+    auto invocation_status = interpreter_->Invoke();
+    OP_REQUIRES(context, invocation_status == kTfLiteOk,
+                errors::Internal("Interpreter invocation failed."));
+    for (int i = 0; i < output_tensors.size(); i++) {
+      auto tfl_tensor = interpreter_->tensor(tfl_outputs[i]);
+      TensorShape shape = metrics::utils::GetTFLiteTensorShape(*tfl_tensor);
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context, output_tensors.allocate(i, shape, &output));
+      auto tensor_bytes = output->tensor_data();
+      OP_REQUIRES(context, tensor_bytes.size() == tfl_tensor->bytes,
+                  errors::Internal("Invalid size"));
+      std::memcpy(const_cast<char*>(tensor_bytes.data()), tfl_tensor->data.raw,
+                  tfl_tensor->bytes);
+    }
+  }
+
+ private:
+  std::unique_ptr<tflite::FlatBufferModel> model_;
+  std::unique_ptr<tflite::Interpreter> interpreter_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RunTFLiteModel").Device(DEVICE_CPU),
+                        RunTFLiteModelOp);
+
+REGISTER_OP("RunTFLiteModel")
+    .Input("model_input: input_type")
+    .Output("model_output: output_type")
+    .Attr("model_file_path: string")
+    .Attr("input_type : list(type)")
+    .Attr("output_type: list(type)")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // TODO(shashishekhar): Infer the correct shape based on output_type and
+      // maybe another attribute.
+      return shape_inference::UnknownShape(c);
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op_test.cc b/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op_test.cc
new file mode 100644
index 0000000000..88175984a0
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op_test.cc
@@ -0,0 +1,200 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace {
+tensorflow::string* g_test_model_file = nullptr;
+}
+
+namespace tensorflow {
+namespace {
+
+TEST(RunTfliteModelOpTest, ModelIsRun) {
+  ASSERT_TRUE(g_test_model_file != nullptr);
+  string test_model_file = *g_test_model_file;
+  ASSERT_FALSE(test_model_file.empty());
+
+  Scope scope = Scope::NewRootScope();
+  TF_CHECK_OK(scope.status());
+  // Passed graph has 4 inputs : a,b,c,d and 2 outputs x,y
+  //  x = a+b+c, y=b+c+d
+
+  std::vector<Input> graph_inputs = {
+      ops::Const(scope, 1.0f, {1, 8, 8, 3}),  // a
+      ops::Const(scope, 2.1f, {1, 8, 8, 3}),  // b
+      ops::Const(scope, 3.2f, {1, 8, 8, 3}),  // c
+      ops::Const(scope, 4.3f, {1, 8, 8, 3}),  // d
+  };
+
+  std::vector<NodeBuilder::NodeOut> input_data;
+  std::transform(graph_inputs.begin(), graph_inputs.end(),
+                 std::back_inserter(input_data), [&scope](Input model_input) {
+                   return ops::AsNodeOut(scope, model_input);
+                 });
+
+  std::vector<DataType> model_input_type = {DT_FLOAT, DT_FLOAT, DT_FLOAT,
+                                            DT_FLOAT};
+  ::tensorflow::Node* ret;
+  auto builder = ::tensorflow::NodeBuilder("run_model_op", "RunTFLiteModel")
+                     .Input(input_data)
+                     .Attr("model_file_path", test_model_file)
+                     .Attr("input_type", model_input_type)
+                     .Attr("output_type", {DT_FLOAT, DT_FLOAT});
+
+  scope.UpdateBuilder(&builder);
+  scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
+  TF_CHECK_OK(scope.status());
+
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+
+  std::vector<Tensor> outputs;
+  TF_CHECK_OK(
+      session->Run({}, {"run_model_op:0", "run_model_op:1"}, {}, &outputs));
+  EXPECT_EQ(2, outputs.size());
+
+  for (const auto& tensor : outputs) {
+    EXPECT_TRUE(tensor.shape().IsSameSize({1, 8, 8, 3}));
+  }
+  auto output_x = outputs[0].flat<float>();
+  auto output_y = outputs[1].flat<float>();
+  EXPECT_EQ(1 * 8 * 8 * 3, output_x.size());
+  EXPECT_EQ(1 * 8 * 8 * 3, output_y.size());
+  for (int i = 0; i < output_x.size(); i++) {
+    EXPECT_NEAR(6.3f, output_x(i), 1e-6f);  // a+b+c
+    EXPECT_NEAR(9.6f, output_y(i), 1e-6f);  // b+c+d
+  }
+}
+
+TEST(RunTfliteModelOpTest, NumInputsMismatch) {
+  ASSERT_TRUE(g_test_model_file != nullptr);
+  string test_model_file = *g_test_model_file;
+  ASSERT_FALSE(test_model_file.empty());
+
+  Scope scope = Scope::NewRootScope();
+  TF_CHECK_OK(scope.status());
+  // Passed graph has 4 inputs : a,b,c,d and 2 outputs x,y
+  //  x = a+b+c, y=b+c+d
+  //  Remove a from input.
+
+  std::vector<Input> graph_inputs = {
+      ops::Const(scope, 2.1f, {1, 8, 8, 3}),  // b
+      ops::Const(scope, 3.2f, {1, 8, 8, 3}),  // c
+      ops::Const(scope, 4.3f, {1, 8, 8, 3}),  // d
+  };
+
+  std::vector<NodeBuilder::NodeOut> input_data;
+  std::transform(graph_inputs.begin(), graph_inputs.end(),
+                 std::back_inserter(input_data), [&scope](Input model_input) {
+                   return ops::AsNodeOut(scope, model_input);
+                 });
+
+  std::vector<DataType> model_input_type = {DT_FLOAT, DT_FLOAT, DT_FLOAT};
+
+  ::tensorflow::Node* ret;
+  auto builder = ::tensorflow::NodeBuilder("run_model_op", "RunTFLiteModel")
+                     .Input(input_data)
+                     .Attr("model_file_path", test_model_file)
+                     .Attr("input_type", model_input_type)
+                     .Attr("output_type", {DT_FLOAT, DT_FLOAT});
+
+  scope.UpdateBuilder(&builder);
+  scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
+  TF_CHECK_OK(scope.status());
+
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+
+  std::vector<Tensor> outputs;
+  auto status =
+      (session->Run({}, {"run_model_op:0", "run_model_op:1"}, {}, &outputs));
+  EXPECT_FALSE(status.ok());
+}
+
+TEST(RunTfliteModelOpTest, InputSizesMismatch) {
+  ASSERT_TRUE(g_test_model_file != nullptr);
+  string test_model_file = *g_test_model_file;
+  ASSERT_FALSE(test_model_file.empty());
+
+  Scope scope = Scope::NewRootScope();
+  TF_CHECK_OK(scope.status());
+  // Passed graph has 4 inputs : a,b,c,d and 2 outputs x,y
+  //  x = a+b+c, y=b+c+d
+  // Set a to be invalid size.
+  std::vector<Input> graph_inputs = {
+      ops::Const(scope, 1.0f, {1, 8, 8, 4}),  // a invalid size,
+      ops::Const(scope, 2.1f, {1, 8, 8, 3}),  // b
+      ops::Const(scope, 3.2f, {1, 8, 8, 3}),  // c
+      ops::Const(scope, 4.3f, {1, 8, 8, 3}),  // d
+  };
+
+  std::vector<NodeBuilder::NodeOut> input_data;
+  std::transform(graph_inputs.begin(), graph_inputs.end(),
+                 std::back_inserter(input_data), [&scope](Input model_input) {
+                   return ops::AsNodeOut(scope, model_input);
+                 });
+
+  std::vector<DataType> model_input_type = {DT_FLOAT, DT_FLOAT, DT_FLOAT,
+                                            DT_FLOAT};
+  ::tensorflow::Node* ret;
+  auto builder = ::tensorflow::NodeBuilder("run_model_op", "RunTFLiteModel")
+                     .Input(input_data)
+                     .Attr("model_file_path", test_model_file)
+                     .Attr("input_type", model_input_type)
+                     .Attr("output_type", {DT_FLOAT, DT_FLOAT});
+
+  scope.UpdateBuilder(&builder);
+  scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
+  TF_CHECK_OK(scope.status());
+
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+
+  std::vector<Tensor> outputs;
+  auto status =
+      (session->Run({}, {"run_model_op:0", "run_model_op:1"}, {}, &outputs));
+  EXPECT_FALSE(status.ok());
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  g_test_model_file = new tensorflow::string();
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", g_test_model_file,
+                       "Path to test tflite model file."),
+  };
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  CHECK(parse_result) << "Required test_model_file";
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.cc b/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.cc
new file mode 100644
index 0000000000..c96795d499
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h"
+
+#include <vector>
+
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+
+namespace tensorflow {
+namespace metrics {
+void RunTFLiteModelStage::AddToGraph(const Scope& scope, const Input& input) {
+  if (!scope.ok()) return;
+  Scope s = scope.WithOpName(name());
+
+  std::vector<NodeBuilder::NodeOut> _data = {ops::AsNodeOut(s, input)};
+  ::tensorflow::Node* ret;
+  auto builder = NodeBuilder(output_name(), "RunTFLiteModel")
+                     .Input(_data)
+                     .Attr("model_file_path", params_.model_file_path)
+                     .Attr("input_type", params_.input_type)
+                     .Attr("output_type", params_.output_type);
+
+  s.UpdateBuilder(&builder);
+  s.UpdateStatus(builder.Finalize(s.graph(), &ret));
+  if (!s.ok()) return;
+  s.UpdateStatus(s.DoShapeInference(ret));
+  this->stage_output_ = ::tensorflow::Output(ret, 0);
+}
+
+}  //  namespace metrics
+}  //  namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h b/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h
new file mode 100644
index 0000000000..90d12d6f42
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
+
+#include <string>
+
+#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+
+namespace tensorflow {
+namespace metrics {
+// Stage that loads and runs a TFLite model.
+// Inputs: The input to TFLite model.
+// Outputs: The output of running the TFLite model.
+class RunTFLiteModelStage : public Stage {
+ public:
+  // The parameters for the stage.
+  struct Params {
+    string model_file_path;
+    std::vector<TensorShape> output_shape;
+    std::vector<DataType> input_type;
+    std::vector<DataType> output_type;
+  };
+
+  explicit RunTFLiteModelStage(const Params& params) : params_(params) {}
+
+  string name() const override { return "stage_run_tfl_model"; }
+  // TODO(shashishekhar): This stage can have multiple inputs and
+  // outputs, perhaps change the definition of stage.
+  string output_name() const override { return "stage_run_tfl_model_output"; }
+
+  void AddToGraph(const Scope& scope, const Input& input) override;
+
+ private:
+  Params params_;
+};
+
+}  //  namespace metrics
+}  //  namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/stage.h b/tensorflow/contrib/lite/tools/accuracy/stage.h
new file mode 100644
index 0000000000..8292ea2ec7
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/stage.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_STAGE_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_STAGE_H_
+
+#include "tensorflow/cc/framework/scope.h"
+
+namespace tensorflow {
+namespace metrics {
+
+// A stage in an evaluation pipeline.
+// Each stage adds a subgraph to the pipeline. Stages can be chained
+// together.
+class Stage {
+ public:
+  Stage() = default;
+  Stage(const Stage&) = delete;
+  Stage& operator=(const Stage&) = delete;
+
+  Stage(const Stage&&) = delete;
+  Stage& operator=(const Stage&&) = delete;
+
+  // Adds a subgraph to given scope that takes in `input` as a parameter.
+  virtual void AddToGraph(const Scope& scope, const Input& input) = 0;
+  virtual ~Stage() {}
+
+  // The name of the stage.
+  // Can be used by derived classes for naming the subscope for the stage
+  // graph.
+  virtual string name() const = 0;
+
+  // The name of the output for the stage.
+  virtual string output_name() const = 0;
+
+  const ::tensorflow::Output& Output() const { return stage_output_; }
+
+ protected:
+  ::tensorflow::Output stage_output_;
+};
+}  //  namespace metrics
+}  //  namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_STAGE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/utils.cc b/tensorflow/contrib/lite/tools/accuracy/utils.cc
new file mode 100644
index 0000000000..f5493301fc
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/utils.cc
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+
+#include <sys/stat.h>
+
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <string>
+
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
+
+namespace tensorflow {
+namespace metrics {
+
+namespace utils {
+
+DataType GetTFDataType(TfLiteType tflite_type) {
+  switch (tflite_type) {
+    case kTfLiteFloat32:
+      return DT_FLOAT;
+    case kTfLiteUInt8:
+      return DT_UINT8;
+    default:
+      return DT_INVALID;
+  }
+}
+
+TensorShape GetTFLiteTensorShape(const TfLiteTensor& tflite_tensor) {
+  TensorShape shape;
+  for (int i = 0; i < tflite_tensor.dims->size; i++) {
+    shape.AddDim(tflite_tensor.dims->data[i]);
+  }
+  return shape;
+}
+
+Status ReadFileLines(const string& file_path,
+                     std::vector<string>* lines_output) {
+  if (!lines_output) {
+    return errors::InvalidArgument("Invalid output");
+  }
+  std::vector<string> lines;
+  std::ifstream stream(file_path, std::ios_base::in);
+  if (!stream) {
+    return errors::InvalidArgument("Unable to open file: ", file_path);
+  }
+  std::string line;
+  while (std::getline(stream, line)) {
+    lines_output->push_back(line);
+  }
+  return Status::OK();
+}
+
+Status GetTFliteModelInfo(const string& model_file_path,
+                          ModelInfo* model_info) {
+  if (model_file_path.empty()) {
+    return errors::InvalidArgument("Invalid model file.");
+  }
+  struct stat stat_buf;
+  if (stat(model_file_path.c_str(), &stat_buf) != 0) {
+    int error_num = errno;
+    return errors::InvalidArgument("Invalid model file: ", model_file_path,
+                                   std::strerror(error_num));
+  }
+
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  model = tflite::FlatBufferModel::BuildFromFile(model_file_path.data());
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+
+  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  if (!interpreter) {
+    return errors::InvalidArgument("Invalid model", model_file_path);
+  }
+  for (int i : interpreter->inputs()) {
+    TfLiteTensor* tensor = interpreter->tensor(i);
+    model_info->input_shapes.push_back(utils::GetTFLiteTensorShape(*tensor));
+    model_info->input_types.push_back(utils::GetTFDataType(tensor->type));
+  }
+  return Status::OK();
+}
+
+}  // namespace utils
+}  // namespace metrics
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/lite/tools/accuracy/utils.h b/tensorflow/contrib/lite/tools/accuracy/utils.h
new file mode 100644
index 0000000000..37cbad4d51
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/utils.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_UTILS_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace metrics {
+
+namespace utils {
+
+struct ModelInfo {
+  std::vector<TensorShape> input_shapes;
+  std::vector<DataType> input_types;
+};
+
+Status GetTFliteModelInfo(const string& model_file_path, ModelInfo* model_info);
+
+DataType GetTFDataType(TfLiteType tflite_type);
+
+TensorShape GetTFLiteTensorShape(const TfLiteTensor& tflite_tensor);
+
+Status ReadFileLines(const string& file_path,
+                     std::vector<string>* lines_output);
+}  // namespace utils
+}  // namespace metrics
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_UTILS_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/utils_test.cc b/tensorflow/contrib/lite/tools/accuracy/utils_test.cc
new file mode 100644
index 0000000000..727eba21b6
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/accuracy/utils_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace {
+tensorflow::string* g_test_model_file = nullptr;
+}
+
+namespace tensorflow {
+namespace metrics {
+namespace utils {
+namespace {
+
+TEST(UtilsTest, GetTFLiteModelInfoReturnsCorrectly) {
+  ASSERT_TRUE(g_test_model_file != nullptr);
+  string test_model_file = *g_test_model_file;
+  ASSERT_FALSE(test_model_file.empty());
+  // Passed graph has 4 inputs : a,b,c,d and 2 outputs x,y
+  //  x = a+b+c, y=b+c+d
+  // Input and outputs have shape : {1,8,8,3}
+  ModelInfo model_info;
+  auto status = GetTFliteModelInfo(test_model_file, &model_info);
+  TF_CHECK_OK(status);
+  ASSERT_EQ(4, model_info.input_shapes.size());
+  ASSERT_EQ(4, model_info.input_types.size());
+
+  for (int i = 0; i < 4; i++) {
+    const TensorShape& shape = model_info.input_shapes[i];
+    DataType dataType = model_info.input_types[i];
+    EXPECT_TRUE(shape.IsSameSize({1, 8, 8, 3}));
+    EXPECT_EQ(DT_FLOAT, dataType);
+  }
+}
+
+TEST(UtilsTest, GetTFliteModelInfoIncorrectFile) {
+  ModelInfo model_info;
+  auto status = GetTFliteModelInfo("non_existent_file", &model_info);
+  EXPECT_FALSE(status.ok());
+}
+
+}  // namespace
+}  // namespace utils
+}  // namespace metrics
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  g_test_model_file = new tensorflow::string();
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", g_test_model_file,
+                       "Path to test tflite model file."),
+  };
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  CHECK(parse_result) << "Required test_model_file";
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index 2cb07eb6ec..dc97d22401 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -5,8 +5,8 @@ package(default_visibility = [
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
 
 common_copts = ["-Wall"] + tflite_copts()
 
@@ -35,6 +35,25 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "benchmark_model_plus_eager",
+    srcs = [
+        "benchmark_main.cc",
+    ],
+    copts = common_copts + ["-DTFLITE_EXTENDED"],
+    linkopts = tflite_linkopts() + select({
+        "//tensorflow:android": [
+            "-pie",  # Android 5.0 and later supports only PIE
+            "-lm",  # some builtin ops, e.g., tanh, need -lm
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":benchmark_tflite_model_plus_eager_lib",
+        ":logging",
+    ],
+)
+
 cc_test(
     name = "benchmark_test",
     srcs = ["benchmark_test.cc"],
@@ -88,7 +107,25 @@ cc_library(
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
         "//tensorflow/contrib/lite/profiling:profile_summarizer",
-        "//tensorflow/contrib/lite/profiling:profiler",
+    ],
+)
+
+cc_library(
+    name = "benchmark_tflite_model_plus_eager_lib",
+    srcs = [
+        "benchmark_tflite_model.cc",
+        "logging.h",
+    ],
+    hdrs = ["benchmark_tflite_model.h"],
+    copts = common_copts + ["-DTFLITE_EXTENDED"],
+    deps = [
+        ":benchmark_model_lib",
+        ":logging",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/delegates/eager:delegate",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/profiling:profile_summarizer",
     ],
 )
 
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
index 677a1ee68c..cc215a7b7f 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_MODEL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_MODEL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
 
 #include <cmath>
 #include <limits>
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
index 7f97f5d0cd..02039922b4 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -23,6 +23,9 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#ifdef TFLITE_EXTENDED
+#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#endif  // TFLITE_EXTENDED
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/op_resolver.h"
@@ -261,6 +264,16 @@ void BenchmarkTfLiteModel::Init() {
   bool use_nnapi = params_.Get<bool>("use_nnapi");
 
   interpreter->UseNNAPI(use_nnapi);
+
+#ifdef TFLITE_EXTENDED
+  TFLITE_LOG(INFO) << "Instantiating Eager Delegate";
+  delegate_ = EagerDelegate::Create();
+  if (delegate_) {
+    interpreter->ModifyGraphWithDelegate(delegate_.get(),
+                                         /*allow_dynamic_tensors=*/true);
+  }
+#endif  // TFLITE_EXTENDED
+
   auto interpreter_inputs = interpreter->inputs();
 
   if (!inputs.empty()) {
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
index 9931dcbafe..4c4320a998 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_TFLITE_MODEL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_TFLITE_MODEL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
 
 #include <memory>
 #include <string>
 #include <vector>
 
+#ifdef TFLITE_EXTENDED
+#include "tensorflow/contrib/lite/delegates/eager/delegate.h"
+#endif  // TFLITE_EXTENDED
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
 #include "tensorflow/contrib/lite/tools/benchmark/benchmark_model.h"
@@ -52,6 +55,7 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
  public:
   BenchmarkTfLiteModel();
   BenchmarkTfLiteModel(BenchmarkParams params);
+  virtual ~BenchmarkTfLiteModel() {}
 
   std::vector<Flag> GetFlags() override;
   void LogParams() override;
@@ -59,7 +63,6 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   uint64_t ComputeInputBytes() override;
   void Init() override;
   void RunImpl() override;
-  virtual ~BenchmarkTfLiteModel() {}
 
   struct InputLayerInfo {
     std::string name;
@@ -67,6 +70,9 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   };
 
  private:
+#ifdef TFLITE_EXTENDED
+  std::unique_ptr<EagerDelegate> delegate_;
+#endif  // TFLITE_EXTENDED
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
   std::vector<InputLayerInfo> inputs;
diff --git a/tensorflow/contrib/lite/tools/benchmark/command_line_flags.h b/tensorflow/contrib/lite/tools/benchmark/command_line_flags.h
index 2e514ae3ea..6a0affd834 100644
--- a/tensorflow/contrib/lite/tools/benchmark/command_line_flags.h
+++ b/tensorflow/contrib/lite/tools/benchmark/command_line_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_COMMAND_LINE_FLAGS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_COMMAND_LINE_FLAGS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
 
 #include <functional>
 #include <string>
diff --git a/tensorflow/contrib/lite/tools/benchmark/logging.h b/tensorflow/contrib/lite/tools/benchmark/logging.h
index 9e9292e2fe..4045d1e731 100644
--- a/tensorflow/contrib/lite/tools/benchmark/logging.h
+++ b/tensorflow/contrib/lite/tools/benchmark/logging.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_LOGGING_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_LOGGING_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_LOGGING_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_LOGGING_H_
 
 // LOG and CHECK macros for benchmarks.
 
diff --git a/tensorflow/contrib/lite/tools/optimize/BUILD b/tensorflow/contrib/lite/tools/optimize/BUILD
new file mode 100644
index 0000000000..01fbce0ac7
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/optimize/BUILD
@@ -0,0 +1,11 @@
+# TODO(suharshs): Write quantize_weights tests that use small exportable files.
+# Then we can remove this file.
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
diff --git a/tensorflow/contrib/lite/tools/optimize/quantize_weights.cc b/tensorflow/contrib/lite/tools/optimize/quantize_weights.cc
new file mode 100644
index 0000000000..0758514e39
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/optimize/quantize_weights.cc
@@ -0,0 +1,280 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/tools/optimize/quantize_weights.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tflite {
+namespace optimize {
+
+namespace {
+
+// The minimum number of elements a weights array must have to be quantized
+// by this transformation.
+// TODO(suharshs): Make this configurable.
+const int kWeightsMinSize = 1024;
+
+// Nudge min and max so that floating point 0 falls exactly on a quantized
+// value, returning the nudges scale and zero_point.
+//
+// Although this code originates from FakeQuantization in quantized training,
+// we may deviate from that implementation as we please since we do not fine
+// tune the weights with quantized training.
+void GetQuantizationParams(const float min, const float max,
+                           const int quant_min, const int quant_max,
+                           QuantizationParametersT* quantization_params) {
+  // Adjust the boundaries to guarantee 0 is included.
+  const float quant_min_float = std::min(static_cast<float>(quant_min), 0.0f);
+  const float quant_max_float = std::max(static_cast<float>(quant_max), 0.0f);
+  const float scale = (max - min) / (quant_max_float - quant_min_float);
+  const float zero_point_from_min = quant_min_float - min / scale;
+  int64_t zero_point;
+  if (zero_point_from_min < quant_min_float) {
+    zero_point = static_cast<int64_t>(quant_min);
+  } else if (zero_point_from_min > quant_max_float) {
+    zero_point = static_cast<int64_t>(quant_max);
+  } else {
+    zero_point = static_cast<int64_t>(std::round(zero_point_from_min));
+  }
+  quantization_params->scale = {scale};
+  quantization_params->zero_point = {zero_point};
+}
+
+// Returns the number of elements in tensor.
+uint64 NumElements(const TensorT* tensor) {
+  if (tensor->shape.empty()) {
+    LOG(FATAL) << "Tensor has no shape information.";
+  }
+  uint64 num_elements = 1;
+  for (const uint64 dim : tensor->shape) {
+    num_elements *= dim;
+  }
+  return num_elements;
+}
+
+uint64 CountTensorConsumers(const ModelT* model, const SubGraphT* subgraph,
+                            int32_t tensor_idx) {
+  uint64 count = 0;
+  for (int op_idx = 0; op_idx < subgraph->operators.size(); ++op_idx) {
+    const OperatorT* op = subgraph->operators[op_idx].get();
+    if (op == nullptr) {
+      continue;
+    }
+    for (int i = 0; i < op->inputs.size(); ++i) {
+      if (op->inputs[i] == tensor_idx) {
+        count++;
+      }
+    }
+  }
+  return count;
+}
+
+// Returns true if the Operator's weight tensor should be quantized.
+bool GetQuantizableTensorFromOperator(const ModelT* model, const OperatorT* op,
+                                      TensorT** tensor, int32_t* tensor_idx,
+                                      int32_t* op_input_index) {
+  SubGraphT* subgraph = model->subgraphs.at(0).get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+
+  if (op_code == BuiltinOperator_CONV_2D ||
+      op_code == BuiltinOperator_DEPTHWISE_CONV_2D ||
+      op_code == BuiltinOperator_FULLY_CONNECTED ||
+      op_code == BuiltinOperator_SVDF) {
+    *op_input_index = 1;
+  } else if (op_code == BuiltinOperator_LSTM) {
+    // TODO(suharshs): Add RNN, and sequential/bidi versions.
+    *op_input_index = 2;
+  } else {
+    return false;
+  }
+  *tensor_idx = op->inputs[*op_input_index];
+
+  // TODO(suharshs): Support shared weights, i.e. If two tensors share the
+  // same weight array, things may break. (i.e. SSD object detection)
+  if (CountTensorConsumers(model, subgraph, *tensor_idx) != 1) {
+    LOG(INFO) << "Skipping quantization of tensor that is shared between "
+                 "multiple multiple operations.";
+    return false;
+  }
+
+  *tensor = subgraph->tensors[*tensor_idx].get();
+
+  if ((*tensor)->type != TensorType_FLOAT32) {
+    LOG(INFO) << "Skipping quantization of tensor that is not type float.";
+    return false;
+  }
+  const uint64 num_elements = NumElements(*tensor);
+  if (num_elements < kWeightsMinSize) {
+    LOG(INFO) << "Skipping quantization of tensor because it has fewer than "
+              << kWeightsMinSize << " elements (" << num_elements << ").";
+    return false;
+  }
+
+  return true;
+}
+
+// Quantizes tensor using asymmetric quantization with the min and max elements
+// of the tensor. This is needed to pass to Dequantize operations.
+TfLiteStatus AsymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
+  BufferT* buffer = model->buffers[tensor->buffer].get();
+  float* float_data = reinterpret_cast<float*>(buffer->data.data());
+  const uint64 num_elements = NumElements(tensor);
+  LOG(INFO) << "Quantizing tensor with " << num_elements << " elements.";
+
+  // Compute the quantization params.
+  float min_value = *std::min_element(float_data, float_data + num_elements);
+  float max_value = *std::max_element(float_data, float_data + num_elements);
+  GetQuantizationParams(min_value, max_value, 0, 255,
+                        tensor->quantization.get());
+
+  // Quantize the buffer.
+  std::vector<uint8_t> quantized_buffer;
+  quantized_buffer.resize(num_elements);
+  const double inverse_scale = 1. / tensor->quantization->scale[0];
+  for (std::size_t i = 0; i < num_elements; i++) {
+    const float src_val = float_data[i];
+    double scaled_val;
+    if (tensor->quantization->scale[0] == 0) {
+      scaled_val = tensor->quantization->zero_point[0];
+    } else {
+      scaled_val =
+          tensor->quantization->zero_point[0] + inverse_scale * src_val;
+    }
+    uint8_t integer_val = static_cast<uint8_t>(std::round(scaled_val));
+    quantized_buffer[i] = integer_val;
+  }
+  model->buffers[tensor->buffer]->data = quantized_buffer;
+
+  // Update the tensor type.
+  tensor->type = TensorType_UINT8;
+
+  return kTfLiteOk;
+}
+
+// Returns the index of the Dequantize op_code.
+// If a Dequantize op_code doesn't exist, adds it and returns its index.
+int32_t GetOrInsertDequantizeOpCodeIndex(ModelT* model) {
+  for (int i = 0; i < model->operator_codes.size(); ++i) {
+    if (model->operator_codes[i]->builtin_code == BuiltinOperator_DEQUANTIZE) {
+      return i;
+    }
+  }
+  model->operator_codes.push_back(std::make_unique<OperatorCodeT>());
+  int op_code_idx = model->operator_codes.size() - 1;
+  model->operator_codes[op_code_idx]->builtin_code = BuiltinOperator_DEQUANTIZE;
+  // TODO(suharshs): How should the version be set in this op_code?
+
+  // Return the index of the newly placed OperatorCodeT.
+  return op_code_idx;
+}
+
+// Creates a Dequantize OperatorT object.
+void MakeDequantizeOperator(ModelT* model, std::unique_ptr<OperatorT>* op,
+                            int32_t input, int32_t output) {
+  OperatorT* op_raw = new OperatorT;
+  op_raw->opcode_index = GetOrInsertDequantizeOpCodeIndex(model);
+  op_raw->inputs = {input};
+  op_raw->outputs = {output};
+
+  op->reset(op_raw);
+}
+
+// Create a new TensorT object.
+void MakeTensor(const string& name, const std::vector<int32_t>& shape,
+                std::unique_ptr<TensorT>* tensor) {
+  TensorT* tensor_raw = new TensorT;
+  tensor_raw->name = name;
+  tensor_raw->shape = shape;
+
+  tensor->reset(tensor_raw);
+}
+
+}  // namespace
+
+TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
+                             const Model* input_model) {
+  std::unique_ptr<ModelT> model;
+  model.reset(input_model->UnPack());
+
+  // TODO(suharshs): When models support multiple subgraphs, add support.
+  if (model->subgraphs.size() != 1) {
+    LOG(ERROR) << "Quantize weights tool only supports tflite models with one "
+                  "subgraph.";
+    return kTfLiteError;
+  }
+
+  SubGraphT* subgraph = model->subgraphs.at(0).get();
+
+  std::vector<std::unique_ptr<OperatorT>> new_operators;
+  for (int i = 0; i < subgraph->operators.size(); ++i) {
+    OperatorT* op = subgraph->operators[i].get();
+
+    TensorT* tensor;
+    // The index of the weight tensor in subgraph->tensors.
+    int32_t tensor_idx;
+    int32_t op_input_idx;  // The index of tensor_idx in the op->inputs.
+    // TODO(suharshs): Support hybrid ops that require symmetric quantization.
+    if (GetQuantizableTensorFromOperator(model.get(), op, &tensor, &tensor_idx,
+                                         &op_input_idx)) {
+      // Quantize the tensors.
+      TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(model.get(), tensor));
+
+      // Create a new tensor to be the output of the dequantize op.
+      std::unique_ptr<TensorT> dequantize_output;
+      MakeTensor(tensor->name + "_dequantize", tensor->shape,
+                 &dequantize_output);
+      int32_t dequantize_output_idx = subgraph->tensors.size();
+      subgraph->tensors.push_back(std::move(dequantize_output));
+
+      // Create the Dequantize operation.
+      std::unique_ptr<OperatorT> dequantize_op;
+      MakeDequantizeOperator(model.get(), &dequantize_op, tensor_idx,
+                             dequantize_output_idx);
+
+      // Update the op_input of tensor_idx to dequantize_output_idx.
+      op->inputs[op_input_idx] = dequantize_output_idx;
+      // Insert the updated op.
+      new_operators.push_back(std::move(subgraph->operators[i]));
+
+      // Insert the newly created Dequantize operation.
+      new_operators.push_back(std::move(dequantize_op));
+    } else {
+      // If this tensor wasn't quantizable, just copy the op over as-is.
+      new_operators.push_back(std::move(subgraph->operators[i]));
+    }
+  }
+  // At this point all unique_ptrs in the original operators are invalid, and
+  // we need to replace it with the new_operators vector.
+  subgraph->operators = std::move(new_operators);
+
+  flatbuffers::Offset<Model> output_model_location =
+      Model::Pack(*builder, model.get());
+  FinishModelBuffer(*builder, output_model_location);
+
+  return kTfLiteOk;
+}
+
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/optimize/quantize_weights.h b/tensorflow/contrib/lite/tools/optimize/quantize_weights.h
new file mode 100644
index 0000000000..a408c1662d
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/optimize/quantize_weights.h
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
+
+#include <memory>
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace optimize {
+
+// Quantizes input_model and populates the provided builder with the new model.
+//
+// A tflite::Model can be obtained from the builder with:
+//   const uint8_t* buffer = builder->GetBufferPointer();
+//   tflite::Model* model = GetModel(buffer);
+TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
+                             const Model* input_model);
+
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
diff --git a/tensorflow/contrib/lite/tools/optimize/quantize_weights_test.cc b/tensorflow/contrib/lite/tools/optimize/quantize_weights_test.cc
new file mode 100644
index 0000000000..0e0676e5ff
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/optimize/quantize_weights_test.cc
@@ -0,0 +1,130 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/tools/optimize/quantize_weights.h"
+
+#include <memory>
+
+#include "flatbuffers/flexbuffers.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace optimize {
+namespace {
+
+class QuantizeWeightsTest : public ::testing::Test {
+ protected:
+  int GetElementsNum(const TensorT* tensor) {
+    int tensor_size = 1;
+    for (const int dim : tensor->shape) {
+      tensor_size *= dim;
+    }
+    return tensor_size;
+  }
+
+  const OperatorT* GetOpWithOutput(const SubGraphT* subgraph,
+                                   int32_t output_tensor_idx) {
+    for (int i = 0; i < subgraph->operators.size(); ++i) {
+      OperatorT* op = subgraph->operators[i].get();
+      if (std::find(op->outputs.begin(), op->outputs.end(),
+                    output_tensor_idx) != op->outputs.end()) {
+        return op;
+      }
+    }
+    return nullptr;
+  }
+
+  void CheckWeights(const Model* model_packed) {
+    std::unique_ptr<ModelT> model;
+    model.reset(model_packed->UnPack());
+
+    SubGraphT* subgraph = model->subgraphs.at(0).get();
+
+    for (int i = 0; i < subgraph->operators.size(); ++i) {
+      OperatorT* op = subgraph->operators[i].get();
+      const BuiltinOperator op_code =
+          model->operator_codes[op->opcode_index]->builtin_code;
+
+      // These are the operations that should be quantized.
+      int32_t tensor_idx;
+      if (op_code == BuiltinOperator_CONV_2D ||
+          op_code == BuiltinOperator_DEPTHWISE_CONV_2D ||
+          op_code == BuiltinOperator_FULLY_CONNECTED) {
+        tensor_idx = op->inputs[1];
+      } else if (op_code == BuiltinOperator_LSTM) {
+        // TODO(suharshs): Add tests for LSTMs.
+        tensor_idx = op->inputs[1];
+      } else {
+        continue;
+      }
+      const TensorT* tensor = subgraph->tensors[tensor_idx].get();
+      int tensor_size = GetElementsNum(tensor);
+      // If the tensor_size is less than 1024 we expect the tensor to remain
+      // unquantized.
+      if (tensor_size < 1024) {
+        ASSERT_TRUE(tensor->type == TensorType_FLOAT32) << tensor->name;
+        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
+        // The weight tensor should not come from a dequantize op.
+        ASSERT_TRUE(preceding_op == nullptr);
+      } else {
+        // The input to the op should still be float.
+        ASSERT_TRUE(tensor->type == TensorType_FLOAT32) << tensor->name;
+        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
+        ASSERT_TRUE(preceding_op != nullptr);
+        // The float input should be the dequantize output.
+        ASSERT_TRUE(
+            model->operator_codes[preceding_op->opcode_index]->builtin_code ==
+            BuiltinOperator_DEQUANTIZE);
+        // Finally, ensure that the input to the dequantize operation is
+        // quantized.
+        ASSERT_TRUE(subgraph->tensors[preceding_op->inputs[0]]->type ==
+                    TensorType_UINT8);
+        // TODO(suharshs): Add more rigorous testing for the numerical values in
+        // the tensors.
+      }
+    }
+  }
+};
+
+TEST_F(QuantizeWeightsTest, SimpleTest) {
+  string model_path =
+      "third_party/tensorflow/contrib/lite/tools/optimize/testdata/"
+      "mobilenet_v1_0.25_128.tflite";
+  std::unique_ptr<FlatBufferModel> input_fb =
+      FlatBufferModel::BuildFromFile(model_path.data());
+  const Model* input_model = input_fb->GetModel();
+
+  flatbuffers::FlatBufferBuilder builder;
+  EXPECT_EQ(QuantizeWeights(&builder, input_model), kTfLiteOk);
+
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+
+  CheckWeights(output_model);
+}
+
+// TODO(suharshs): Add tests that run the resulting model.
+
+}  // namespace
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/util.cc b/tensorflow/contrib/lite/util.cc
index 8ccb65c24f..7950653da9 100644
--- a/tensorflow/contrib/lite/util.cc
+++ b/tensorflow/contrib/lite/util.cc
@@ -14,8 +14,15 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/util.h"
 
+#include <cstring>
+
 namespace tflite {
 
+bool IsEagerOp(const char* custom_name) {
+  return custom_name && strncmp(custom_name, kEagerCustomCodePrefix,
+                                strlen(kEagerCustomCodePrefix)) == 0;
+}
+
 TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input) {
   return ConvertArrayToTfLiteIntArray(input.size(), input.data());
 }
diff --git a/tensorflow/contrib/lite/util.h b/tensorflow/contrib/lite/util.h
index 3c4801183b..f5b208afbb 100644
--- a/tensorflow/contrib/lite/util.h
+++ b/tensorflow/contrib/lite/util.h
@@ -26,6 +26,16 @@ limitations under the License.
 
 namespace tflite {
 
+// The prefix of Eager op custom code.
+// This will be matched agains the `custom_code` field in `OperatorCode`
+// Flatbuffer Table.
+// WARNING: This is an experimental API and subject to change.
+constexpr char kEagerCustomCodePrefix[] = "Eager";
+
+// Checks whether the prefix of the custom name indicates the operation is an
+// Eager operation.
+bool IsEagerOp(const char* custom_name);
+
 // Converts a `std::vector` to a `TfLiteIntArray`. The caller takes ownership
 // of the returned pointer.
 TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input);
diff --git a/tensorflow/contrib/lite/util_test.cc b/tensorflow/contrib/lite/util_test.cc
index 04579c53aa..32bf917a59 100644
--- a/tensorflow/contrib/lite/util_test.cc
+++ b/tensorflow/contrib/lite/util_test.cc
@@ -41,6 +41,16 @@ TEST(ConvertVectorToTfLiteIntArray, TestWithEmptyVector) {
   TfLiteIntArrayFree(output);
 }
 
+TEST(UtilTest, IsEagerOp) {
+  EXPECT_TRUE(IsEagerOp("Eager"));
+  EXPECT_TRUE(IsEagerOp("EagerOp"));
+  EXPECT_FALSE(IsEagerOp("eager"));
+  EXPECT_FALSE(IsEagerOp("Eage"));
+  EXPECT_FALSE(IsEagerOp("OpEager"));
+  EXPECT_FALSE(IsEagerOp(nullptr));
+  EXPECT_FALSE(IsEagerOp(""));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index e3928a82a2..83e80f25bc 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -34,6 +34,7 @@ tf_py_test(
         ":lookup_py",
         "//third_party/py/numpy",
         "@six_archive//:six",
+        "//tensorflow/contrib/data",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index 8c0bfefb30..f83765a48d 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -18,6 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_lookup_ops
@@ -39,6 +42,7 @@ from tensorflow.python.ops.lookup_ops import TextFileIndex
 from tensorflow.python.ops.lookup_ops import TextFileInitializer
 from tensorflow.python.ops.lookup_ops import TextFileStringTableInitializer
 # pylint: enable=unused-import
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saver import BaseSaverBuilder
 from tensorflow.python.util.deprecation import deprecated
 
@@ -285,7 +289,7 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   return table.lookup(tensor)
 
 
-class MutableHashTable(LookupInterface):
+class MutableHashTable(LookupInterface, checkpointable.CheckpointableBase):
   """A generic mutable hash table implementation.
 
   Data can be inserted by calling the insert method. It does not support
@@ -336,6 +340,13 @@ class MutableHashTable(LookupInterface):
                                                 dtype=value_dtype)
     self._value_shape = self._default_value.get_shape()
 
+    executing_eagerly = context.executing_eagerly()
+    if executing_eagerly and shared_name is None:
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      shared_name = "table_%d" % (ops.uid(),)
     # The table must be shared if checkpointing is requested for multi-worker
     # training to work correctly. Use the node name if no shared_name has been
     # explicitly specified.
@@ -355,9 +366,12 @@ class MutableHashTable(LookupInterface):
           value_dtype=value_dtype,
           value_shape=self._default_value.get_shape(),
           name=name)
+    if executing_eagerly:
+      op_name = None
+    else:
+      op_name = self._table_ref.op.name.split("/")[-1]
     super(MutableHashTable, self).__init__(key_dtype, value_dtype,
-                                           self._table_ref.op.name.split(
-                                               "/")[-1])
+                                           op_name)
 
     if checkpoint:
       saveable = MutableHashTable._Saveable(self, name)
@@ -419,11 +433,10 @@ class MutableHashTable(LookupInterface):
       TypeError: when `keys` or `values` doesn't match the table data
         types.
     """
-    # pylint: disable=protected-access
-    lookup_ops._check_table_dtypes(self, keys.dtype, values.dtype)
-    # pylint: enable=protected-access
     with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
                         [self._table_ref, keys, values]) as name:
+      keys = ops.convert_to_tensor(keys, self._key_dtype, name="keys")
+      values = ops.convert_to_tensor(values, self._value_dtype, name="values")
       with ops.colocate_with(self._table_ref):
         # pylint: disable=protected-access
         op = gen_lookup_ops.lookup_table_insert_v2(
@@ -447,6 +460,10 @@ class MutableHashTable(LookupInterface):
             self._table_ref, self._key_dtype, self._value_dtype, name=name)
     return exported_keys, exported_values
 
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {"table": functools.partial(MutableHashTable._Saveable, table=self)}
+
   class _Saveable(BaseSaverBuilder.SaveableObject):
     """SaveableObject implementation for MutableHashTable."""
 
@@ -459,14 +476,15 @@ class MutableHashTable(LookupInterface):
       # pylint: disable=protected-access
       super(MutableHashTable._Saveable, self).__init__(table, specs, name)
 
-    def restore(self, restored_tensors, unused_restored_shapes):
+    def restore(self, restored_tensors, restored_shapes):
+      del restored_shapes  # unused
       # pylint: disable=protected-access
       with ops.colocate_with(self.op._table_ref):
         return gen_lookup_ops.lookup_table_import_v2(
             self.op._table_ref, restored_tensors[0], restored_tensors[1])
 
 
-class MutableDenseHashTable(LookupInterface):
+class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
   """A generic mutable hash table implementation using tensors as backing store.
 
   Data can be inserted by calling the insert method. It does not support
@@ -537,6 +555,13 @@ class MutableDenseHashTable(LookupInterface):
     use_node_name_sharing = checkpoint and shared_name is None
     empty_key = ops.convert_to_tensor(
         empty_key, dtype=key_dtype, name="empty_key")
+    executing_eagerly = context.executing_eagerly()
+    if executing_eagerly and shared_name is None:
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      shared_name = "table_%d" % (ops.uid(),)
     self._table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
         empty_key=empty_key,
         shared_name=shared_name,
@@ -545,8 +570,12 @@ class MutableDenseHashTable(LookupInterface):
         value_shape=self._value_shape,
         initial_num_buckets=initial_num_buckets,
         name=name)
+    if executing_eagerly:
+      op_name = None
+    else:
+      op_name = self._table_ref.op.name.split("/")[-1]
     super(MutableDenseHashTable, self).__init__(
-        key_dtype, value_dtype, self._table_ref.op.name.split("/")[-1])
+        key_dtype, value_dtype, op_name)
 
     if checkpoint:
       saveable = MutableDenseHashTable._Saveable(self, name)
@@ -637,6 +666,11 @@ class MutableDenseHashTable(LookupInterface):
 
     return exported_keys, exported_values
 
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {"table": functools.partial(
+        MutableDenseHashTable._Saveable, table=self)}
+
   class _Saveable(BaseSaverBuilder.SaveableObject):
     """SaveableObject implementation for MutableDenseHashTable."""
 
@@ -649,7 +683,8 @@ class MutableDenseHashTable(LookupInterface):
       # pylint: disable=protected-access
       super(MutableDenseHashTable._Saveable, self).__init__(table, specs, name)
 
-    def restore(self, restored_tensors, unused_restored_shapes):
+    def restore(self, restored_tensors, restored_shapes):
+      del restored_shapes  # unused
       # pylint: disable=protected-access
       with ops.colocate_with(self.op._table_ref):
         return gen_lookup_ops.lookup_table_import_v2(
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 6fb5244fc6..0a54bb1f5e 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 import six
 
 from tensorflow.contrib import lookup
+from tensorflow.contrib.data.python.ops import counter
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -37,6 +38,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
+from tensorflow.python.training.checkpointable import util as checkpointable
 
 
 class HashTableOpTest(test.TestCase):
@@ -331,7 +333,7 @@ class MutableHashTableOpTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v0 = variables.Variable(10.0, name="v0")
       v1 = variables.Variable(20.0, name="v1")
 
@@ -356,7 +358,7 @@ class MutableHashTableOpTest(test.TestCase):
       self.assertTrue(isinstance(val, six.string_types))
       self.assertEqual(save_path, val)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v0 = variables.Variable(-1.0, name="v0")
       v1 = variables.Variable(-1.0, name="v1")
       default_val = -1
@@ -382,6 +384,59 @@ class MutableHashTableOpTest(test.TestCase):
       output = table.lookup(input_string)
       self.assertAllEqual([-1, 0, 1, 2, -1], output.eval())
 
+  @test_util.run_in_graph_and_eager_modes
+  def testObjectSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    v0 = variables.Variable(10.0, name="v0")
+    v1 = variables.Variable(20.0, name="v1")
+
+    default_val = -1
+    keys = constant_op.constant(["b", "c", "d"], dtypes.string)
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup.MutableHashTable(
+        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+
+    checkpoint = checkpointable.Checkpoint(table=table, v0=v0, v1=v1)
+    self.evaluate([v0.initializer, v1.initializer])
+
+    # Check that the parameter nodes have been initialized.
+    self.assertEqual(10.0, self.evaluate(v0))
+    self.assertEqual(20.0, self.evaluate(v1))
+
+    self.assertAllEqual(0, self.evaluate(table.size()))
+    self.evaluate(table.insert(keys, values))
+    self.assertAllEqual(3, self.evaluate(table.size()))
+
+    save_path = checkpoint.save(save_prefix)
+    del table, checkpoint, v0, v1
+
+    v0 = variables.Variable(-1.0, name="v0")
+    v1 = variables.Variable(-1.0, name="v1")
+    default_val = -1
+    table = lookup.MutableHashTable(
+        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+    self.evaluate(table.insert(
+        constant_op.constant(["a", "c"], dtypes.string),
+        constant_op.constant([12, 24], dtypes.int64)))
+    self.assertAllEqual(2, self.evaluate(table.size()))
+
+    checkpoint = checkpointable.Checkpoint(table=table, v0=v0, v1=v1)
+
+    # Restore the saved values in the parameter nodes.
+    checkpoint.restore(save_path).run_restore_ops()
+    # Check that the parameter nodes have been restored.
+    self.assertEqual(10.0, self.evaluate(v0))
+    self.assertEqual(20.0, self.evaluate(v1))
+
+    self.assertAllEqual(3, self.evaluate(table.size()))
+
+    input_string = constant_op.constant(["a", "b", "c", "d", "e"],
+                                        dtypes.string)
+    output = table.lookup(input_string)
+    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
+
   def testSharing(self):
     # Start a server to store the table state
     server = server_lib.Server(
@@ -646,11 +701,11 @@ class MutableHashTableOpTest(test.TestCase):
                                       default_val)
 
       # insert with keys of the wrong type
-      with self.assertRaises(TypeError):
+      with self.assertRaises(ValueError):
         table.insert(constant_op.constant([4, 5, 6]), values).run()
 
       # insert with values of the wrong type
-      with self.assertRaises(TypeError):
+      with self.assertRaises(ValueError):
         table.insert(keys, constant_op.constant(["a", "b", "c"])).run()
 
       self.assertAllEqual(0, table.size().eval())
@@ -957,7 +1012,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       default_value = -1
       empty_key = 0
       keys = constant_op.constant([11, 12, 13], dtypes.int64)
@@ -982,7 +1037,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
       self.assertTrue(isinstance(val, six.string_types))
       self.assertEqual(save_path, val)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       table = lookup.MutableDenseHashTable(
           dtypes.int64,
           dtypes.int64,
@@ -1009,11 +1064,65 @@ class MutableDenseHashTableOpTest(test.TestCase):
       output = table.lookup(input_string)
       self.assertAllEqual([-1, 0, 1, 2, -1], output.eval())
 
+  @test_util.run_in_graph_and_eager_modes
+  def testObjectSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    default_value = -1
+    empty_key = 0
+    keys = constant_op.constant([11, 12, 13], dtypes.int64)
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    save_table = lookup.MutableDenseHashTable(
+        dtypes.int64,
+        dtypes.int64,
+        default_value=default_value,
+        empty_key=empty_key,
+        name="t1",
+        checkpoint=True,
+        initial_num_buckets=32)
+
+    save_checkpoint = checkpointable.Checkpoint(table=save_table)
+
+    self.assertAllEqual(0, self.evaluate(save_table.size()))
+    self.evaluate(save_table.insert(keys, values))
+    self.assertAllEqual(3, self.evaluate(save_table.size()))
+    self.assertAllEqual(32, len(self.evaluate(save_table.export()[0])))
+
+    save_path = save_checkpoint.save(save_prefix)
+    del save_table, save_checkpoint
+
+    load_table = lookup.MutableDenseHashTable(
+        dtypes.int64,
+        dtypes.int64,
+        default_value=default_value,
+        empty_key=empty_key,
+        name="t1",
+        checkpoint=True,
+        initial_num_buckets=64)
+    self.evaluate(load_table.insert(
+        constant_op.constant([11, 14], dtypes.int64),
+        constant_op.constant([12, 24], dtypes.int64)))
+    self.assertAllEqual(2, self.evaluate(load_table.size()))
+    self.assertAllEqual(64, len(self.evaluate(load_table.export()[0])))
+
+    restore_checkpoint = checkpointable.Checkpoint(table=load_table)
+
+    # Restore the saved values in the parameter nodes.
+    restore_checkpoint.restore(save_path).run_restore_ops()
+
+    self.assertAllEqual(3, self.evaluate(load_table.size()))
+    self.assertAllEqual(32, len(self.evaluate(load_table.export()[0])))
+
+    input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
+    output = load_table.lookup(input_string)
+    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
+
   def testVectorSaveRestore(self):
     save_dir = os.path.join(self.get_temp_dir(), "vector_save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       empty_key = constant_op.constant([11, 13], dtypes.int64)
       default_value = constant_op.constant([-1, -2], dtypes.int64)
       keys = constant_op.constant([[11, 12], [11, 14], [13, 14]], dtypes.int64)
@@ -1038,7 +1147,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
       self.assertTrue(isinstance(val, six.string_types))
       self.assertEqual(save_path, val)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       empty_key = constant_op.constant([11, 13], dtypes.int64)
       default_value = constant_op.constant([-1, -2], dtypes.int64)
       table = lookup.MutableDenseHashTable(
@@ -1073,7 +1182,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "vector_scalar_save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       empty_key = constant_op.constant([11, 13], dtypes.int64)
       default_value = constant_op.constant(-1, dtypes.int64)
       keys = constant_op.constant([[11, 12], [11, 14], [13, 14]], dtypes.int64)
@@ -1098,7 +1207,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
       self.assertTrue(isinstance(val, six.string_types))
       self.assertEqual(save_path, val)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       empty_key = constant_op.constant([11, 13], dtypes.int64)
       default_value = constant_op.constant(-1, dtypes.int64)
       table = lookup.MutableDenseHashTable(
@@ -2397,5 +2506,60 @@ class IdTableWithHashBucketsTest(test.TestCase):
             hasher_spec=lookup.StrongHashSpec([None, 2]))
 
 
+class MutableHashTableBenchmark(test.Benchmark):
+
+  def _create_table(self):
+    return lookup.MutableHashTable(dtypes.int64, dtypes.float32, 0.0)
+
+  def benchmark_single_repeated_scalar_insert_scalar(self):
+    table = self._create_table()
+    value = variables.Variable(1.0)
+    insert = table.insert(0, value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
+      assert sess.run(size) == 1
+
+  def benchmark_many_repeated_scalar_insert_scalar(self):
+    table = self._create_table()
+    c = counter.Counter().make_one_shot_iterator().get_next()
+    value = variables.Variable(1.0)
+    insert = table.insert(c, value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
+      assert sess.run(size) >= 10000
+
+  def benchmark_single_repeated_batch_32_insert_scalar(self):
+    table = self._create_table()
+    value = variables.Variable([1.0] * 32)
+    insert = table.insert(list(range(32)), value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
+      assert sess.run(size) == 32
+
+  def benchmark_many_repeated_batch_32_insert_scalar(self):
+    table = self._create_table()
+    c = counter.Counter().make_one_shot_iterator().get_next()
+    value = variables.Variable([1.0] * 32)
+    insert = table.insert(32 * c + list(range(32)), value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
+      assert sess.run(size) >= 1000*32
+
+
+class MutableDenseHashTableBenchmark(MutableHashTableBenchmark):
+
+  def _create_table(self):
+    return lookup.MutableDenseHashTable(
+        dtypes.int64, dtypes.float32, default_value=0.0, empty_key=-1)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/losses/__init__.py b/tensorflow/contrib/losses/__init__.py
index db58647d48..92b380df53 100644
--- a/tensorflow/contrib/losses/__init__.py
+++ b/tensorflow/contrib/losses/__init__.py
@@ -15,7 +15,7 @@
 
 """Ops for building neural network losses.
 
-See @{$python/contrib.losses}.
+See [Contrib Losses](https://tensorflow.org/api_guides/python/contrib.losses).
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/losses/python/losses/__init__.py b/tensorflow/contrib/losses/python/losses/__init__.py
index 6e9d1d4a77..1675387227 100644
--- a/tensorflow/contrib/losses/python/losses/__init__.py
+++ b/tensorflow/contrib/losses/python/losses/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Ops for building neural network losses.
 
-See @{$python/contrib.losses}.
+See [Contrib Losses](https://tensorflow.org/api_guides/python/contrib.losses).
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/losses/python/metric_learning/__init__.py b/tensorflow/contrib/losses/python/metric_learning/__init__.py
index 4e551d6aca..3d93a4d0ac 100644
--- a/tensorflow/contrib/losses/python/metric_learning/__init__.py
+++ b/tensorflow/contrib/losses/python/metric_learning/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Ops for building neural network losses.
 
-See @{$python/contrib.losses}.
+See [Contrib Losses](https://tensorflow.org/api_guides/python/contrib.losses).
 """
 
 from __future__ import absolute_import
@@ -35,5 +35,3 @@ _allowed_symbols = [
     'triplet_semihard_loss',
 ]
 remove_undocumented(__name__, _allowed_symbols)
-
-
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 1a1ab54a53..d962a5e12d 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -90,6 +90,7 @@ HOST_INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
 -I$(MAKEFILE_DIR)/downloads/double_conversion \
+-I$(MAKEFILE_DIR)/downloads/absl \
 -I$(HOST_GENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
@@ -116,6 +117,25 @@ ifeq ($(HOST_OS),PI)
 	HOST_LIBS += -ldl -lpthread
 endif
 
+# Abseil sources.
+ABSL_CC_ALL_SRCS := \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*/*.cc)
+
+ABSL_CC_EXCLUDE_SRCS := \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*test*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*test*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*test*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*/*test*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*benchmark*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*benchmark*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*benchmark*.cc) \
+$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*/*benchmark*.cc) \
+tensorflow/contrib/makefile/downloads/absl/absl/synchronization/internal/mutex_nonprod.cc
+
+ABSL_CC_SRCS := $(filter-out $(ABSL_CC_EXCLUDE_SRCS), $(ABSL_CC_ALL_SRCS))
 
 # proto_text is a tool that converts protobufs into a form we can use more
 # compactly within TensorFlow. It's a bit like protoc, but is designed to
@@ -125,7 +145,9 @@ endif
 PROTO_TEXT := $(HOST_BINDIR)proto_text
 # The list of dependencies is derived from the Bazel build file by running
 # the gen_file_lists.sh script on a system with a working Bazel setup.
-PROTO_TEXT_CC_FILES := $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt)
+PROTO_TEXT_CC_FILES := \
+  $(ABSL_CC_SRCS) \
+  $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt)
 PROTO_TEXT_PB_CC_LIST := \
 	$(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) \
 	$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc)
@@ -175,6 +197,7 @@ INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
 -I$(MAKEFILE_DIR)/downloads/double_conversion \
+-I$(MAKEFILE_DIR)/downloads/absl \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
@@ -236,7 +259,6 @@ ifeq ($(TARGET),PI)
 endif
 
 # Set up Android building
-# LINT.IfChange
 ifeq ($(TARGET),ANDROID)
 # Override NDK_ROOT on the command line with your own NDK location, e.g.
 # make -f tensorflow/contrib/makefile/Makefile TARGET=ANDROID \
@@ -331,6 +353,7 @@ $(MARCH_OPTION) \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
 -I$(MAKEFILE_DIR)/downloads/double_conversion \
+-I$(MAKEFILE_DIR)/downloads/absl \
 -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -446,7 +469,6 @@ $(MARCH_OPTION) \
 		DEPDIR := $(DEPDIR)android_$(ANDROID_ARCH)/
 	endif # ifeq ($(BUILD_FOR_TEGRA),1)
 endif  # ANDROID
-# LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
 # Settings for iOS.
 ifeq ($(TARGET),IOS)
@@ -596,6 +618,7 @@ BENCHMARK_NAME := $(BINDIR)benchmark
 # gen_file_lists.sh script.
 
 CORE_CC_ALL_SRCS := \
+$(ABSL_CC_SRCS) \
 $(wildcard tensorflow/core/*.cc) \
 $(wildcard tensorflow/core/common_runtime/*.cc) \
 $(wildcard tensorflow/core/framework/*.cc) \
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index a28fc3a87f..cb4c94d92f 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -256,6 +256,7 @@ for arch in $archs; do
                 esac
 
                 makefile='
+			AR := ${NDK_ROOT}/toolchains/'"$toolchain"'/prebuilt/'"$android_os_arch"'/bin/'"$bin_prefix"'-ar
                         CC=${CC_PREFIX} \
                            ${NDK_ROOT}/toolchains/'"$toolchain"'/prebuilt/'"$android_os_arch"'/bin/'"$bin_prefix"'-g++
                         PLATFORM_CPPFLAGS=--sysroot \
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index ecf2e120df..66a3315700 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -301,7 +301,6 @@ tensorflow/core/ops/array_grad.cc
 tensorflow/core/kernels/spacetobatch_functor.cc
 tensorflow/core/kernels/spacetobatch_op.cc
 tensorflow/core/kernels/batchtospace_op.cc
-tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/ops/audio_ops.cc
 tensorflow/core/kernels/decode_proto_op.cc
diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index 88798d61b7..5645784f8d 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """Ops for evaluation metrics and summary statistics.
 
-See the @{$python/contrib.metrics} guide.
+See the
+[Contrib Metrics](https://tensorflow.org/api_guides/python/contrib.metrics)
+guide.
 
 @@auc_with_confidence_intervals
 @@streaming_accuracy
diff --git a/tensorflow/contrib/metrics/python/metrics/classification.py b/tensorflow/contrib/metrics/python/metrics/classification.py
index e553612269..7053907da0 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification.py
@@ -24,7 +24,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics_impl
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 
 # TODO(nsilberman): move into metrics/python/ops/
 
@@ -174,7 +174,7 @@ def f1_score(labels, predictions, weights=None, num_thresholds=200,
         ops.add_to_collections(metrics_collections, best_f1)
       return best_f1
 
-    best_f1 = distribute_lib.get_tower_context().merge_call(
+    best_f1 = distribution_strategy_context.get_tower_context().merge_call(
         f1_across_towers, values)
 
     update_op = compute_best_f1_score(tp=update_ops['tp'], fp=update_ops['fp'],
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index a328670526..bbf5d3f30c 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2532,7 +2532,8 @@ def sparse_recall_at_top_k(labels,
         name=name_scope)
 
 
-def _compute_recall_at_precision(tp, fp, fn, precision, name):
+def _compute_recall_at_precision(tp, fp, fn, precision, name,
+                                 strict_mode=False):
   """Helper function to compute recall at a given `precision`.
 
   Args:
@@ -2541,17 +2542,42 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name):
     fn: The number of false negatives.
     precision: The precision for which the recall will be calculated.
     name: An optional variable_scope name.
+    strict_mode: If true and there exists a threshold where the precision is
+      no smaller than the target precision, return the corresponding recall at
+      the threshold. Otherwise, return 0. If false, find the threshold where the
+      precision is closest to the target precision and return the recall at the
+      threshold.
 
   Returns:
     The recall at a given `precision`.
   """
   precisions = math_ops.div(tp, tp + fp + _EPSILON)
-  tf_index = math_ops.argmin(
-      math_ops.abs(precisions - precision), 0, output_type=dtypes.int32)
+  if not strict_mode:
+    tf_index = math_ops.argmin(
+        math_ops.abs(precisions - precision), 0, output_type=dtypes.int32)
+    # Now, we have the implicit threshold, so compute the recall:
+    return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + _EPSILON,
+                        name)
+  else:
+    # We aim to find the threshold where the precision is minimum but no smaller
+    # than the target precision.
+    # The rationale:
+    # 1. Compute the difference between precisions (by different thresholds) and
+    #   the target precision.
+    # 2. Take the reciprocal of the values by the above step. The intention is
+    #   to make the positive values rank before negative values and also the
+    #   smaller positives rank before larger positives.
+    tf_index = math_ops.argmax(
+        math_ops.div(1.0, precisions - precision + _EPSILON),
+        0,
+        output_type=dtypes.int32)
+
+    def _return_good_recall():
+      return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + _EPSILON,
+                          name)
 
-  # Now, we have the implicit threshold, so compute the recall:
-  return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + _EPSILON,
-                      name)
+    return control_flow_ops.cond(precisions[tf_index] >= precision,
+                                 _return_good_recall, lambda: .0)
 
 
 def recall_at_precision(labels,
@@ -2561,7 +2587,8 @@ def recall_at_precision(labels,
                         num_thresholds=200,
                         metrics_collections=None,
                         updates_collections=None,
-                        name=None):
+                        name=None,
+                        strict_mode=False):
   """Computes `recall` at `precision`.
 
   The `recall_at_precision` function creates four local variables,
@@ -2593,6 +2620,11 @@ def recall_at_precision(labels,
     updates_collections: An optional list of collections that `update_op` should
       be added to.
     name: An optional variable_scope name.
+    strict_mode: If true and there exists a threshold where the precision is
+      above the target precision, return the corresponding recall at the
+      threshold. Otherwise, return 0. If false, find the threshold where the
+      precision is closest to the target precision and return the recall at the
+      threshold.
 
   Returns:
     recall: A scalar `Tensor` representing the recall at the given
@@ -2621,10 +2653,11 @@ def recall_at_precision(labels,
         predictions, labels, thresholds, weights)
 
     recall = _compute_recall_at_precision(values['tp'], values['fp'],
-                                          values['fn'], precision, 'value')
+                                          values['fn'], precision, 'value',
+                                          strict_mode)
     update_op = _compute_recall_at_precision(update_ops['tp'], update_ops['fp'],
                                              update_ops['fn'], precision,
-                                             'update_op')
+                                             'update_op', strict_mode)
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, recall)
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 401fedcbed..024bd54912 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -3467,6 +3467,60 @@ class RecallAtPrecisionTest(test.TestCase):
       self.assertAlmostEqual(target_recall, sess.run(update_op))
       self.assertAlmostEqual(target_recall, recall.eval())
 
+  def _test_strict_mode(self, strict_mode, target_precision, expected_recall):
+    num_thresholds = 11
+    predictions_values = [.2, .3, .5, .6, .7, .8, .9, .9, .9, .1]
+    labels_values = [1, 1, 0, 0, 0, 0, 0, 0, 0, 1]
+    # Resulting thresholds and the corresponding precision and recall values at
+    # each threshold:
+    # Thresholds  [0.1   0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9]
+    # precisions: [0.3   0.2  0.1  0    0    0    0    0    0]
+    # recalls:    [1.0   0.7  0.3  0    0    0    0    0    0]
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    recall, update_op = metrics.recall_at_precision(
+        labels,
+        predictions,
+        num_thresholds=num_thresholds,
+        precision=target_precision,
+        strict_mode=strict_mode)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(expected_recall, sess.run(update_op))
+      self.assertAlmostEqual(expected_recall, recall.eval())
+
+  def testStrictMode_Off(self):
+    # strict_mode is turned off and return the recall at the threshold where the
+    # precision (0.3) is closest to target precision (0.9). The recall
+    # corresponding to the threshold is 1.0.
+    self._test_strict_mode(
+        strict_mode=False, target_precision=0.9, expected_recall=1.0)
+
+  def testStrictMode_OnAndFail(self):
+    # strict_mode is turned on and we fail to reach the target precision at any
+    # threshold.
+    # Target precision: 0.9
+    # Diff:       [-0.6  -0.7  -0.8  -0.9  -0.9  -0.9  -0.9  -0.9  -0.9]
+    # Reciprocal: [-1.6  -1.4  -1.3  -1.1  -1.1  -1.1  -1.1  -1.1  -1.1]
+    # Max index: 3 and corresponding precision is: 0 which is smaller than
+    # target precsion 0.9. As a result, the expected recall is 0.
+    self._test_strict_mode(
+        strict_mode=True, target_precision=0.9, expected_recall=.0)
+
+  def testStrictMode_OnAndSucceed(self):
+    # strict_mode is on and we can reach the target precision at certain
+    # threshold.
+    # Target precision: 0.2
+    # Diff:       [0.1   0      -0.1  -0.2  -0.2  -0.2  -0.2  -0.2  -0.2]
+    # Reciprocal: [10    infty  -10.0 -5.0  -5.0  -5.0  -5.0  -5.0  -5.0]
+    # Max index: 1 and corresponding precision is: 0.2 which is no smaller than
+    # target precsion 0.2. In this case, we return the recall at index 1, which
+    # is 2.0/3 (0.7).
+    self._test_strict_mode(
+        strict_mode=True, target_precision=0.2, expected_recall=2.0 / 3)
+
 
 class PrecisionAtRecallTest(test.TestCase):
 
@@ -3963,7 +4017,7 @@ class StreamingSparsePrecisionTest(test.TestCase):
                                             expected,
                                             class_id=None,
                                             weights=None):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       if weights is not None:
         weights = constant_op.constant(weights, dtypes_lib.float32)
       metric, update = metrics.streaming_sparse_precision_at_k(
@@ -3992,7 +4046,7 @@ class StreamingSparsePrecisionTest(test.TestCase):
                                                 expected,
                                                 class_id=None,
                                                 weights=None):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       if weights is not None:
         weights = constant_op.constant(weights, dtypes_lib.float32)
       metric, update = metrics.streaming_sparse_precision_at_top_k(
@@ -4021,7 +4075,7 @@ class StreamingSparsePrecisionTest(test.TestCase):
                                                     k,
                                                     expected,
                                                     weights=None):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       if weights is not None:
         weights = constant_op.constant(weights, dtypes_lib.float32)
       predictions = constant_op.constant(predictions, dtypes_lib.float32)
@@ -4047,7 +4101,7 @@ class StreamingSparsePrecisionTest(test.TestCase):
                                                         labels,
                                                         expected,
                                                         weights=None):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       if weights is not None:
         weights = constant_op.constant(weights, dtypes_lib.float32)
       metric, update = metrics.streaming_sparse_average_precision_at_top_k(
@@ -4635,7 +4689,7 @@ class StreamingSparseRecallTest(test.TestCase):
                                          expected,
                                          class_id=None,
                                          weights=None):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       if weights is not None:
         weights = constant_op.constant(weights, dtypes_lib.float32)
       metric, update = metrics.streaming_sparse_recall_at_k(
@@ -4664,7 +4718,7 @@ class StreamingSparseRecallTest(test.TestCase):
                                    expected,
                                    class_id=None,
                                    weights=None):
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       if weights is not None:
         weights = constant_op.constant(weights, dtypes_lib.float32)
       metric, update = metric_ops.sparse_recall_at_top_k(
diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
index 16ddc38f5a..e662b11be8 100644
--- a/tensorflow/contrib/model_pruning/BUILD
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -119,6 +119,7 @@ py_test(
     deps = [
         ":pruning_utils",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index cd58526ed3..a81abac2fa 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -476,8 +476,8 @@ class Pruning(object):
 
       smoothed_threshold, new_mask = self._update_mask(pooled_weights,
                                                        threshold)
-      updated_mask = pruning_utils.kronecker_product(
-          new_mask, array_ops.ones(self._block_dim))
+
+      updated_mask = pruning_utils.expand_tensor(new_mask, self._block_dim)
       sliced_mask = array_ops.slice(
           updated_mask, [0, 0],
           [squeezed_weights.get_shape()[0],
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index 33c4ad58bd..cd3d8e76bb 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -61,14 +61,14 @@ class PruningHParamsTest(test.TestCase):
     self.assertEqual(p._weight_sparsity_map["conv2/kernel"], 0.8)
 
   def testInitWithExternalSparsity(self):
-    with self.test_session():
+    with self.cached_session():
       p = pruning.Pruning(spec=self.pruning_hparams, sparsity=self.sparsity)
       variables.global_variables_initializer().run()
       sparsity = p._sparsity.eval()
       self.assertAlmostEqual(sparsity, 0.5)
 
   def testInitWithVariableReuse(self):
-    with self.test_session():
+    with self.cached_session():
       p = pruning.Pruning(spec=self.pruning_hparams, sparsity=self.sparsity)
       p_copy = pruning.Pruning(
           spec=self.pruning_hparams, sparsity=self.sparsity)
@@ -87,7 +87,7 @@ class PruningTest(test.TestCase):
   def testCreateMask2D(self):
     width = 10
     height = 20
-    with self.test_session():
+    with self.cached_session():
       weights = variables.Variable(
           random_ops.random_normal([width, height], stddev=1), name="weights")
       masked_weights = pruning.apply_mask(weights,
@@ -98,7 +98,7 @@ class PruningTest(test.TestCase):
       self.assertAllEqual(weights_val, masked_weights_val)
 
   def testUpdateSingleMask(self):
-    with self.test_session() as session:
+    with self.cached_session() as session:
       weights = variables.Variable(
           math_ops.linspace(1.0, 100.0, 100), name="weights")
       masked_weights = pruning.apply_mask(weights)
@@ -122,7 +122,7 @@ class PruningTest(test.TestCase):
 
     # Set up pruning
     p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
-    with self.test_session():
+    with self.cached_session():
       variables.global_variables_initializer().run()
       _, new_mask = p._maybe_update_block_mask(weights, threshold)
       # Check if the mask is the same size as the weights
@@ -167,7 +167,7 @@ class PruningTest(test.TestCase):
 
   def testPartitionedVariableMasking(self):
     partitioner = partitioned_variables.variable_axis_size_partitioner(40)
-    with self.test_session() as session:
+    with self.cached_session() as session:
       with variable_scope.variable_scope("", partitioner=partitioner):
         sparsity = variables.Variable(0.5, name="Sparsity")
         weights = variable_scope.get_variable(
@@ -201,7 +201,7 @@ class PruningTest(test.TestCase):
     sparsity_val = math_ops.linspace(0.0, 0.9, 10)
     increment_global_step = state_ops.assign_add(self.global_step, 1)
     non_zero_count = []
-    with self.test_session() as session:
+    with self.cached_session() as session:
       variables.global_variables_initializer().run()
       for i in range(10):
         session.run(state_ops.assign(sparsity, sparsity_val[i]))
@@ -234,7 +234,7 @@ class PruningTest(test.TestCase):
     mask_update_op = p.conditional_mask_update_op()
     increment_global_step = state_ops.assign_add(self.global_step, 1)
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       variables.global_variables_initializer().run()
       for _ in range(110):
         session.run(mask_update_op)
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py
index ef6c6a3f5d..b50a372e9d 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py
@@ -69,7 +69,7 @@ def weight_threshold_variable(var, scope):
     scope: The variable scope of the variable var
 
   Returns:
-    a scalar threshold variable initialized to 0.
+    A scalar threshold variable initialized to 0.
   """
   with variable_scope.variable_scope(scope):
     threshold = variable_scope.get_variable(
@@ -97,6 +97,74 @@ def kronecker_product(mat1, mat2):
   return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
 
 
+def expand_tensor(tensor, block_dims):
+  """Expands a 2D tensor by replicating the tensor values.
+
+  This is equivalent to the kronecker product of the tensor and a matrix of
+  ones of size block_dims.
+
+  Example:
+
+  tensor = [[1,2]
+            [3,4]]
+  block_dims = [2,2]
+
+  result = [[1 1 2 2]
+            [1 1 2 2]
+            [3 3 4 4]
+            [3 3 4 4]]
+
+  Args:
+    tensor: A 2D tensor that needs to be expanded.
+    block_dims: List of integers specifying the expansion factor.
+
+  Returns:
+    The expanded tensor
+
+  Raises:
+    ValueError: if tensor is not rank-2 or block_dims is does not have 2
+    elements.
+  """
+  if tensor.get_shape().ndims != 2:
+    raise ValueError('Input tensor must be rank 2')
+
+  if len(block_dims) != 2:
+    raise ValueError('block_dims must have 2 elements')
+
+  block_height, block_width = block_dims
+
+  def _tile_rows(tensor, multiple):
+    """Create a new tensor by tiling the tensor along rows."""
+    return array_ops.tile(tensor, [multiple, 1])
+
+  def _generate_indices(num_rows, block_dim):
+    indices = np.zeros(shape=[num_rows * block_dim, 1], dtype=np.int32)
+    for k in range(block_dim):
+      for r in range(num_rows):
+        indices[k * num_rows + r] = r * block_dim + k
+    return indices
+
+  def _replicate_rows(tensor, multiple):
+    tensor_shape = tensor.shape.as_list()
+    expanded_shape = [tensor_shape[0] * multiple, tensor_shape[1]]
+    indices = constant_op.constant(_generate_indices(tensor_shape[0], multiple))
+    return array_ops.scatter_nd(indices, _tile_rows(tensor, multiple),
+                                expanded_shape)
+
+  expanded_tensor = tensor
+
+  # Expand rows by factor block_height.
+  if block_height > 1:
+    expanded_tensor = _replicate_rows(tensor, block_height)
+
+  # Transpose and expand by factor block_width. Transpose the result.
+  if block_width > 1:
+    expanded_tensor = array_ops.transpose(
+        _replicate_rows(array_ops.transpose(expanded_tensor), block_width))
+
+  return expanded_tensor
+
+
 def _histogram(values, value_range, nbins=100, dtype=dtypes.int32, name=None):
   """Return histogram of values.
 
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
index ccde5b4e8a..0aca843497 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.model_pruning.python import pruning_utils
@@ -26,6 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -36,27 +38,13 @@ class PruningUtilsTest(test.TestCase):
   def _compare_cdf(self, values):
     abs_values = math_ops.abs(values)
     max_value = math_ops.reduce_max(abs_values)
-    with self.test_session():
+    with self.cached_session():
       variables.global_variables_initializer().run()
       cdf_from_histogram = pruning_utils.compute_cdf_from_histogram(
           abs_values, [0.0, max_value], nbins=pruning_utils._NBINS)
       cdf = pruning_utils.compute_cdf(abs_values, [0.0, max_value])
       self.assertAllEqual(cdf.eval(), cdf_from_histogram.eval())
 
-  def _compare_pooling_methods(self, weights, pooling_kwargs):
-    with self.test_session():
-      variables.global_variables_initializer().run()
-      pooled_weights_tf = array_ops.squeeze(
-          nn_ops.pool(
-              array_ops.reshape(
-                  weights,
-                  [1, weights.get_shape()[0],
-                   weights.get_shape()[1], 1]), **pooling_kwargs))
-      pooled_weights_factorized_pool = pruning_utils.factorized_pool(
-          weights, **pooling_kwargs)
-      self.assertAllClose(pooled_weights_tf.eval(),
-                          pooled_weights_factorized_pool.eval())
-
   def testHistogram(self):
     width = 10
     height = 10
@@ -67,7 +55,7 @@ class PruningUtilsTest(test.TestCase):
         "weights", [width, height], initializer=init)
     histogram = pruning_utils._histogram(
         weights, [0, 1.0], nbins, dtype=np.float32)
-    with self.test_session():
+    with self.cached_session():
       variables.global_variables_initializer().run()
       computed_histogram = histogram.eval()
     self.assertAllEqual(expected_histogram, computed_histogram)
@@ -79,7 +67,7 @@ class PruningUtilsTest(test.TestCase):
     norm_cdf = pruning_utils.compute_cdf_from_histogram(
         abs_weights, [0.0, 5.0], nbins=nbins)
     expected_cdf = np.array([0.1, 0.4, 0.5, 0.6, 1.0], dtype=np.float32)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       variables.global_variables_initializer().run()
       norm_cdf_val = sess.run(norm_cdf)
       self.assertAllEqual(len(norm_cdf_val), nbins)
@@ -95,26 +83,60 @@ class PruningUtilsTest(test.TestCase):
     weights = variable_scope.get_variable("weights", shape=[5, 5, 128, 128])
     self._compare_cdf(weights)
 
-  def testFactorizedAvgPool(self):
+
+@parameterized.named_parameters(
+    ("1x1", [1, 1]), ("4x4", [4, 4]), ("6x6", [6, 6]), ("1x4", [1, 4]),
+    ("4x1", [4, 1]), ("1x8", [1, 8]), ("8x1", [8, 1]))
+class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
+
+  def _compare_pooling_methods(self, weights, pooling_kwargs):
+    with self.cached_session():
+      variables.global_variables_initializer().run()
+      pooled_weights_tf = array_ops.squeeze(
+          nn_ops.pool(
+              array_ops.reshape(
+                  weights,
+                  [1, weights.get_shape()[0],
+                   weights.get_shape()[1], 1]), **pooling_kwargs))
+      pooled_weights_factorized_pool = pruning_utils.factorized_pool(
+          weights, **pooling_kwargs)
+      self.assertAllClose(pooled_weights_tf.eval(),
+                          pooled_weights_factorized_pool.eval())
+
+  def _compare_expand_tensor_with_kronecker_product(self, tensor, block_dim):
+    with self.cached_session() as session:
+      variables.global_variables_initializer().run()
+      expanded_tensor = pruning_utils.expand_tensor(tensor, block_dim)
+      kronecker_product = pruning_utils.kronecker_product(
+          tensor, array_ops.ones(block_dim))
+      expanded_tensor_val, kronecker_product_val = session.run(
+          [expanded_tensor, kronecker_product])
+      self.assertAllEqual(expanded_tensor_val, kronecker_product_val)
+
+  def testFactorizedAvgPool(self, window_shape):
     weights = variable_scope.get_variable("weights", shape=[1024, 2048])
     pooling_kwargs = {
-        "window_shape": [2, 4],
+        "window_shape": window_shape,
         "pooling_type": "AVG",
-        "strides": [2, 4],
+        "strides": window_shape,
         "padding": "SAME"
     }
     self._compare_pooling_methods(weights, pooling_kwargs)
 
-  def testFactorizedMaxPool(self):
+  def testFactorizedMaxPool(self, window_shape):
     weights = variable_scope.get_variable("weights", shape=[1024, 2048])
     pooling_kwargs = {
-        "window_shape": [2, 4],
+        "window_shape": window_shape,
         "pooling_type": "MAX",
-        "strides": [2, 4],
+        "strides": window_shape,
         "padding": "SAME"
     }
     self._compare_pooling_methods(weights, pooling_kwargs)
 
+  def testExpandTensor(self, block_dim):
+    weights = random_ops.random_normal(shape=[1024, 512])
+    self._compare_expand_tensor_with_kronecker_product(weights, block_dim)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py
index 255daa0360..237510cb0c 100644
--- a/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py
+++ b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py
@@ -144,7 +144,7 @@ class StripPruningVarsTest(test.TestCase):
     return outputs
 
   def _get_initial_outputs(self, output_tensor_names_list):
-    with self.test_session(graph=self.initial_graph) as sess1:
+    with self.session(graph=self.initial_graph) as sess1:
       self._prune_model(sess1)
       reference_outputs = self._get_outputs(sess1, self.initial_graph,
                                             output_tensor_names_list)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index 57a96c5d33..7d158cc980 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -12,14 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
-#define TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
+#ifndef TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
+#define TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
 
 #ifdef GOOGLE_CUDA
 
 #include <unordered_map>
 #include <vector>
 
+// TODO(rmlarsen): Get rid of this workaround. "gpu_assert" is defined when
+// setting EIGEN_USE_THREADS. But when defining EIGEN_USE_THREADS here,
+// incAtomic and other CUDA specific symbols are no longer recognized.
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
+
 #include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -128,4 +135,4 @@ class NcclManager {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
+#endif  // TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py b/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
index 54a98e6f14..3aec88bcbf 100644
--- a/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
+++ b/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
@@ -32,7 +32,7 @@ class AlphaDropoutTest(test.TestCase):
   def testAlphaDropout(self):
     x_dim, y_dim = 40, 30
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.test_session():
+      with self.cached_session():
         t = random_ops.random_normal([x_dim, y_dim])
         output = alpha_dropout(t, keep_prob)
         self.assertEqual([x_dim, y_dim], output.get_shape())
diff --git a/tensorflow/contrib/nn/python/ops/fwd_gradients_test.py b/tensorflow/contrib/nn/python/ops/fwd_gradients_test.py
index 56062c3cab..4cdac6a742 100644
--- a/tensorflow/contrib/nn/python/ops/fwd_gradients_test.py
+++ b/tensorflow/contrib/nn/python/ops/fwd_gradients_test.py
@@ -35,7 +35,7 @@ class ForwardAdTest(test.TestCase):
     dydx_tf = fwd_gradients.fwd_gradients([y], [x], [grad_x])[0]
     dydx_py = 2. * grad_x
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllClose(sess.run(dydx_tf), dydx_py, 1e-6)
 
   def testGather(self):
@@ -44,7 +44,7 @@ class ForwardAdTest(test.TestCase):
     y.set_shape([2])
     dydx = fwd_gradients.fwd_gradients([y], [x], assert_unused=True)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(dydx)
 
 
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops_test.py b/tensorflow/contrib/nn/python/ops/sampling_ops_test.py
index 1d4fe1321b..11738bb215 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops_test.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops_test.py
@@ -227,7 +227,7 @@ class RankSampledSoftmaxLossTest(test.TestCase):
           sampled_values=self._resampled_values,
           remove_accidental_hits=self._remove_accidental_hits,
           partition_strategy=partition_strategy)
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         loss_val = sess.run(loss)
         loss_nn_val = sess.run(loss_nn)
 
@@ -299,7 +299,7 @@ class RankSampledSoftmaxLossTest(test.TestCase):
           sampled_values=resampled_values,
           remove_accidental_hits=remove_accidental_hits,
           partition_strategy='div')
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         loss_val = sess.run(loss)
         loss_nn_val = sess.run(loss_nn)
 
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 778b710d78..5319a8b655 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -20,6 +20,7 @@ py_library(
         "python/training/elastic_average_optimizer.py",
         "python/training/external_optimizer.py",
         "python/training/ggt.py",
+        "python/training/lars_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
         "python/training/model_average_optimizer.py",
         "python/training/moving_average_optimizer.py",
@@ -365,3 +366,18 @@ py_test(
         "@absl_py//absl/testing:parameterized",
     ],
 )
+
+py_test(
+    name = "lars_optimizer_test",
+    srcs = ["python/training/lars_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 9471fb0181..ad7d7cfa6e 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -24,12 +24,14 @@ from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
+from tensorflow.contrib.opt.python.training.lars_optimizer import *
 from tensorflow.contrib.opt.python.training.ggt import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
 from tensorflow.contrib.opt.python.training.model_average_optimizer import *
 from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
 from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
 from tensorflow.contrib.opt.python.training.nadam_optimizer import *
+from tensorflow.contrib.opt.python.training.reg_adagrad_optimizer import *
 from tensorflow.contrib.opt.python.training.shampoo import *
 from tensorflow.contrib.opt.python.training.weight_decay_optimizers import *
 from tensorflow.contrib.opt.python.training.powersign import *
@@ -46,6 +48,7 @@ _allowed_symbols = [
     'DelayCompensatedGradientDescentOptimizer',
     'DropStaleGradientOptimizer',
     'ExternalOptimizerInterface',
+    'LARSOptimizer',
     'LazyAdamOptimizer',
     'NadamOptimizer',
     'MovingAverageOptimizer',
@@ -63,6 +66,7 @@ _allowed_symbols = [
     'ModelAverageCustomGetter',
     'GGTOptimizer',
     'ShampooOptimizer',
+    'RegAdagradOptimizer',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index 915e6504e1..61d8b94eca 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -74,7 +74,7 @@ class AdaMaxOptimizerTest(test.TestCase):
 
   def doTestSparse(self, use_resource=False):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)
         m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots()
@@ -142,7 +142,7 @@ class AdaMaxOptimizerTest(test.TestCase):
 
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         repeated_index_update_var = variables.Variable(
             [[1.0], [2.0]], dtype=dtype)
         aggregated_update_var = variables.Variable(
@@ -172,7 +172,7 @@ class AdaMaxOptimizerTest(test.TestCase):
 
   def doTestBasic(self, use_resource=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      with self.test_session(graph=ops.Graph()):
+      with self.session(graph=ops.Graph()):
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -233,7 +233,7 @@ class AdaMaxOptimizerTest(test.TestCase):
                              opt.get_slot(var=var0, name="m").name)
 
   def testBasic(self):
-    with self.test_session():
+    with self.cached_session():
       self.doTestBasic(use_resource=False)
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
@@ -242,7 +242,7 @@ class AdaMaxOptimizerTest(test.TestCase):
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -278,7 +278,7 @@ class AdaMaxOptimizerTest(test.TestCase):
 
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
index 5763593b81..bbafd59aae 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -17,22 +17,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-
-from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import optimizer
+from tensorflow.python.training import saver
 from tensorflow.python.training import session_run_hook
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import constant_op
 
 LOCAL_VARIABLE_NAME = 'local_center_variable'
 GLOBAL_VARIABLE_NAME = 'global_center_variable'
+GLOBAL_STEP = 'global_step'
 
 
 class ElasticAverageCustomGetter(object):
@@ -52,16 +53,32 @@ class ElasticAverageCustomGetter(object):
   with tf.device(
     tf.train.replica_device_setter(
       worker_device=worker_device,
-      ps_device="/job:ps/cpu:0",
+      ps_device="/job:ps",
       cluster=cluster)),
     tf.variable_scope('',custom_getter=ea_custom_getter):
-    hid_w = tf.get_variable(
-      initializer=tf.truncated_normal(
-          [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
-          stddev=1.0 / IMAGE_PIXELS),
-      name="hid_w")
-    hid_b = tf.get_variable(initializer=tf.zeros([FLAGS.hidden_units]),
-                            name="hid_b")
+    ...
+    create your model here
+    ...
+    with tf.device(worker_device):
+      opt = tf.train.MomentumOptimizer(...)
+      optimizer = ElasticAverageOptimizer(
+            opt,
+            num_worker=2,
+            moving_rate=0.01, # or use default value
+            communication_period=20,
+            ea_custom_getter=ea_custom_getter)
+      ...
+      train_op = optimizer.apply_gradients(
+        grads_vars,
+        global_step=global_step)
+    ...
+    hooks = [optimizer.make_session_run_hook(is_chief, task_index)]
+    ...
+    with tf.train.MonitoredTrainingSession(master=server.target,
+                                           is_chief=is_chief,
+                                           checkpoint_dir=("...),
+                                           save_checkpoint_secs=600,
+                                           hooks=hooks) as mon_sess:
   """
 
   def __init__(self, worker_device):
@@ -83,24 +100,40 @@ class ElasticAverageCustomGetter(object):
             collections=[ops.GraphKeys.LOCAL_VARIABLES],
             *args,
             **kwargs)
-      global_center_variable = variable_scope.variable(
+      if kwargs['reuse'] == True:
+        return local_var
+      global_center_variable = getter(
           name='%s/%s' % (GLOBAL_VARIABLE_NAME, name),
-          initial_value=local_var.initialized_value(),
           trainable=False,
-          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+          *args,
+          **kwargs)
 
       with ops.device(self._worker_device):
-        local_center_variable = variable_scope.variable(
+        local_center_variable = getter(
             name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
-            initial_value=local_var.initialized_value(),
             trainable=False,
-            collections=[ops.GraphKeys.LOCAL_VARIABLES])
-
-      self._local_map[local_var] = local_center_variable
-      self._global_map[local_var] = global_center_variable
+            collections=[ops.GraphKeys.LOCAL_VARIABLES],
+            *args,
+            **kwargs)
+      if kwargs['partitioner'] is None:
+        self._local_map[local_var] = local_center_variable
+        self._global_map[local_var] = global_center_variable
+      else:
+        v_list = list(local_var)
+        for i in range(len(v_list)):
+          self._local_map[v_list[i]] \
+              = list(local_center_variable)[i]
+          self._global_map[v_list[i]] \
+              = list(global_center_variable)[i]
       return local_var
     else:
-      return getter(name, trainable, collections, *args, **kwargs)
+      return getter(
+          name,
+          trainable=trainable,
+          collections=collections,
+          *args,
+          **kwargs)
 
 
 class ElasticAverageOptimizer(optimizer.Optimizer):
@@ -125,6 +158,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
                moving_rate=None,
                rho=None,
                use_locking=True,
+               synchronous=False,
                name='ElasticAverageOptimizer'):
     """Construct a new gradient descent optimizer.
 
@@ -136,9 +170,16 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
       communication_period: An int point value to controls the frequency
         of the communication between every worker and the ps.
       moving_rate: A floating point value to control the elastic difference.
-      rho: the amount of exploration we allow ine the model. The default
+      rho: the amount of exploration we allow in the model. The default
         value is moving_rate/learning_rate
+        rho=0.0 is suggested in async mode.
       use_locking: If True use locks for update operations.
+      synchronous: Add_sync_queues_and_barrier or not.
+              True: all workers will wait for each other before start training
+              False: worker can start training when its initilization is done,
+                     no need to wait for everyone is ready.
+                     in case one worker is restarted, it can join and continue
+                     training without being blocked.
       name: Optional name prefix for the operations created when applying
         gradients. Defaults to "ElasticAverageOptimizer".
     """
@@ -148,6 +189,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     self._period = communication_period
     self._local_map = ea_custom_getter._local_map
     self._global_map = ea_custom_getter._global_map
+    self._synchronous = synchronous
 
     if moving_rate is None:
       self._moving_rate = self.BETA / communication_period / num_worker
@@ -241,11 +283,29 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
       TypeError: If `grads_and_vars` is malformed.
       ValueError: If none of the variables have gradients.
     """
+    global_old = set(n.op.name for n in variables.global_variables())
     apply_updates = self._opt.apply_gradients(grads_and_vars)
+    global_new = set(n.op.name for n in variables.global_variables())
     with ops.control_dependencies([apply_updates]):
       local_update = state_ops.assign_add(
           self._local_step, 1, name='local_step_update').op
 
+    # this is for place the variables created by optimizer to local collection
+    # e.g., AdamOptimizer will create beta as global variables
+    def _adjust_optimizer_variable_collection(opt_vars):
+      g = ops.get_default_graph()
+      idx = 0
+      for _ in range(len(g._collections[ops.GraphKeys.GLOBAL_VARIABLES])):
+        var = g.get_collection_ref(ops.GraphKeys.GLOBAL_VARIABLES)[idx]
+        name = var.op.name
+        if name in opt_vars:
+          ops.add_to_collection(ops.GraphKeys.LOCAL_VARIABLES, var)
+          del g.get_collection_ref(ops.GraphKeys.GLOBAL_VARIABLES)[idx]
+        else:
+          idx += 1
+
+    _adjust_optimizer_variable_collection(global_new - global_old)
+
     # update global variables.
     def _Update_global_variables():
       local_vars = [v for g, v in grads_and_vars if g is not None]
@@ -290,7 +350,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     variables equal to the global center variables before the training begins"""
 
     def _Add_sync_queues_and_barrier(enqueue_after_list):
-      """Adds ops to enqueu on all worker queues"""
+      """Adds ops to enqueue on all worker queues"""
       sync_queues = [
           data_flow_ops.FIFOQueue(
               self._num_worker, [dtypes.bool],
@@ -324,6 +384,9 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
       init_ops.append(state_ops.assign(lc_var, gc_var))
 
     init_op = control_flow_ops.group(*(init_ops))
+    if self._synchronous == False:
+      return init_op
+
     sync_queue_op = _Add_sync_queues_and_barrier([init_op])
     return sync_queue_op
 
@@ -331,6 +394,51 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     """Creates a hook to handle ElasticAverageOptimizerHook ops such as initialization."""
     return _ElasticAverageOptimizerHook(self, is_chief, task_index)
 
+  def swapping_saver(self, var_list=None, name='swapping_saver', **kwargs):
+    """Create a saver copy global_center_variable to trainable variables
+    Please call this function after all your variables created with
+    ElasticAverageCustomGetter. For evaluations or inference, use this saver
+    during training.  It will save the global_center_variable of the trained
+    parameters under the original parameter names.
+    Args:
+      var_list: List of variables to save, as per `Saver()`.
+                If set to None, save all the trainable_variables that have
+                been created before this call.
+      name: The name of the saver.
+      **kwargs: Keyword arguments of `Saver()`.
+    Returns:
+      A `tf.train.Saver` object.
+    Raises:
+      RuntimeError: global_center_variable is empty, please make sure
+                    this is called after model created and
+                    ElasticAverageCustomGetter is used when declaring you model
+    """
+    if not self._global_map:
+      raise RuntimeError('global_center_variable is empty, please make sure '
+                         'this is called after model created and '
+                         'ElasticAverageCustomGetter is used when declaring '
+                         'you model')
+
+    if var_list is None:
+      var_list = variables.trainable_variables()
+    if not isinstance(var_list, dict):
+      var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
+
+    swapped_var_list = {}
+    for key, var in var_list.items():
+      tensor = var
+
+      if not isinstance(var, list):
+        for tvar in variables.trainable_variables():
+          if tvar.op.name == var.op.name:
+            tensor = self._global_map.get(tvar, var)
+            break
+      else: #partitioned variable
+        tensor = [self._global_map.get(lvar, lvar) for lvar in var]
+
+      swapped_var_list[key] = tensor
+
+    return saver.Saver(swapped_var_list, name=name, **kwargs)
 
 class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
 
@@ -351,3 +459,7 @@ class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
     if self._is_chief:
       self._global_init_op = variables.global_variables_initializer()
     self._variable_init_op = self._ea_optimizer.get_init_op(self._task_index)
+
+  def after_create_session(self, session, coord):
+    """Run initialization ops"""
+    session.run(self._variable_init_op)
+\ No newline at end of file
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
index 5ed8057b86..5bf6a08de1 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
@@ -17,17 +17,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import portpicker
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import device_setter
 from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import device_setter
 
 from tensorflow.contrib.opt.python.training.elastic_average_optimizer import \
   ElasticAverageOptimizer, ElasticAverageCustomGetter, GLOBAL_VARIABLE_NAME
@@ -59,29 +64,49 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
 
 # Creates the workers and return their sessions, graphs, train_ops.
 # Chief worker will update at last
-def _get_workers(num_workers, period, workers, moving_rate):
+def _get_workers(num_workers, period, workers, moving_rate, num_ps=1):
   sessions = []
   graphs = []
   train_ops = []
+  savers = []
   for worker_id in range(num_workers):
     graph = ops.Graph()
     is_chief = (worker_id == 0)
     with graph.as_default():
       worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
-      ea_coustom = ElasticAverageCustomGetter(worker_device=worker_device)
+      ea_custom = ElasticAverageCustomGetter(worker_device=worker_device)
       with variable_scope.variable_scope(
-          "", custom_getter=ea_coustom), ops.device(
+          "", custom_getter=ea_custom), ops.device(
               device_setter.replica_device_setter(
                   worker_device=worker_device,
                   ps_device="/job:ps/task:0/cpu:0",
                   ps_tasks=1)):
-        global_step = variables.Variable(0, name="global_step", trainable=False)
+        global_step = training_util.get_or_create_global_step()
         var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
         var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
+      if num_ps > 1:
+        with variable_scope.variable_scope(
+            "",
+            partitioner=partitioned_variables.fixed_size_partitioner(
+                num_ps, axis=0),
+            custom_getter=ea_custom), ops.device(
+                device_setter.replica_device_setter(
+                    worker_device=worker_device,
+                    ps_device="/job:ps/task:0/cpu:0",
+                    ps_tasks=num_ps)):
+
+          partition_var = variable_scope.get_variable(
+              'partition_var',
+              shape=[2, 4],
+              initializer=init_ops.ones_initializer)
+          part_0 = list(partition_var)[0]
+          part_1 = list(partition_var)[1]
 
       with ops.device("/job:worker/task:" + str(worker_id)):
         grads_0 = constant_op.constant(-1.0)
         grads_1 = constant_op.constant(-1.0)
+        grads_part_0 = constant_op.constant([[-1., -1., -1., -1.]])
+        grads_part_1 = constant_op.constant([[-1., -1., -1., -1.]])
 
         sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
         opt = ElasticAverageOptimizer(
@@ -89,12 +114,22 @@ def _get_workers(num_workers, period, workers, moving_rate):
             num_worker=num_workers,
             moving_rate=moving_rate,
             communication_period=period,
-            ea_custom_getter=ea_coustom)
-        train_op = [
-            opt.apply_gradients(([grads_0, var_0], [grads_1, var_1]),
-                                global_step)
-        ]
+            ea_custom_getter=ea_custom)
+        if num_ps == 1:
+          train_op = [
+              opt.apply_gradients(([grads_0, var_0], [grads_1, var_1]),
+                                  global_step)
+          ]
+        else:
+          train_op = [
+              opt.apply_gradients(([grads_0, var_0],
+                                   [grads_1, var_1],
+                                   [grads_part_0, part_0],
+                                   [grads_part_1, part_1]),
+                                  global_step)
+          ]
         easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
+        saver = opt.swapping_saver()
       # Creates MonitoredSession
       sess = training.MonitoredTrainingSession(
           workers[worker_id].target, hooks=[easgd_hook])
@@ -102,8 +137,9 @@ def _get_workers(num_workers, period, workers, moving_rate):
     sessions.append(sess)
     graphs.append(graph)
     train_ops.append(train_op)
+    savers.append(saver)
 
-  return sessions, graphs, train_ops
+  return sessions, graphs, train_ops, savers
 
 
 class ElasticAverageOptimizerTest(test.TestCase):
@@ -118,7 +154,7 @@ class ElasticAverageOptimizerTest(test.TestCase):
     cluster, workers, _ = create_local_cluster(
         num_workers=num_workers, num_ps=num_ps)
 
-    sessions, graphs, train_ops = _get_workers(
+    sessions, graphs, train_ops, savers = _get_workers(
         num_workers, communication_period, workers, 1.0)
 
     var_0 = graphs[0].get_tensor_by_name("v0:0")
@@ -158,6 +194,21 @@ class ElasticAverageOptimizerTest(test.TestCase):
     self.assertAllEqual(2.0, sessions[0].run(var_0_g))
     self.assertAllEqual(3.0, sessions[0].run(var_1_g))
     self.assertAllEqual(1, sessions[0].run(global_step))
+    sessions[0].run(train_ops[0])
+
+    # save, data will be global value
+    outfile = os.path.join(test.get_temp_dir(), "model")
+    savers[0].save(sessions[0]._sess._sess._sess._sess,
+                   save_path=outfile)
+    ops.reset_default_graph()   # restore on a new graph
+    with session.Session() as sess:
+      v0 = variable_scope.get_variable(initializer=0.0, name="v0")
+      v1 = variable_scope.get_variable(initializer=1.0, name="v1")
+      sess.run(variables.local_variables_initializer())
+      saver_opt = saver.Saver(var_list=[v1, v0])
+      saver_opt.restore(sess, outfile)
+      self.assertAllEqual(2.0, sess.run(v0))
+      self.assertAllEqual(3.0, sess.run(v1))
 
   def test2Worker1Period(self):
     num_workers = 2
@@ -166,8 +217,8 @@ class ElasticAverageOptimizerTest(test.TestCase):
     cluster, workers, _ = create_local_cluster(
         num_workers=num_workers, num_ps=num_ps)
 
-    sessions, graphs, train_ops = _get_workers(
-        num_workers, communication_period, workers, 0.5)
+    sessions, graphs, train_ops, savers = _get_workers(
+        num_workers, communication_period, workers, 0.5, num_ps=2)
 
     var_0 = graphs[0].get_tensor_by_name("v0:0")
     var_1 = graphs[0].get_tensor_by_name("v1:0")
@@ -177,6 +228,9 @@ class ElasticAverageOptimizerTest(test.TestCase):
 
     var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
     var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+    part_0_g = graphs[0].get_tensor_by_name(
+        GLOBAL_VARIABLE_NAME + "/partition_var/part_0:0")
+
     # Verify the initialized value.
     self.assertAllEqual(0.0, sessions[0].run(var_0))
     self.assertAllEqual(1.0, sessions[0].run(var_1))
@@ -194,22 +248,45 @@ class ElasticAverageOptimizerTest(test.TestCase):
     self.assertAllEqual(1.75, sessions[0].run(var_1_g))
     self.assertAllEqual(0.75, sessions[1].run(var_0_1))
     self.assertAllEqual(1.75, sessions[1].run(var_1_1))
+    # part_0 of global_center copy
+    part_0_g = sessions[0].run(part_0_g)
+
+    outfile = os.path.join(test.get_temp_dir(), "model")
+    savers[0].save(sessions[0]._sess._sess._sess._sess,
+                   save_path=outfile)
+
+    # verify restore of partitioned_variables
+    ops.reset_default_graph()   # restore on a new graph
+    g = ops.get_default_graph()
+    with session.Session() as sess, g.as_default():
+      with variable_scope.variable_scope(
+          "",
+          partitioner=partitioned_variables.fixed_size_partitioner(
+              num_ps, axis=0)):
+        partition_var = variable_scope.get_variable(
+            'partition_var',
+            shape=[2, 4],
+            initializer=init_ops.ones_initializer)
+      s = saver.Saver(var_list=[partition_var])
+      s.restore(sess, outfile)
+      part_0 = g.get_tensor_by_name('partition_var/part_0:0')
+      self.assertAllEqual(part_0_g, sess.run(part_0))
 
   def testPS2TasksWithClusterSpecClass(self):
     cluster_spec = server_lib.ClusterSpec({
         "ps": ["ps0:2222", "ps1:2222"],
         "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     })
-    ea_coustom = ElasticAverageCustomGetter(worker_device="/job:worker/task:0")
+    ea_custom = ElasticAverageCustomGetter(worker_device="/job:worker/task:0")
     from tensorflow.python.training import device_setter
     with ops.device(
         device_setter.replica_device_setter(cluster=cluster_spec,
                                             worker_device="/job:worker/task:0",
                                             ps_device="/job:ps")), \
-         variable_scope.variable_scope("", custom_getter=ea_coustom):
+        variable_scope.variable_scope("", custom_getter=ea_custom):
       v = variable_scope.get_variable(initializer=[1, 2], name="v")
       w = variable_scope.get_variable(initializer=[2, 1], name="w")
-      v_g, w_g = ea_coustom._global_map[v], ea_coustom._global_map[w]
+      v_g, w_g = ea_custom._global_map[v], ea_custom._global_map[w]
       self.assertDeviceEqual("/job:worker/task:0", v.device)
       self.assertDeviceEqual("job:ps/task:0", v_g.device)
       self.assertDeviceEqual("/job:worker/task:0", w.device)
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
index 953586ee70..9997103016 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
@@ -85,7 +85,7 @@ class ExternalOptimizerInterfaceTest(TestCase):
 
     optimizer = MockOptimizerInterface(loss)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
 
       optimizer.minimize(sess)
@@ -107,7 +107,7 @@ class ExternalOptimizerInterfaceTest(TestCase):
 
     optimizer = MockOptimizerInterface(loss)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
 
       initial_vector_val = sess.run(vector)
@@ -164,7 +164,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
     optimizer = external_optimizer.ScipyOptimizerInterface(
         self._objective(x), method=method, options=options)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       optimizer.minimize(sess)
 
@@ -176,7 +176,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
     x = variables.Variable(array_ops.zeros(dimension))
     optimizer = external_optimizer.ScipyOptimizerInterface(self._objective(x))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       optimizer.minimize(sess)
 
@@ -242,7 +242,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
     optimizer = external_optimizer.ScipyOptimizerInterface(
         loss, equalities=equalities, inequalities=inequalities, method='SLSQP')
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       optimizer.minimize(sess)
       self.assertAllClose(np.ones(2), sess.run(vector))
@@ -260,7 +260,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
     optimizer = external_optimizer.ScipyOptimizerInterface(
         loss, var_to_bounds=var_to_bounds)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       optimizer.minimize(sess)
       self.assertAllClose(np.ones(2), sess.run(vector))
@@ -277,7 +277,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
     optimizer = external_optimizer.ScipyOptimizerInterface(
         loss, var_to_bounds=var_to_bounds)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       optimizer.minimize(sess)
       self.assertAllClose([0., 2.], sess.run(vector))
@@ -293,7 +293,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
     optimizer = external_optimizer.ScipyOptimizerInterface(
         loss, method='SLSQP')
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       optimizer.minimize(sess)
       method = optimizer.optimizer_kwargs.get('method')
@@ -312,7 +312,7 @@ class ScipyOptimizerInterfaceTest(TestCase):
 
     optimizer = external_optimizer.ScipyOptimizerInterface(loss, method='SLSQP')
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
 
       initial_vector_val = sess.run(vector)
diff --git a/tensorflow/contrib/opt/python/training/ggt_test.py b/tensorflow/contrib/opt/python/training/ggt_test.py
index 42162960b0..1775edabb3 100644
--- a/tensorflow/contrib/opt/python/training/ggt_test.py
+++ b/tensorflow/contrib/opt/python/training/ggt_test.py
@@ -76,7 +76,7 @@ class GGTOptimizerTest(test.TestCase):
   def doTestBasic(self, use_resource=False):
     # SVD does not support float16
     for i, dtype in enumerate([dtypes.float32, dtypes.float64]):
-      with self.test_session(graph=ops.Graph()):
+      with self.session(graph=ops.Graph()):
         # Initialize variables for numpy implementation.
         m0 = 0.0
         window = 3
@@ -171,7 +171,7 @@ class GGTOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testBasic(self):
-    with self.test_session():
+    with self.cached_session():
       self.doTestBasic(use_resource=False)
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer.py b/tensorflow/contrib/opt/python/training/lars_optimizer.py
new file mode 100644
index 0000000000..a8dafd9a4c
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/lars_optimizer.py
@@ -0,0 +1,164 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Layer-wise Adaptive Rate Scaling optimizer for large-batch training."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class LARSOptimizer(optimizer.Optimizer):
+  """Layer-wise Adaptive Rate Scaling for large batch training.
+
+  Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
+  I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
+
+  Implements the LARS learning rate scheme presented in the paper above. This
+  optimizer is useful when scaling the batch size to up to 32K without
+  significant performance degradation. It is recommended to use the optimizer
+  in conjunction with:
+      - Gradual learning rate warm-up
+      - Linear learning rate scaling
+      - Poly rule learning rate decay
+
+  Note, LARS scaling is currently only enabled for dense tensors. Sparse tensors
+  use the default momentum optimizer.
+  """
+
+  def __init__(
+      self,
+      learning_rate,
+      momentum=0.9,
+      weight_decay=0.0001,
+      # The LARS coefficient is a hyperparameter
+      eeta=0.001,
+      epsilon=0.0,
+      name="LARSOptimizer",
+      # Enable skipping variables from LARS scaling.
+      # TODO(sameerkm): Enable a direct mechanism to pass a
+      # subset of variables to the optimizer.
+      skip_list=None,
+      use_nesterov=False):
+    """Construct a new LARS Optimizer.
+
+    Args:
+      learning_rate: A `Tensor` or floating point value. The base learning rate.
+      momentum: A floating point value. Momentum hyperparameter.
+      weight_decay: A floating point value. Weight decay hyperparameter.
+      eeta: LARS coefficient as used in the paper. Dfault set to LARS
+        coefficient from the paper. (eeta / weight_decay) determines the highest
+        scaling factor in LARS.
+      epsilon: Optional epsilon parameter to be set in models that have very
+        small gradients. Default set to 0.0.
+      name: Optional name prefix for variables and ops created by LARSOptimizer.
+      skip_list: List of strings to enable skipping variables from LARS scaling.
+        If any of the strings in skip_list is a subset of var.name, variable
+        'var' is skipped from LARS scaling. For a typical classification model
+        with batch normalization, the skip_list is ['batch_normalization',
+        'bias']
+      use_nesterov: when set to True, nesterov momentum will be enabled
+
+    Raises:
+      ValueError: If a hyperparameter is set to a non-sensical value.
+    """
+    if momentum < 0.0:
+      raise ValueError("momentum should be positive: %s" % momentum)
+    if weight_decay < 0.0:
+      raise ValueError("weight_decay should be positive: %s" % weight_decay)
+    super(LARSOptimizer, self).__init__(use_locking=False, name=name)
+
+    self._learning_rate = learning_rate
+    self._momentum = momentum
+    self._weight_decay = weight_decay
+    self._eeta = eeta
+    self._epsilon = epsilon
+    self._name = name
+    self._skip_list = skip_list
+    self._use_nesterov = use_nesterov
+
+  def _create_slots(self, var_list):
+    for v in var_list:
+      self._zeros_slot(v, "momentum", self._name)
+
+  def compute_lr(self, grad, var):
+    scaled_lr = self._learning_rate
+    if self._skip_list is None or not any(v in var.name
+                                          for v in self._skip_list):
+      w_norm = linalg_ops.norm(var, ord=2)
+      g_norm = linalg_ops.norm(grad, ord=2)
+      trust_ratio = array_ops.where(
+          math_ops.greater(w_norm, 0),
+          array_ops.where(
+              math_ops.greater(g_norm, 0),
+              (self._eeta * w_norm /
+               (g_norm + self._weight_decay * w_norm + self._epsilon)), 1.0),
+          1.0)
+      scaled_lr = self._learning_rate * trust_ratio
+    return scaled_lr
+
+  def _apply_dense(self, grad, var):
+    scaled_lr = self.compute_lr(grad, var)
+    mom = self.get_slot(var, "momentum")
+    return training_ops.apply_momentum(
+        var,
+        mom,
+        scaled_lr,
+        grad,
+        self._momentum,
+        use_locking=False,
+        use_nesterov=self._use_nesterov)
+
+  def _resource_apply_dense(self, grad, var):
+    scaled_lr = self.compute_lr(grad, var)
+    mom = self.get_slot(var, "momentum")
+    return training_ops.resource_apply_momentum(
+        var.handle,
+        mom.handle,
+        scaled_lr,
+        grad,
+        self._momentum,
+        use_locking=False,
+        use_nesterov=self._use_nesterov)
+
+  # Fallback to momentum optimizer for sparse tensors
+  def _apply_sparse(self, grad, var):
+    mom = self.get_slot(var, "momentum")
+    return training_ops.sparse_apply_momentum(
+        var,
+        mom,
+        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    mom = self.get_slot(var, "momentum")
+    return training_ops.resource_sparse_apply_momentum(
+        var.handle,
+        mom.handle,
+        math_ops.cast(self._learning_rate_tensor, grad.dtype),
+        grad,
+        indices,
+        math_ops.cast(self._momentum_tensor, grad.dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov)
diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer_test.py b/tensorflow/contrib/opt/python/training/lars_optimizer_test.py
new file mode 100644
index 0000000000..b76db763da
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/lars_optimizer_test.py
@@ -0,0 +1,127 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0. Licensed to the Apache
+# Software Foundation. You may not use this file except in compliance with the
+# License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for Layer-wise Adaptive Rate Scaling optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import lars_optimizer as lo
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class LARSOptimizerTest(test.TestCase):
+
+  def testLARSGradientOneStep(self):
+    for _ in range(10):
+      for dtype in [dtypes.float32, dtypes.float64]:
+        with self.cached_session() as sess:
+          shape = [3, 3]
+          var_np = np.ones(shape)
+          grad_np = np.ones(shape)
+          lr_np = 0.1
+          m_np = 0.9
+          wd_np = 0.1
+          ep_np = 1e-5
+          eeta = 0.1
+          vel_np = np.zeros(shape)
+
+          var = variables.Variable(var_np, dtype=dtype)
+          grad = variables.Variable(grad_np, dtype=dtype)
+          opt = lo.LARSOptimizer(
+              learning_rate=lr_np,
+              momentum=m_np,
+              weight_decay=wd_np,
+              eeta=eeta,
+              epsilon=ep_np)
+
+          step = opt.apply_gradients([(grad, var)])
+          variables.global_variables_initializer().run()
+
+          pre_var = sess.run(var)
+          pre_vel = sess.run(opt.get_slot(var, 'momentum'))
+          self.assertAllClose(var_np, pre_var)
+          self.assertAllClose(vel_np, pre_vel)
+
+          step.run()
+          post_var = sess.run(var)
+          post_vel = sess.run(opt.get_slot(var, 'momentum'))
+
+          w_norm = np.linalg.norm(var_np.flatten(), ord=2)
+          g_norm = np.linalg.norm(grad_np.flatten(), ord=2)
+          trust_ratio = eeta * w_norm / (g_norm + wd_np * w_norm + ep_np)
+          scaled_lr = lr_np * trust_ratio
+
+          vel_np = m_np * vel_np + grad_np
+          var_np -= scaled_lr * vel_np
+
+          self.assertAllClose(var_np, post_var)
+          self.assertAllClose(vel_np, post_vel)
+
+  def testLARSGradientMultiStep(self):
+    for _ in range(10):
+      for dtype in [dtypes.float32, dtypes.float64]:
+        with self.cached_session() as sess:
+          shape = [3, 3]
+          var_np = np.ones(shape)
+          grad_np = np.ones(shape)
+          lr_np = 0.1
+          m_np = 0.9
+          wd_np = 0.1
+          ep_np = 1e-5
+          eeta = 0.1
+          vel_np = np.zeros(shape)
+
+          var = variables.Variable(var_np, dtype=dtype)
+          grad = variables.Variable(grad_np, dtype=dtype)
+          opt = lo.LARSOptimizer(
+              learning_rate=lr_np,
+              momentum=m_np,
+              eeta=eeta,
+              weight_decay=wd_np,
+              epsilon=ep_np)
+
+          step = opt.apply_gradients([(grad, var)])
+          variables.global_variables_initializer().run()
+
+          pre_var = sess.run(var)
+          pre_vel = sess.run(opt.get_slot(var, 'momentum'))
+          self.assertAllClose(var_np, pre_var)
+          self.assertAllClose(vel_np, pre_vel)
+
+          for _ in range(10):
+            step.run()
+
+            post_var = sess.run(var)
+            post_vel = sess.run(opt.get_slot(var, 'momentum'))
+
+            w_norm = np.linalg.norm(var_np.flatten(), ord=2)
+            g_norm = np.linalg.norm(grad_np.flatten(), ord=2)
+            trust_ratio = eeta * w_norm / (g_norm + wd_np * w_norm + ep_np)
+            scaled_lr = lr_np * trust_ratio
+
+            vel_np = m_np * vel_np + grad_np
+            var_np -= scaled_lr * vel_np
+
+            self.assertAllClose(var_np, post_var)
+            self.assertAllClose(vel_np, post_vel)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer_test.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer_test.py
index a16857db7d..dc4c462ce4 100644
--- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer_test.py
@@ -53,7 +53,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def testSparse(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -109,7 +109,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         repeated_index_update_var = variables.Variable(
             [[1.0], [2.0]], dtype=dtype)
         aggregated_update_var = variables.Variable(
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index ac04ad9911..f22e724528 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -46,7 +46,7 @@ class MovingAverageOptimizerTest(test.TestCase):
   def _helpTestRun(self, use_resource=False):
     for sequential_update in [True, False]:
       for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-        with self.test_session(graph=ops.Graph()) as sess:
+        with self.session(graph=ops.Graph()) as sess:
           orig_val0 = [1.0, 2.0]
           orig_val1 = [3.0, 4.0]
           var0 = variable_scope.get_variable(
@@ -165,7 +165,7 @@ class MovingAverageOptimizerTest(test.TestCase):
             self.assertLess(avg_val1[i], orig_val1[i])
 
   def testFailWhenSaverCreatedBeforeInitialized(self):
-    with self.test_session():
+    with self.cached_session():
       var = variables.Variable([1.0], name='var', dtype=dtypes.float32)
       opt = moving_average_optimizer.MovingAverageOptimizer(
           gradient_descent.GradientDescentOptimizer(learning_rate=2.0))
@@ -187,7 +187,7 @@ class MovingAverageOptimizerTest(test.TestCase):
         self.apply_gradients_called = True
         return super(WrapperOptimizer, self).apply_gradients(*args, **kwargs)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       var = variables.Variable([1.2], name='var', dtype=dtypes.float32)
       loss = var ** 2
       wrapper_opt = WrapperOptimizer(learning_rate=2.0)
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
index 618d8eb18d..904aa9ab13 100644
--- a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
@@ -34,7 +34,7 @@ class MultitaskOptimizerWrapperTest(test.TestCase):
   """
 
   def testWrapper(self):
-    with self.test_session():
+    with self.cached_session():
       var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
       var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
       grads0 = constant_op.constant([0.1, 0.1], dtype=dtypes.float32)
@@ -92,7 +92,7 @@ class MultitaskOptimizerWrapperTest(test.TestCase):
           self.evaluate(slot1))
 
   def testGradientClipping(self):
-    with self.test_session():
+    with self.cached_session():
       var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
       var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
       var2 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
index 825c08a09a..85e05ce71c 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
@@ -53,7 +53,7 @@ class NadamOptimizerTest(test.TestCase):
 
   def doTestSparse(self, use_resource=False):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -106,7 +106,7 @@ class NadamOptimizerTest(test.TestCase):
 
   def doTestBasic(self, use_resource=False):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
diff --git a/tensorflow/contrib/opt/python/training/powersign.py b/tensorflow/contrib/opt/python/training/powersign.py
index 828f3c51c9..b4aa19264d 100644
--- a/tensorflow/contrib/opt/python/training/powersign.py
+++ b/tensorflow/contrib/opt/python/training/powersign.py
@@ -65,7 +65,7 @@ class PowerSignOptimizer(optimizer.Optimizer):
     Example usage for PowerSign-cd (PowerSign with cosine sign decay)
     ```
     decay_steps = 1000
-    linear_decay_fn = sign_decays.get_linear_decay_fn(decay_steps)
+    linear_decay_fn = sign_decays.get_cosine_decay_fn(decay_steps)
     opt = PowerSignOptimizer(learning_rate=0.1, sign_decay_fn=linear_decay_fn)
     ```
 
diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py
index ea56e1646a..c09e2ac76d 100644
--- a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py
@@ -36,7 +36,7 @@ class RegAdagradOptimizerTest(test.TestCase):
 
   def doTestBasic(self, use_locking=False, use_resource=False):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         if use_resource:
           var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
           var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
@@ -73,7 +73,7 @@ class RegAdagradOptimizerTest(test.TestCase):
 
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable(
             [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
@@ -92,7 +92,7 @@ class RegAdagradOptimizerTest(test.TestCase):
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -116,7 +116,7 @@ class RegAdagradOptimizerTest(test.TestCase):
 
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
         var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
         grads0 = ops.IndexedSlices(
@@ -144,7 +144,7 @@ class RegAdagradOptimizerTest(test.TestCase):
 
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         repeated_index_update_var = variables.Variable(
             [[1.0], [2.0]], dtype=dtype)
         aggregated_update_var = variables.Variable([[1.0], [2.0]], dtype=dtype)
@@ -170,7 +170,7 @@ class RegAdagradOptimizerTest(test.TestCase):
 
   def testSparseRepeatedIndicesResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var_repeated = resource_variable_ops.ResourceVariable(
             [1.0, 2.0], dtype=dtype)
         loss_repeated = math_ops.reduce_sum(
@@ -194,7 +194,7 @@ class RegAdagradOptimizerTest(test.TestCase):
 
   def testSparseStability(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         shape = [1, 6]
         var0 = variables.Variable(
             [[
@@ -230,7 +230,7 @@ class RegAdagradOptimizerTest(test.TestCase):
 
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -263,7 +263,7 @@ class RegAdagradOptimizerTest(test.TestCase):
             np.array([2.715679168701172, 3.715679168701172]), var1.eval())
 
   def testDynamicShapeVariable_Ok(self):
-    with self.test_session():
+    with self.cached_session():
       v = variable_scope.get_variable(
           "v", initializer=constant_op.constant(1.), validate_shape=False)
       self.assertFalse(v.shape.is_fully_defined())
@@ -274,7 +274,7 @@ class RegAdagradOptimizerTest(test.TestCase):
   def testSkipUpdatingSlots(self):
     iav = 0.130005  # A value that works with float16
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -306,7 +306,7 @@ class RegAdagradOptimizerTest(test.TestCase):
   def testSparseSkipUpdatingSlots(self):
     iav = 0.130005  # A value that works with float16
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
         var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
         grads0 = ops.IndexedSlices(
diff --git a/tensorflow/contrib/opt/python/training/shampoo_test.py b/tensorflow/contrib/opt/python/training/shampoo_test.py
index 2e0a202ae2..b3688ab181 100644
--- a/tensorflow/contrib/opt/python/training/shampoo_test.py
+++ b/tensorflow/contrib/opt/python/training/shampoo_test.py
@@ -52,7 +52,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np = np.random.rand(size)
     grad_np_2 = np.random.rand(size)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -103,7 +103,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np = np.random.rand(size[0], size[1])
     grad_np_2 = np.random.rand(size[0], size[1])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -162,7 +162,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np = np.random.rand(size[0], size[1], size[2])
     grad_np_2 = np.random.rand(size[0], size[1], size[2])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -240,7 +240,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np = np.random.rand(size)
     grad_np_2 = np.random.rand(size)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -294,7 +294,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     grad_np = np.random.rand(size[0], size[1])
     grad_np_2 = np.random.rand(size[0], size[1])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -365,7 +365,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
                                               replace=False))
     grad_np_2 = np.random.rand(sample_size_2, size[1])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -445,7 +445,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
                                             replace=False))
     grad_np = np.random.rand(sample_size, size[1], size[2])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -512,7 +512,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     gbar_decay = 0.9
     gbar_weight = 0.1
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -601,7 +601,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     mat_g3_a = np.eye(size[2])
     mat_g3 = np.zeros_like(mat_g3_a)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
@@ -672,7 +672,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
     mat_g3_a = np.eye(size[2])
     mat_g3 = np.zeros_like(mat_g3_a)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       global_step = variables.Variable(
           0, dtype=dtypes.int64, use_resource=use_resource_var)
       var = variables.Variable(
diff --git a/tensorflow/contrib/opt/python/training/sign_decay_test.py b/tensorflow/contrib/opt/python/training/sign_decay_test.py
index c31cb924ea..3a84789afd 100644
--- a/tensorflow/contrib/opt/python/training/sign_decay_test.py
+++ b/tensorflow/contrib/opt/python/training/sign_decay_test.py
@@ -66,7 +66,7 @@ class SignDecaysTest(test.TestCase):
     linear_decay_fn = sign_decay.get_linear_decay_fn(num_training_steps)
 
     for step in range(0, 1000, 100):
-      with self.test_session():
+      with self.cached_session():
         tf_decayed = linear_decay_fn(step).eval()
         py_decayed = py_linear_decay_fn(num_training_steps)(step)
         self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
@@ -78,7 +78,7 @@ class SignDecaysTest(test.TestCase):
         num_training_steps, num_periods=5, zero_after=2)
 
     for step in range(0, 1000, 100):
-      with self.test_session():
+      with self.cached_session():
         tf_decayed = cosine_decay_fn(step).eval()
         py_decayed = py_cosine_decay_fn(num_training_steps)(step)
         self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
@@ -95,7 +95,7 @@ class SignDecaysTest(test.TestCase):
         num_training_steps, num_periods=5, zero_after=2)
 
     for step in range(0, 1000, 100):
-      with self.test_session():
+      with self.cached_session():
         tf_decayed = restart_decay_fn(step).eval()
         py_decayed = py_restart_decay_fn(num_training_steps)(step)
         self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
diff --git a/tensorflow/contrib/opt/python/training/variable_clipping_optimizer_test.py b/tensorflow/contrib/opt/python/training/variable_clipping_optimizer_test.py
index fdda86b0b5..ff0ea8d766 100644
--- a/tensorflow/contrib/opt/python/training/variable_clipping_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/variable_clipping_optimizer_test.py
@@ -158,7 +158,7 @@ class VariableClippingOptimizerTest(test.TestCase):
 
   def testDenseLocal(self):
     for dtype in [dtypes.float32, dtypes.float64, dtypes.half]:
-      with self.test_session():
+      with self.cached_session():
         var0, var1, update_op = self._setupDense(False, dtype)
         self._assertDenseCorrect(var0, var1, update_op)
 
@@ -171,7 +171,7 @@ class VariableClippingOptimizerTest(test.TestCase):
 
   def testSparseLocal(self):
     for dtype in [dtypes.float64, dtypes.float32, dtypes.half]:
-      with self.test_session():
+      with self.cached_session():
         var0, var1, update_op = self._setupSparse(False, dtype)
         self._assertSparseCorrect(var0, var1, update_op)
 
diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
index b9cf40eb7b..29acfc602e 100644
--- a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
+++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
@@ -26,6 +26,7 @@ from tensorflow.python.training import adam
 from tensorflow.python.training import momentum as momentum_opt
 from tensorflow.python.training import optimizer
 from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.ops import array_ops
 
 
 class DecoupledWeightDecayExtension(object):
@@ -159,8 +160,8 @@ class DecoupledWeightDecayExtension(object):
 
   def _decay_weights_sparse_op(self, var, indices, scatter_add):
     if not self._decay_var_list or var in self._decay_var_list:
-      return scatter_add(var, indices, -self._weight_decay * var,
-                         self._use_locking)
+      update = -self._weight_decay * array_ops.gather(var, indices)
+      return scatter_add(var, indices, update, self._use_locking)
     return control_flow_ops.no_op()
 
   # Here, we overwrite the apply functions that the base optimizer calls.
diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py
index 76d8a5697a..9c91078301 100644
--- a/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py
+++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers_test.py
@@ -58,7 +58,7 @@ class WeightDecayOptimizerTest(test.TestCase):
   def doTest(self, optimizer, update_fn, optimizer_name, slot_name,
              use_resource=False, do_sparse=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      with self.test_session(graph=ops.Graph()):
+      with self.session(graph=ops.Graph()):
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
diff --git a/tensorflow/contrib/optimizer_v2/adadelta_test.py b/tensorflow/contrib/optimizer_v2/adadelta_test.py
index 31cfec0d50..4c94b66679 100644
--- a/tensorflow/contrib/optimizer_v2/adadelta_test.py
+++ b/tensorflow/contrib/optimizer_v2/adadelta_test.py
@@ -37,7 +37,7 @@ class AdadeltaOptimizerTest(test.TestCase):
     for dtype in [dtypes.half, dtypes.float32]:
       for grad in [0.2, 0.1, 0.01]:
         for lr in [1.0, 0.5, 0.1]:
-          with self.test_session():
+          with self.cached_session():
             var0_init = [1.0, 2.0]
             var1_init = [3.0, 4.0]
             if use_resource:
@@ -146,7 +146,7 @@ class AdadeltaOptimizerTest(test.TestCase):
 
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
         pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
diff --git a/tensorflow/contrib/optimizer_v2/adagrad_test.py b/tensorflow/contrib/optimizer_v2/adagrad_test.py
index 18191c3ef2..debaaaeeba 100644
--- a/tensorflow/contrib/optimizer_v2/adagrad_test.py
+++ b/tensorflow/contrib/optimizer_v2/adagrad_test.py
@@ -36,7 +36,7 @@ class AdagradOptimizerTest(test.TestCase):
 
   def doTestBasic(self, use_locking=False, use_resource=False):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         if use_resource:
           var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
           var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
@@ -73,7 +73,7 @@ class AdagradOptimizerTest(test.TestCase):
 
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable(
             [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
@@ -92,7 +92,7 @@ class AdagradOptimizerTest(test.TestCase):
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -116,7 +116,7 @@ class AdagradOptimizerTest(test.TestCase):
 
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
         var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
         grads0 = ops.IndexedSlices(
@@ -147,7 +147,7 @@ class AdagradOptimizerTest(test.TestCase):
 
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         repeated_index_update_var = variables.Variable(
             [[1.0], [2.0]], dtype=dtype)
         aggregated_update_var = variables.Variable(
@@ -177,7 +177,7 @@ class AdagradOptimizerTest(test.TestCase):
 
   def testSparseRepeatedIndicesResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var_repeated = resource_variable_ops.ResourceVariable(
             [1.0, 2.0], dtype=dtype)
         loss_repeated = math_ops.reduce_sum(
@@ -201,7 +201,7 @@ class AdagradOptimizerTest(test.TestCase):
 
   def testSparseStability(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         shape = [1, 6]
         var0 = variables.Variable(
             [[
@@ -237,7 +237,7 @@ class AdagradOptimizerTest(test.TestCase):
 
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -270,7 +270,7 @@ class AdagradOptimizerTest(test.TestCase):
             np.array([2.715679168701172, 3.715679168701172]), var1.eval())
 
   def testDynamicShapeVariable_Ok(self):
-    with self.test_session():
+    with self.cached_session():
       v = variable_scope.get_variable("v", initializer=constant_op.constant(1.),
                                       validate_shape=False)
       self.assertFalse(v.shape.is_fully_defined())
diff --git a/tensorflow/contrib/optimizer_v2/adam_test.py b/tensorflow/contrib/optimizer_v2/adam_test.py
index d9ad58b0a6..b1ad0ade42 100644
--- a/tensorflow/contrib/optimizer_v2/adam_test.py
+++ b/tensorflow/contrib/optimizer_v2/adam_test.py
@@ -56,7 +56,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def doTestSparse(self, use_resource=False):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -122,7 +122,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         repeated_index_update_var = variables.Variable(
             [[1.0], [2.0]], dtype=dtype)
         aggregated_update_var = variables.Variable(
@@ -152,7 +152,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def doTestBasic(self, use_resource=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      with self.test_session(graph=ops.Graph()):
+      with self.session(graph=ops.Graph()):
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -215,7 +215,7 @@ class AdamOptimizerTest(test.TestCase):
                              opt.get_slot(var=var0, name="m").name)
 
   def testBasic(self):
-    with self.test_session():
+    with self.cached_session():
       self.doTestBasic(use_resource=False)
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
@@ -224,7 +224,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -261,7 +261,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 28a531dfec..e13b82d1d2 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -310,7 +310,7 @@ class CheckpointingTests(test.TestCase):
               global_step=root.global_step)
           checkpoint_path = checkpoint_management.latest_checkpoint(
               checkpoint_directory)
-          with self.test_session(graph=ops.get_default_graph()) as session:
+          with self.session(graph=ops.get_default_graph()) as session:
             status = root.restore(save_path=checkpoint_path)
             status.initialize_or_restore(session=session)
             if checkpoint_path is None:
@@ -504,7 +504,7 @@ class CheckpointingTests(test.TestCase):
     """Saves after the first should not modify the graph."""
     with context.graph_mode():
       graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
+      with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
         obj = tracking.Checkpointable()
@@ -522,7 +522,7 @@ class CheckpointingTests(test.TestCase):
     """Restores after the first should not modify the graph."""
     with context.graph_mode():
       graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
+      with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
         obj = tracking.Checkpointable()
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent_test.py b/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
index ad9aef804f..4a77bce478 100644
--- a/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
@@ -34,7 +34,7 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -57,7 +57,7 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testBasicResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -82,7 +82,7 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testMinimizeResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
@@ -108,7 +108,7 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
@@ -135,7 +135,7 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -157,7 +157,7 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testGradWrtRef(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         opt = gradient_descent.GradientDescentOptimizer(3.0)
         values = [1.0, 3.0]
         vars_ = [variables.Variable([v], dtype=dtype) for v in values]
@@ -168,7 +168,7 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testWithGlobalStep(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         global_step = variables.Variable(0, trainable=False)
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
@@ -191,7 +191,7 @@ class GradientDescentOptimizerTest(test.TestCase):
 
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
         var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
         grads0 = ops.IndexedSlices(
diff --git a/tensorflow/contrib/optimizer_v2/momentum_test.py b/tensorflow/contrib/optimizer_v2/momentum_test.py
index 24cdab4626..e69f12839e 100644
--- a/tensorflow/contrib/optimizer_v2/momentum_test.py
+++ b/tensorflow/contrib/optimizer_v2/momentum_test.py
@@ -123,7 +123,7 @@ class MomentumOptimizerTest(test.TestCase):
           ]), self.evaluate(var1))
 
   def testBasic(self):
-    with self.test_session():
+    with self.cached_session():
       self.doTestBasic(use_resource=False)
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
@@ -162,7 +162,7 @@ class MomentumOptimizerTest(test.TestCase):
 
   def testNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -188,7 +188,7 @@ class MomentumOptimizerTest(test.TestCase):
 
   def testSparseNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
         accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -282,7 +282,7 @@ class MomentumOptimizerTest(test.TestCase):
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -435,7 +435,7 @@ class MomentumOptimizerTest(test.TestCase):
     return db_grad, db_out
 
   def testLikeDistBeliefMom01(self):
-    with self.test_session():
+    with self.cached_session():
       db_grad, db_out = self._dbParamsMom01()
       num_samples = len(db_grad)
       var0 = variables.Variable([0.0] * num_samples)
@@ -449,7 +449,7 @@ class MomentumOptimizerTest(test.TestCase):
 
   def testSparse(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
         var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
         grads0 = ops.IndexedSlices(
@@ -518,7 +518,7 @@ class MomentumOptimizerTest(test.TestCase):
 
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 8c11d8bcfd..f6ecaba834 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -620,7 +621,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     # Map from graph_key to state for that graph. We use the graph_key
     # since it works in both eager and graph mode, and gives the outer
     # graph inside functions.
-    tower_context = distribute_lib.get_tower_context()
+    tower_context = distribution_strategy_context.get_tower_context()
     if tower_context is None:
       # In a cross-tower context for a DistributionStrategy, which means
       # only one Optimizer will be created, not one per tower.
@@ -769,7 +770,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
               distribute_lib.get_loss_reduction() ==
               variable_scope.VariableAggregation.MEAN)
         if scale_loss_by_num_towers:
-          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          num_towers = distribution_strategy_context.get_distribution_strategy(
+          ).num_towers
           if num_towers > 1:
             loss_value *= 1. / num_towers
 
@@ -788,7 +790,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
           distribute_lib.get_loss_reduction() ==
           variable_scope.VariableAggregation.MEAN)
     if scale_loss_by_num_towers:
-      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      num_towers = distribution_strategy_context.get_distribution_strategy(
+      ).num_towers
       if num_towers > 1:
         loss *= 1. / num_towers
 
@@ -862,7 +865,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     if not filtered:
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, v in grads_and_vars],))
-    return distribute_lib.get_tower_context().merge_call(
+    return distribution_strategy_context.get_tower_context().merge_call(
         self._distributed_apply, filtered, global_step=global_step, name=name)
 
   def _get_or_create_state(self, var_list=None):
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
index a44bfd1bfd..dd7f2f4405 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
@@ -61,7 +61,7 @@ class OptimizerTest(test.TestCase):
 
   def testAggregationMethod(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         cost = 5 * var0 + 3 * var1
@@ -86,7 +86,7 @@ class OptimizerTest(test.TestCase):
 
   def testPrecomputedGradient(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         cost = 5 * var0 + 3 * var1
@@ -212,7 +212,7 @@ class OptimizerTest(test.TestCase):
       sgd_op.apply_gradients(grads_and_vars)
 
   def testTrainOp(self):
-    with self.test_session():
+    with self.cached_session():
       var0 = variables.Variable([1.0, 2.0])
       var1 = variables.Variable([3.0, 4.0])
       cost = 5 * var0 + 3 * var1
@@ -225,7 +225,7 @@ class OptimizerTest(test.TestCase):
   def testConstraint(self):
     constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
     constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
-    with self.test_session():
+    with self.cached_session():
       var0 = variables.Variable([1.0, 2.0],
                                 constraint=constraint_01)
       var1 = variables.Variable([3.0, 4.0],
@@ -247,7 +247,7 @@ class OptimizerTest(test.TestCase):
       self.assertAllClose([0., 0.], var1.eval())
 
   def testStopGradients(self):
-    with self.test_session():
+    with self.cached_session():
       var0 = variables.Variable([1.0, 2.0], name='var0')
       var1 = variables.Variable([3.0, 4.0], name='var1')
       var0_id = array_ops.identity(var0)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
index 164ff0ea06..3de53405ec 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -22,7 +22,7 @@ A detailed description of rmsprop.
 - divide gradient by the root of this average
 
 mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
-mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square + epsilon)
+mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square)
 delta = - mom
 
 This implementation of RMSProp uses plain momentum, not Nesterov momentum.
@@ -33,7 +33,7 @@ gradients, and uses that average to estimate the variance:
 mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
 mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
 mom = momentum * mom{t-1} + learning_rate * g_t /
-    sqrt(mean_square - mean_grad**2 + epsilon)
+    sqrt(mean_square - mean_grad**2)
 delta = - mom
 """
 
@@ -43,7 +43,6 @@ from __future__ import print_function
 
 from tensorflow.contrib.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
 
 from tensorflow.python.training import training_ops
 
@@ -87,7 +86,8 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
       decay: A float hyperparameter. Discounting factor for the history/coming
         gradient.
       momentum: A float hyperparameter.
-      epsilon: A float hyperparameter. Small value to avoid zero denominator.
+      epsilon: A float hyperparameter. Small value to initialize the average
+        square gradient variable and avoid zero denominator.
       use_locking: If True use locks for update operation.
       centered: If True, gradients are normalized by the estimated variance of
         the gradient; if False, by the uncentered second moment. Setting this to
@@ -106,10 +106,8 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
 
   def _create_vars(self, var_list, state):
     for v in var_list:
-      if v.get_shape().is_fully_defined():
-        init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
-      else:
-        init_rms = array_ops.ones_like(v)
+      init_rms = state.get_hyper(
+          "epsilon", v.dtype.base_dtype) * array_ops.ones_like(v)
       state.create_slot_with_initializer(v, init_rms, v.get_shape(),
                                          v.dtype.base_dtype, "rms")
       if self._centered:
@@ -129,7 +127,9 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          # epsilon is now the rms initial value and is not added to the
+          # denominator anymore, hence calling the kernel op with epsilon=0.
+          0,
           grad,
           use_locking=self._use_locking).op
     else:
@@ -140,7 +140,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking).op
 
@@ -157,7 +157,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking)
     else:
@@ -168,7 +168,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking)
 
@@ -185,7 +185,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad.values,
           grad.indices,
           use_locking=self._use_locking)
@@ -197,7 +197,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad.values,
           grad.indices,
           use_locking=self._use_locking)
@@ -215,7 +215,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           indices,
           use_locking=self._use_locking)
@@ -227,7 +227,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           indices,
           use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index dc23ef241a..44301ffe9e 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -39,34 +39,34 @@ _DATA_TYPES = [dtypes.half, dtypes.float32]
 
 _TEST_PARAM_VALUES = [
     # learning_rate, decay, momentum, epsilon, centered, use_resource
-    [0.5, 0.9, 0.0, 1e-3, True, False],
-    [0.5, 0.9, 0.0, 1e-3, False, False],
-    [0.5, 0.9, 0.0, 1e-3, True, True],
-    [0.5, 0.9, 0.0, 1e-3, False, True],
-    [0.1, 0.9, 0.0, 1e-3, True, False],
-    [0.5, 0.95, 0.0, 1e-3, False, False],
-    [0.5, 0.95, 0.0, 1e-5, True, False],
-    [0.5, 0.95, 0.9, 1e-5, True, False],
+    [0.5, 0.9, 0.0, 1.0, True, False],
+    [0.5, 0.9, 0.0, 1.0, False, False],
+    [0.5, 0.9, 0.0, 1.0, True, True],
+    [0.5, 0.9, 0.0, 1.0, False, True],
+    [0.1, 0.9, 0.0, 1.0, True, False],
+    [0.5, 0.95, 0.0, 1.0, False, False],
+    [0.5, 0.8, 0.0, 1e-3, True, False],
+    [0.5, 0.8, 0.9, 1e-3, True, False],
 ]
 
 
 class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
   def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
-                            epsilon, centered):
+                            centered):
     rms_t = rms * decay + (1 - decay) * g * g
-    denom_t = rms_t + epsilon
     if centered:
       mg_t = mg * decay + (1 - decay) * g
-      denom_t -= mg_t * mg_t
+      denom_t = rms_t - mg_t * mg_t
     else:
       mg_t = mg
+      denom_t = rms_t
     mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
     var_t = var - mom_t
     return var_t, mg_t, rms_t, mom_t
 
   def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
-                                   lr, decay, momentum, epsilon, centered):
+                                   lr, decay, momentum, centered):
     mg_t = copy.deepcopy(mg)
     rms_t = copy.deepcopy(rms)
     mom_t = copy.deepcopy(mom)
@@ -75,7 +75,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       gindex = gindexs[i]
       gvalue = gvalues[i]
       rms_t[gindex] = rms[gindex] * decay + (1 - decay) * gvalue * gvalue
-      denom_t = rms_t[gindex] + epsilon
+      denom_t = rms_t[gindex]
       if centered:
         mg_t[gindex] = mg_t[gindex] * decay + (1 - decay) * gvalue
         denom_t -= mg_t[gindex] * mg_t[gindex]
@@ -129,8 +129,8 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
       mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
       mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-      rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-      rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+      rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+      rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
       mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
       mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
@@ -144,10 +144,10 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
         var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
             var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
-            decay, momentum, epsilon, centered)
+            decay, momentum, centered)
         var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
             var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
-            decay, momentum, epsilon, centered)
+            decay, momentum, centered)
 
         # Validate updated params
         if centered:
@@ -162,7 +162,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.parameters([dtypes.float32, dtypes.float64])
   def testMinimizeSparseResourceVariable(self, dtype):
-    with self.test_session():
+    with self.cached_session():
       var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
       x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
       pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
@@ -184,14 +184,14 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.parameters([dtypes.float32, dtypes.float64])
   def testMinimizeSparseResourceVariableCentered(self, dtype):
-    with self.test_session():
+    with self.cached_session():
       var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
       x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
       pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
       loss = pred * pred
       sgd_op = rmsprop.RMSPropOptimizer(
           learning_rate=1.0,
-          decay=0.0,
+          decay=0.1,
           momentum=0.0,
           epsilon=1.0,
           centered=True).minimize(loss)
@@ -202,7 +202,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       sgd_op.run()
       # Validate updated params
       self.assertAllCloseAccordingToType(
-          [[-111, -138]], var0.eval(), atol=0.01)
+          [[-7/3.0, -4/3.0]], var0.eval(), atol=0.01)
 
   @parameterized.named_parameters(
       *test_util.generate_combinations_with_testcase_name(
@@ -251,8 +251,8 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
       mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
       mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-      rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-      rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+      rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+      rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
       mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
       mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
@@ -266,10 +266,10 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
         var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
             var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
-            learning_rate, decay, momentum, epsilon, centered)
+            learning_rate, decay, momentum, centered)
         var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
             var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
-            learning_rate, decay, momentum, epsilon, centered)
+            learning_rate, decay, momentum, centered)
 
         # Validate updated params
         if centered:
@@ -317,13 +317,13 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       # Check the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901))
           ]), var0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
           ]), var1.eval())
       # Step 2: the root mean square accumulators contain the previous update.
       update.run()
@@ -335,17 +335,17 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       # Check the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
           ]), var0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
           ]), var1.eval())
 
   @parameterized.parameters(_DATA_TYPES)
@@ -357,7 +357,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
 
       opt = rmsprop.RMSPropOptimizer(
-          learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+          learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1.0)
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
@@ -383,22 +383,22 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
           np.array([0.90001, 0.90001]), rms1.eval())
       # Check the momentum accumulators
       self.assertAllCloseAccordingToType(
-          np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-                    (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+          np.array([(0.1 * 2.0 / math.sqrt(0.901)),
+                    (0.1 * 2.0 / math.sqrt(0.901))]), mom0.eval())
       self.assertAllCloseAccordingToType(
-          np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-                    (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+          np.array([(0.01 * 2.0 / math.sqrt(0.90001)),
+                    (0.01 * 2.0 / math.sqrt(0.90001))]), mom1.eval())
 
       # Check that the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901))
           ]), var0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
           ]), var1.eval())
 
       # Step 2: the root mean square accumulators contain the previous update.
@@ -410,38 +410,38 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
           np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
-              0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
           ]), mom0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
-              0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
           ]), mom1.eval())
 
       # Check the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
-              (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
-              (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)))
           ]), var0.eval())
 
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
-              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
-              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)))
           ]), var1.eval())
 
 
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
index 42fba81a5c..85b5a5a3b9 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -14,8 +14,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
-#define TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
+#ifndef TENSORFLOW_CONTRIB_PERIODIC_RESAMPLE_KERNELS_PERIODIC_RESAMPLE_OP_H_
+#define TENSORFLOW_CONTRIB_PERIODIC_RESAMPLE_KERNELS_PERIODIC_RESAMPLE_OP_H_
 
 #include <cmath>
 #include <type_traits>
@@ -421,4 +421,4 @@ class PeriodicResampleOpGrad : public tensorflow::OpKernel {
   tensorflow::PartialTensorShape desired_shape;
 };
 
-#endif  // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
+#endif  // TENSORFLOW_CONTRIB_PERIODIC_RESAMPLE_KERNELS_PERIODIC_RESAMPLE_OP_H_
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
index e3570e38a3..17b69c7b35 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
@@ -170,7 +170,7 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
     field_names = [f.name for f in fields]
     output_types = [f.dtype for f in fields]
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sizes, vtensor = self._decode_module.decode_proto(
           batch,
           message_type=message_type,
@@ -290,7 +290,7 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
     field_names = ['sizes']
     field_types = [dtypes.int32]
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       ctensor, vtensor = self._decode_module.decode_proto(
           batch,
           message_type=msg_type,
diff --git a/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py
index 9a1c04af32..7e9b355c69 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py
@@ -137,7 +137,7 @@ class DescriptorSourceTestBase(test.TestCase):
     field_names = ['values', 'shapes', 'sizes', 'fields']
     tensor_types = [dtypes.string, dtypes.int32, dtypes.int32, dtypes.string]
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sizes, field_tensors = self._decode_module.decode_proto(
           in_bufs,
           message_type=message_type,
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
index 07dfb924d3..01b3ccc7fd 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
@@ -55,7 +55,7 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
 
   def testBadInputs(self):
     # Invalid field name
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError('Unknown field: non_existent_field'):
         self._encode_module.encode_proto(
             sizes=[[1]],
@@ -64,7 +64,7 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
             field_names=['non_existent_field']).eval()
 
     # Incorrect types.
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError(
           'Incompatible type for field double_value.'):
         self._encode_module.encode_proto(
@@ -74,7 +74,7 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
             field_names=['double_value']).eval()
 
     # Incorrect shapes of sizes.
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError(
           r'sizes should be batch_size \+ \[len\(field_names\)\]'):
         sizes = array_ops.placeholder(dtypes.int32)
@@ -89,7 +89,7 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
             })
 
     # Inconsistent shapes of values.
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError(
           'Values must match up to the last dimension'):
         sizes = array_ops.placeholder(dtypes.int32)
@@ -109,7 +109,7 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
     field_names = [f.name for f in fields]
     out_types = [f.dtype for f in fields]
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sizes, field_tensors = self._decode_module.decode_proto(
           in_bufs,
           message_type=message_type,
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 23363617ed..499fec4ffa 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -244,7 +244,9 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
     ],
 )
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index cb66fd1f76..2ddbd73ea6 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -455,6 +455,24 @@ class _LayerMatch(object):
     return self._bias_add_op
 
 
+def _FollowedByFakeQuant(tensor):
+  """Returns True if the tensor is followed by a FakeQuant."""
+  fake_quant_ops = set([
+      'FakeQuantWithMinMaxVars', 'FakeQuantWithMinMaxArgs',
+      'FakeQuantWithMinMaxVarsPerChannel'
+  ])
+  pass_through_ops = set(['Reshape', 'Identity'])
+  consumers = tensor.consumers()
+  while consumers:
+    c = consumers.pop()
+    if c.type in fake_quant_ops:
+      return True
+    elif c.type in pass_through_ops:
+      for output in c.outputs:
+        consumers.extend(output.consumers())
+  return False
+
+
 def _InsertQuantOp(context,
                    name,
                    producer,
@@ -535,11 +553,7 @@ def _InsertQuantOp(context,
   # Prevent ops from being quantized multiple times. Bypass ops can sometimes
   # overlap between multiple matches, so we need to ensure that we don't
   # add duplicate FakeQuant operations.
-  fake_quant_ops = set([
-      'FakeQuantWithMinMaxVars',
-      'FakeQuantWithMinMaxArgs'
-  ])
-  if fake_quant_ops.intersection(set([c.type for c in inputs.consumers()])):
+  if _FollowedByFakeQuant(inputs):
     return
 
   if moving_avg:
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 2944f964c7..484493f1b2 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -59,6 +59,10 @@ def _create_graph(input_graph=None,
 
   if input_graph is None:
     input_graph = ops.get_default_graph()
+
+  # Add check to see if graph has training ops, if so provide error message and
+  # exit
+  _check_for_training_ops(input_graph)
   with input_graph.as_default():
     fold_batch_norms.FoldBatchNorms(
         input_graph,
@@ -78,6 +82,9 @@ def create_training_graph(input_graph=None, quant_delay=0):
 
   Variables added by the rewrite get added to the global variables collection.
 
+  This function must be invoked prior to insertion of gradient ops in a graph
+  as quantization should be modeled in both forward and backward passes.
+
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
   the expected behavior of previously held references to nodes and tensors may
@@ -104,7 +111,6 @@ def create_training_graph(input_graph=None, quant_delay=0):
   # Currently the values below are hardcoded for mobilenetV1 on imagenet
   # Please use the experimental API if you need to tune these values.
   freeze_bn_delay = None
-
   _create_graph(
       input_graph=input_graph,
       is_training=True,
@@ -141,6 +147,9 @@ def experimental_create_training_graph(input_graph=None,
                                        scope=None):
   """Rewrites a training input_graph in place for simulated quantization.
 
+  This function must be invoked prior to insertion of gradient ops in a graph
+  as quantization should be modeled in both forward and backward passes.
+
   Variables added by the rewrite get added to the global variables collection.
 
   This function has additional experimental options not (yet) available to
@@ -226,3 +235,45 @@ def experimental_create_eval_graph(input_graph=None,
       activation_bits=activation_bits,
       quant_delay=quant_delay,
       scope=scope)
+
+
+def _check_for_training_ops(g):
+  """Check if training ops are present in the graph.
+
+  Args:
+   g: The tf.Graph on which the check for training ops needs to be
+   performed.
+
+  Raises:
+    ValueError: If a training op is seen in the graph;
+  """
+
+  # The list here is obtained
+  # from https://www.tensorflow.org/api_docs/cc/group/training-ops
+  training_ops = frozenset([
+      'ApplyAdagrad', 'ApplyAdagradDA', 'ApplyAdam', 'ApplyAddSign',
+      'ApplyCenteredRMSProp', 'ApplyFtrl', 'ApplyFtrlV2',
+      'ApplyGradientDescent', 'ApplyMomentum', 'ApplyPowerSign',
+      'ApplyProximalAdagrad', 'ApplyProximalGradientDescent', 'ApplyRMSProp',
+      'ResourceApplyAdadelta', 'ResourceApplyAdagrad', 'ResourceApplyAdagradDA',
+      'ResourceApplyAdam', 'ResourceApplyAddSign',
+      'ResourceApplyCenteredRMSProp', 'ResourceApplyFtrl',
+      'ResourceApplyFtrlV2', 'ResourceApplyGradientDescent',
+      'ResourceApplyMomentum', 'ResourceApplyPowerSign',
+      'ResourceApplyProximalAdagrad', 'ResourceApplyProximalGradientDescent',
+      'ResourceApplyRMSProp', 'ResourceSparseApplyAdadelta',
+      'ResourceSparseApplyAdagrad', 'ResourceSparseApplyAdagradDA',
+      'ResourceSparseApplyCenteredRMSProp', 'ResourceSparseApplyFtrl',
+      'ResourceSparseApplyFtrlV2', 'ResourceSparseApplyMomentum',
+      'ResourceSparseApplyProximalAdagrad',
+      'ResourceSparseApplyProximalGradientDescent',
+      'ResourceSparseApplyRMSProp', 'SparseApplyAdadelta', 'SparseApplyAdagrad',
+      'SparseApplyAdagradDA', 'SparseApplyCenteredRMSProp', 'SparseApplyFtrl',
+      'SparseApplyFtrlV2', 'SparseApplyMomentum', 'SparseApplyProximalAdagrad',
+      'SparseApplyProximalGradientDescent', 'SparseApplyRMSProp'
+  ])
+
+  op_types = set([op.type for op in g.get_operations()])
+  train_op_list = op_types.intersection(training_ops)
+  if train_op_list:
+    raise ValueError('Training op found in graph, exiting %s' % train_op_list)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index 54faf582f1..e80d2183a6 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -20,10 +20,12 @@ from __future__ import print_function
 
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.quantize.python import quantize_graph
+from tensorflow.python import training
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
 
@@ -145,6 +147,19 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
         self.assertTrue(('int64_val: %i' % quant_delay) in const_value)
     self.assertTrue(quant_delay_found)
 
+  def testTrainingOpsCheck(self):
+    self._RunTestOverTrainingRewrites(self._TestTrainingOpsCheck)
+
+  def _TestTrainingOpsCheck(self, rewrite_fn):
+    with ops.Graph().as_default():
+      output = self._ConvLayer()
+      output_scalar = math_ops.reduce_sum(output)
+      loss = math_ops.square(output_scalar - 1)
+      opt = training.gradient_descent.GradientDescentOptimizer(0.0001)
+      opt.minimize(loss)
+      with self.assertRaisesRegexp(ValueError, 'Training op found in graph'):
+        rewrite_fn()
+
   def testWeightBits(self):
     self._RunTestOverExperimentalRewrites(self._TestWeightBits)
 
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 06ebcdfee1..212d902a3c 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -471,6 +471,60 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       self.assertTrue(
           'part/test/test/weights_quant/FakeQuantWithMinMaxVars' in op_names)
 
+  def testSkipReshapeQuantization(self):
+    self._RunTestOverParameters(self._TestSkipReshapeQuantization)
+
+  def _TestSkipReshapeQuantization(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      conv = conv2d(
+          input1,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=nn_ops.relu6,
+          scope='test/test')
+
+      reshape = array_ops.reshape(
+          conv, (int(10), int(height / 2), int(width / 2), int(16)))
+
+      # Insert a fake quant node after the reshape. We will check that one isn't
+      # insert before.
+      array_ops.fake_quant_with_min_max_vars(reshape, -1, 1)
+
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+      # Ensure that there isn't a FakeQuant added before the reshape.
+      self.assertFalse(
+          'FakeQuantWithMinMaxVars' in [i.op.type for i in reshape.op.inputs])
+
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      conv = conv2d(
+          input1,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=nn_ops.relu6,
+          scope='test/test')
+
+      reshape = array_ops.reshape(
+          conv, (int(10), int(height / 2), int(width / 2), int(16)))
+
+      # If no fake quant is added after the reshape, a FakeQuant should be added
+      # before the reshape.
+      quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+
+      # Ensure that there isn't a FakeQuant added before the reshape.
+      self.assertTrue(
+          'FakeQuantWithMinMaxVars' in [i.op.type for i in reshape.op.inputs])
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
diff --git a/tensorflow/contrib/rate/BUILD b/tensorflow/contrib/rate/BUILD
new file mode 100644
index 0000000000..c461a7145e
--- /dev/null
+++ b/tensorflow/contrib/rate/BUILD
@@ -0,0 +1,48 @@
+# Description:
+#   contains parts of TensorFlow that are experimental or unstable and which are not supported.
+
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//visibility:public"])
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "rate",
+    srcs = [
+        "rate.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_test(
+    name = "rate_test",
+    size = "small",
+    srcs = ["rate_test.py"],
+    deps = [
+        ":rate",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:test",
+    ],
+)
diff --git a/tensorflow/contrib/rate/rate.py b/tensorflow/contrib/rate/rate.py
new file mode 100644
index 0000000000..24d586479a
--- /dev/null
+++ b/tensorflow/contrib/rate/rate.py
@@ -0,0 +1,151 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of tf.contrib.rate module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+
+_to_replace = re.compile("[^A-Za-z0-9.]")
+
+
+class Rate(object):
+  """Computes the rate of change since the last rate call."""
+
+  def __init__(self, name=None):
+    self._built = False
+    self._vars = []
+    self._initial_values = {}
+    name = name or self.__class__.__name__
+    # Replace things like spaces in name to create a valid scope name.
+    scope_name = _to_replace.sub("_", name)
+    # We create the variable scope now to get the unique name that will
+    # be used as a variable prefix when build() calls _add_variable().
+    with variable_scope.variable_scope(
+        scope_name, use_resource=True, reuse=False) as scope:
+      pos = scope.name.rfind(scope_name)
+      self._name = name + scope.name[pos + len(scope_name):]
+      self._scope = scope
+
+    # Ensures that if the user calls build directly we still set self._built to
+    # True to prevent variables from being recreated.
+    self._build = self.build
+    if context.executing_eagerly():
+      self._construction_scope = context.eager_mode
+    else:
+      # We make self.call() into a graph callable here, so that we can
+      # return a single op that performs all of the variable updates.
+      self._construction_scope = ops.get_default_graph().as_default
+      self.call = function.defun(self.call)
+
+  def build(self, values, denominator):
+    """Method to create variables.
+
+    Called by `__call__()` before `call()` for the first time.
+
+    Args:
+      values: The numerator for rate.
+      denominator: Value to which the rate is taken with respect.
+    """
+    self.numer = self._add_variable(
+        name="numer", shape=values.get_shape(), dtype=dtypes.float64)
+    self.denom = self._add_variable(
+        name="denom", shape=denominator.get_shape(), dtype=dtypes.float64)
+    self.prev_values = self._add_variable(
+        name="prev_values", shape=values.get_shape(), dtype=dtypes.float64)
+    self.prev_denominator = self._add_variable(
+        name="prev_denominator",
+        shape=denominator.get_shape(),
+        dtype=dtypes.float64)
+    self._built = True
+
+  def __call__(self, *args, **kwargs):
+    """Returns op to execute to update.
+
+    Returns None if eager execution is enabled.
+    Returns a graph-mode function if graph execution is enabled.
+
+    Args:
+      *args:
+      **kwargs: A mini-batch of inputs to Rate, passed on to `call()`.
+    """
+    if not self._built:
+      with variable_scope.variable_scope(
+          self._scope), self._construction_scope():
+        self.build(*args, **kwargs)
+      self._built = True
+    return self.call(*args, **kwargs)
+
+  @property
+  def name(self):
+    return self._name
+
+  @property
+  def variables(self):
+    return self._vars
+
+  def _safe_div(self, numerator, denominator, name):
+    t = math_ops.truediv(numerator, denominator)
+    zero = array_ops.zeros_like(t, dtype=denominator.dtype)
+    condition = math_ops.greater(denominator, zero)
+    zero = math_ops.cast(zero, t.dtype)
+    return array_ops.where(condition, t, zero, name=name)
+
+  def _add_variable(self, name, shape=None, dtype=None):
+    """Private method for adding variables to the graph."""
+    if self._built:
+      raise RuntimeError("Can't call add_variable() except in build().")
+    v = resource_variable_ops.ResourceVariable(
+        lambda: array_ops.zeros(shape, dtype),
+        trainable=False,
+        validate_shape=True,
+        name=name,
+        collections=[ops.GraphKeys.LOCAL_VARIABLES])
+    return v
+
+  def call(self, values, denominator):
+    """Computes the rate since the last call.
+
+    Args:
+      values: Tensor with the per-example value.
+      denominator: Measure to take the rate with respect to.
+
+    Returns:
+      The rate or 0 if denominator is unchanged since last call.
+    """
+    if denominator.dtype != dtypes.float64:
+      denominator = math_ops.cast(denominator, dtypes.float64)
+    if values.dtype != dtypes.float64:
+      values = math_ops.cast(values, dtypes.float64)
+
+    state_ops.assign(self.numer, math_ops.subtract(values, self.prev_values))
+    state_ops.assign(self.denom,
+                     math_ops.subtract(denominator, self.prev_denominator))
+    state_ops.assign(self.prev_values, values)
+    state_ops.assign(self.prev_denominator, denominator)
+
+    return self._safe_div(self.numer, self.denom, name="safe_rate")
diff --git a/tensorflow/contrib/rate/rate_test.py b/tensorflow/contrib/rate/rate_test.py
new file mode 100644
index 0000000000..08908104f4
--- /dev/null
+++ b/tensorflow/contrib/rate/rate_test.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Rate."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rate import rate
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class RateTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBuildRate(self):
+    m = rate.Rate()
+    m.build(
+        constant_op.constant([1], dtype=dtypes.float32),
+        constant_op.constant([2], dtype=dtypes.float32))
+    old_numer = m.numer
+    m(
+        constant_op.constant([2], dtype=dtypes.float32),
+        constant_op.constant([2], dtype=dtypes.float32))
+    self.assertTrue(old_numer is m.numer)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBasic(self):
+    with self.test_session():
+      r_ = rate.Rate()
+      a = r_(array_ops.ones([1]), denominator=array_ops.ones([1]))
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual([[1]], self.evaluate(a))
+      b = r_(constant_op.constant([2]), denominator=constant_op.constant([2]))
+      self.assertEqual([[1]], self.evaluate(b))
+      c = r_(constant_op.constant([4]), denominator=constant_op.constant([3]))
+      self.assertEqual([[2]], self.evaluate(c))
+      d = r_(constant_op.constant([16]), denominator=constant_op.constant([3]))
+      self.assertEqual([[0]], self.evaluate(d))  # divide by 0
+
+  def testNamesWithSpaces(self):
+    m1 = rate.Rate(name="has space")
+    m1(array_ops.ones([1]), array_ops.ones([1]))
+    self.assertEqual(m1.name, "has space")
+    self.assertEqual(m1.prev_values.name, "has_space_1/prev_values:0")
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testWhileLoop(self):
+    with self.test_session():
+      r_ = rate.Rate()
+
+      def body(value, denom, i, ret_rate):
+        i += 1
+        ret_rate = r_(value, denom)
+        with ops.control_dependencies([ret_rate]):
+          value = math_ops.add(value, 2)
+          denom = math_ops.add(denom, 1)
+        return [value, denom, i, ret_rate]
+
+      def condition(v, d, i, r):
+        del v, d, r  # unused vars by condition
+        return math_ops.less(i, 100)
+
+      i = constant_op.constant(0)
+      value = constant_op.constant([1], dtype=dtypes.float64)
+      denom = constant_op.constant([1], dtype=dtypes.float64)
+      ret_rate = r_(value, denom)
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
+      loop = control_flow_ops.while_loop(condition, body,
+                                         [value, denom, i, ret_rate])
+      self.assertEqual([[2]], self.evaluate(loop[3]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
index f23194a6f2..1800edc05a 100644
--- a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
@@ -165,7 +165,7 @@ class FunctionalRnnTest(test_util.TensorFlowTestCase):
 
       fetches = self._CreateRnnGraph(
           fn, cell, tf_inputs, tf_slen, is_bidirectional, time_major=time_major)
-      with self.test_session(graph=graph) as sess:
+      with self.session(graph=graph) as sess:
         sess.run(variables.global_variables_initializer())
         # Note that cell.trainable_variables it not always set.
         self._MaybeResetVariables(variable_cache, sess,
diff --git a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
index 67a8f59c3c..c3db71359c 100644
--- a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
+++ b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
@@ -178,7 +178,8 @@ def _ApplyLengthsToBatch(sequence_lengths, tf_output):
   # TODO(drpng): just use Update so that we don't carry over the gradients?
   """Sets the output to be zero at the end of the sequence."""
   # output is batch major.
-  batch_size, max_time, vector_size = tf_output.shape
+  shape = array_ops.shape(tf_output)
+  batch_size, max_time, vector_size = shape[0], shape[1], shape[2]
   output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
   output_time = array_ops.reshape(output_time, [batch_size, max_time])
   lengths = array_ops.tile(
@@ -278,11 +279,16 @@ def functional_rnn(cell, inputs, sequence_length=None,
     if initial_state is None:
       initial_state = cell.zero_state(batch_size, dtype)
     func_cell = _FunctionalRnnCell(cell, inputs, initial_state)
+  if sequence_length is not None:
+    max_length = math_ops.reduce_max(sequence_length)
+  else:
+    max_length = None
   extended_acc_state, extended_final_state = recurrent.Recurrent(
       theta=func_cell.theta,
       state0=func_cell.extended_initial_state,
       inputs=inputs,
       cell_fn=func_cell.cell_step,
+      max_input_length=max_length,
       use_tpu=use_tpu)
   tf_output, tf_state = _PostProcessOutput(
       extended_acc_state, extended_final_state, func_cell,
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
index d8c0a0631d..69ef521c01 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
-#define TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_REDUCE_SLICE_OPS_KERNELS_REDUCE_SLICE_OPS_H_
+#define TENSORFLOW_CONTRIB_REDUCE_SLICE_OPS_KERNELS_REDUCE_SLICE_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
@@ -81,4 +81,4 @@ CALL_ALL_REDUCEOPS(ReduceSliceFunctorReduceop)
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_REDUCE_SLICE_OPS_KERNELS_REDUCE_SLICE_OPS_H_
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 2a84629080..5874245d58 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -149,7 +149,7 @@ cuda_py_tests(
 
 cuda_py_tests(
     name = "core_rnn_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/kernel_tests/core_rnn_test.py"],
     additional_deps = [
         ":rnn_py",
@@ -175,7 +175,7 @@ cuda_py_tests(
 
 tf_py_test(
     name = "fused_rnn_cell_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/fused_rnn_cell_test.py"],
     additional_deps = [
         ":rnn_py",
@@ -192,10 +192,6 @@ tf_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "manual",
-        "notap",
-    ],
 )
 
 cuda_py_tests(
diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index cb437f2a2f..026bf08ced 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """RNN Cells and additional RNN operations.
 
-See @{$python/contrib.rnn} guide.
+See [Contrib RNN](https://tensorflow.org/api_guides/python/contrib.rnn) guide.
 
 <!--From core-->
 @@RNNCell
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 85f0f8ced9..15ce9d1ce7 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -225,7 +225,7 @@ class RNNCellTest(test.TestCase):
   def testBasicLSTMCell(self):
     for dtype in [dtypes.float16, dtypes.float32]:
       np_dtype = dtype.as_numpy_dtype
-      with self.test_session(graph=ops.Graph()) as sess:
+      with self.session(graph=ops.Graph()) as sess:
         with variable_scope.variable_scope(
             "root", initializer=init_ops.constant_initializer(0.5)):
           x = array_ops.zeros([1, 2], dtype=dtype)
@@ -395,7 +395,7 @@ class RNNCellTest(test.TestCase):
   def testIndyLSTMCell(self):
     for dtype in [dtypes.float16, dtypes.float32]:
       np_dtype = dtype.as_numpy_dtype
-      with self.test_session(graph=ops.Graph()) as sess:
+      with self.session(graph=ops.Graph()) as sess:
         with variable_scope.variable_scope(
             "root", initializer=init_ops.constant_initializer(0.5)):
           x = array_ops.zeros([1, 2], dtype=dtype)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 1c20d88fe4..aa4562be7c 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -457,7 +457,7 @@ class LSTMTest(test.TestCase):
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(batch_size, num_units)
@@ -491,7 +491,7 @@ class LSTMTest(test.TestCase):
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(
@@ -588,7 +588,7 @@ class LSTMTest(test.TestCase):
     num_proj = 4
     max_length = 8
     sequence_length = [4, 6]
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
@@ -834,7 +834,7 @@ class LSTMTest(test.TestCase):
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       initializer_d = init_ops.random_uniform_initializer(
           -1, 1, seed=self._seed + 1)
@@ -884,7 +884,7 @@ class LSTMTest(test.TestCase):
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None, input_size))
@@ -930,7 +930,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     sequence_length = [4, 6]
     in_graph_mode = not context.executing_eagerly()
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
@@ -1006,7 +1006,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     sequence_length = [4, 6]
     in_graph_mode = not context.executing_eagerly()
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
@@ -1288,7 +1288,10 @@ class LSTMTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testDynamicEquivalentToStaticRNN(self):
     self._testDynamicEquivalentToStaticRNN(use_sequence_length=False)
-    self._testDynamicEquivalentToStaticRNN(use_sequence_length=False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDynamicEquivalentToStaticRNNWithSequenceLength(self):
+    self._testDynamicEquivalentToStaticRNN(use_sequence_length=True)
 
 
 class BidirectionalRNNTest(test.TestCase):
@@ -1609,7 +1612,7 @@ class MultiDimensionalLSTMTest(test.TestCase):
     batch_size = 2
     max_length = 8
     sequence_length = [4, 6]
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None,) + input_size)
       ]
@@ -1720,7 +1723,7 @@ class NestedLSTMTest(test.TestCase):
     state_size = 6
     max_length = 8
     sequence_length = [4, 6]
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       state_saver = TestStateSaver(batch_size, state_size)
       single_input = (array_ops.placeholder(
           dtypes.float32, shape=(None, input_size)),
@@ -2014,7 +2017,7 @@ class RawRNNTest(test.TestCase):
     np.random.seed(self._seed)
 
   def _testRawRNN(self, max_time):
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       batch_size = 16
       input_depth = 4
       num_units = 3
@@ -2123,7 +2126,7 @@ class RawRNNTest(test.TestCase):
     self._testRawRNN(max_time=10)
 
   def testLoopState(self):
-    with self.test_session(graph=ops_lib.Graph()):
+    with self.session(graph=ops_lib.Graph()):
       max_time = 10
       batch_size = 16
       input_depth = 4
@@ -2159,7 +2162,7 @@ class RawRNNTest(test.TestCase):
       self.assertEqual([10], loop_state.eval())
 
   def testLoopStateWithTensorArray(self):
-    with self.test_session(graph=ops_lib.Graph()):
+    with self.session(graph=ops_lib.Graph()):
       max_time = 4
       batch_size = 16
       input_depth = 4
@@ -2202,7 +2205,7 @@ class RawRNNTest(test.TestCase):
       self.assertAllEqual([1, 2, 2 + 2, 4 + 3, 7 + 4], loop_state.eval())
 
   def testEmitDifferentStructureThanCellOutput(self):
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       max_time = 10
       batch_size = 16
       input_depth = 4
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index c7d85862f6..2df8f0ec05 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1440,7 +1440,7 @@ class CompiledWrapperTest(test.TestCase):
     atol = 1e-5
 
     random_seed.set_random_seed(1234)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       xla_ops = _create_multi_lstm_cell_ops(
           batch_size=batch_size,
           num_units=num_units,
@@ -1452,7 +1452,7 @@ class CompiledWrapperTest(test.TestCase):
       xla_results = sess.run(xla_ops)
 
     random_seed.set_random_seed(1234)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       non_xla_ops = _create_multi_lstm_cell_ops(
           batch_size=batch_size,
           num_units=num_units,
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index fbb50befdf..e7eb4ac563 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -113,7 +113,6 @@ py_test(
     size = "small",
     srcs = ["python/saved_model/keras_saved_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":saved_model_py",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/saved_model/python/saved_model/reader_test.py b/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
index d10ec9cf0c..3e6ff65c33 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
@@ -43,7 +43,7 @@ class ReaderTest(test.TestCase):
   def testReadSavedModelValid(self):
     saved_model_dir = os.path.join(test.get_temp_dir(), "valid_saved_model")
     builder = saved_model_builder.SavedModelBuilder(saved_model_dir)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
     builder.save()
@@ -68,35 +68,35 @@ class ReaderTest(test.TestCase):
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
     # - a single tag (from predefined constants).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
 
     # Graph that updates the single variable. SavedModel invoked to:
     # - simply add the model (weights are not updated).
     # - a single tag (from predefined constants).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 43)
       builder.add_meta_graph([tag_constants.SERVING])
 
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple predefined tags.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 44)
       builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
 
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple predefined tags for serving on TPU.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 44)
       builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
 
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple custom tags.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 45)
       builder.add_meta_graph(["foo", "bar"])
 
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index a7279bc339..674f7cdb22 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -15,7 +15,9 @@
 
 """Ops for building neural network seq2seq decoders and losses.
 
-See the @{$python/contrib.seq2seq} guide.
+See the
+[Contrib Seq2seq](https://tensorflow.org/api_guides/python/contrib.seq2seq)
+guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index cd162bae25..f2c43f30d4 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -512,7 +512,7 @@ class AttentionWrapperTest(test.TestCase):
 
     for axis in [0, 1]:
       for exclusive in [True, False]:
-        with self.test_session():
+        with self.cached_session():
           # Compute cumprod with regular tf.cumprod
           cumprod_output = math_ops.cumprod(
               test_input, axis=axis, exclusive=exclusive).eval()
@@ -548,7 +548,7 @@ class AttentionWrapperTest(test.TestCase):
         for p, a in zip(p_choose_i, previous_attention)])
 
     # Compute output with TensorFlow function, for both calculation types
-    with self.test_session():
+    with self.cached_session():
       recursive_output = wrapper.monotonic_attention(
           p_choose_i, previous_attention, 'recursive').eval()
 
@@ -569,7 +569,7 @@ class AttentionWrapperTest(test.TestCase):
         for p, a in zip(p_choose_i, previous_attention)])
 
     # Compute output with TensorFlow function, for both calculation types
-    with self.test_session():
+    with self.cached_session():
       parallel_output = wrapper.monotonic_attention(
           p_choose_i, previous_attention, 'parallel').eval()
 
@@ -594,7 +594,7 @@ class AttentionWrapperTest(test.TestCase):
         for p, a in zip(p_choose_i, previous_attention)])
 
     # Compute output with TensorFlow function, for both calculation types
-    with self.test_session():
+    with self.cached_session():
       hard_output = wrapper.monotonic_attention(
           # TensorFlow is unhappy when these are not wrapped as tf.constant
           constant_op.constant(p_choose_i),
@@ -634,7 +634,7 @@ class AttentionWrapperTest(test.TestCase):
     recursive_output = [np.array([1] + [0]*(p_choose_i.shape[1] - 1),
                                  np.float32)]
     # Compute output with TensorFlow function, for both calculation types
-    with self.test_session():
+    with self.cached_session():
       for j in range(p_choose_i.shape[0]):
         # Compute attention distribution for this output time step
         recursive_output.append(wrapper.monotonic_attention(
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index 4073b390fc..f5b6b1bde9 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -66,7 +66,7 @@ class TestGatherTree(test.TestCase):
         max_sequence_lengths=max_sequence_lengths,
         end_token=11)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       res_ = sess.run(res)
 
     self.assertAllEqual(expected_result, res_)
@@ -115,7 +115,7 @@ class TestGatherTree(test.TestCase):
     sorted_array = beam_search_decoder.gather_tree_from_array(
         array, parent_ids, sequence_length)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sorted_array = sess.run(sorted_array)
       expected_array = sess.run(expected_array)
       self.assertAllEqual(expected_array, sorted_array)
@@ -170,7 +170,7 @@ class TestGatherTree(test.TestCase):
     sorted_array = beam_search_decoder.gather_tree_from_array(
         array, parent_ids, sequence_length)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sorted_array, expected_array = sess.run([sorted_array, expected_array])
       self.assertAllEqual(expected_array, sorted_array)
 
@@ -186,7 +186,7 @@ class TestArrayShapeChecks(test.TestCase):
     batch_size = array_ops.constant(batch_size)
     check_op = beam_search_decoder._check_batch_beam(t, batch_size, beam_width)  # pylint: disable=protected-access
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       if is_valid:
         sess.run(check_op)
       else:
@@ -220,7 +220,7 @@ class TestEosMasking(test.TestCase):
     masked = beam_search_decoder._mask_probs(probs, eos_token,
                                              previously_finished)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       probs = sess.run(probs)
       masked = sess.run(masked)
 
@@ -283,7 +283,7 @@ class TestBeamStep(test.TestCase):
         end_token=self.end_token,
         length_penalty_weight=self.length_penalty_weight)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       outputs_, next_state_, state_, log_probs_ = sess.run(
           [outputs, next_beam_state, beam_state, log_probs])
 
@@ -338,7 +338,7 @@ class TestBeamStep(test.TestCase):
         end_token=self.end_token,
         length_penalty_weight=self.length_penalty_weight)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       outputs_, next_state_, state_, log_probs_ = sess.run(
           [outputs, next_beam_state, beam_state, log_probs])
 
@@ -436,7 +436,7 @@ class TestLargeBeamStep(test.TestCase):
         end_token=self.end_token,
         length_penalty_weight=self.length_penalty_weight)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       outputs_, next_state_, _, _ = sess.run(
           [outputs, next_beam_state, beam_state, log_probs])
 
@@ -471,7 +471,7 @@ class BeamSearchDecoderTest(test.TestCase):
     output_layer = layers_core.Dense(vocab_size, use_bias=True, activation=None)
     beam_width = 3
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_size_tensor = constant_op.constant(batch_size)
       embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
       cell = rnn_cell.LSTMCell(cell_depth)
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
index 277c5b6ef7..9662a5780a 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@@ -67,7 +67,7 @@ class GatherTreeTest(test.TestCase):
           parent_ids=parent_ids,
           max_sequence_lengths=max_sequence_lengths,
           end_token=end_token)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError(
           r"parent id -1 at \(batch, time, beam\) == \(0, 0, 1\)"):
         _ = beams.eval()
diff --git a/tensorflow/contrib/session_bundle/session_bundle.cc b/tensorflow/contrib/session_bundle/session_bundle.cc
index cf26e3cae7..a690d9b129 100644
--- a/tensorflow/contrib/session_bundle/session_bundle.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle.cc
@@ -138,10 +138,10 @@ Status RunRestoreOp(const RunOptions& run_options, const StringPiece export_dir,
   Tensor variables_tensor =
       CreateStringTensor(GetVariablesFilename(export_dir));
   std::vector<std::pair<string, Tensor>> inputs = {
-      {variables_filename_const_op_name.ToString(), variables_tensor}};
+      {string(variables_filename_const_op_name), variables_tensor}};
   AddAssetsTensorsToInputs(export_dir, asset_files, &inputs);
   RunMetadata run_metadata;
-  return session->Run(run_options, inputs, {}, {restore_op_name.ToString()},
+  return session->Run(run_options, inputs, {}, {string(restore_op_name)},
                       nullptr /* outputs */, &run_metadata);
 }
 
@@ -152,7 +152,7 @@ Status RunInitOp(const RunOptions& run_options, const StringPiece export_dir,
   std::vector<std::pair<string, Tensor>> inputs;
   AddAssetsTensorsToInputs(export_dir, asset_files, &inputs);
   RunMetadata run_metadata;
-  return session->Run(run_options, inputs, {}, {init_op_name.ToString()},
+  return session->Run(run_options, inputs, {}, {string(init_op_name)},
                       nullptr /* outputs */, &run_metadata);
 }
 
@@ -251,15 +251,14 @@ Status LoadSessionBundleFromPathUsingRunOptions(const SessionOptions& options,
   auto log_and_count = [&](const string& status_str) {
     LOG(INFO) << "Loading SessionBundle: " << status_str << ". Took "
               << load_latency_microsecs << " microseconds.";
-    load_attempt_count->GetCell(export_dir.ToString(), status_str)
-        ->IncrementBy(1);
+    load_attempt_count->GetCell(string(export_dir), status_str)->IncrementBy(1);
   };
   if (status.ok()) {
     log_and_count(kLoadAttemptSuccess);
   } else {
     log_and_count(kLoadAttemptFail);
   }
-  load_latency->GetCell(export_dir.ToString())
+  load_latency->GetCell(string(export_dir))
       ->IncrementBy(load_latency_microsecs);
   return status;
 }
diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.py b/tensorflow/contrib/session_bundle/session_bundle_test.py
index a57e8920c5..3c06ec048d 100644
--- a/tensorflow/contrib/session_bundle/session_bundle_test.py
+++ b/tensorflow/contrib/session_bundle/session_bundle_test.py
@@ -167,7 +167,7 @@ class SessionBundleLoadNoVarsTest(test.TestCase):
       y = math_ops.subtract(w * x, 7.0, name="y")  # pylint: disable=unused-variable
       ops.add_to_collection("meta", "this is meta")
 
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         variables.global_variables_initializer().run()
         new_graph_def = graph_util.convert_variables_to_constants(
             session, g.as_graph_def(), ["y"])
diff --git a/tensorflow/contrib/signal/__init__.py b/tensorflow/contrib/signal/__init__.py
index 6a2080bcec..d088e74434 100644
--- a/tensorflow/contrib/signal/__init__.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """Signal processing operations.
 
-See the @{$python/contrib.signal} guide.
+See the
+[Contrib Signal](https://tensorflow.org/api_guides/python/contrib.signal)
+guide.
 
 @@frame
 @@hamming_window
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 2c97834523..cbfdaeb45d 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -100,7 +100,7 @@ class EvaluationTest(test.TestCase):
 
     # Save initialized variables to a checkpoint directory:
     saver = saver_lib.Saver()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       init_op.run()
       saver.save(sess, os.path.join(chkpt_dir, 'chkpt'))
 
@@ -211,7 +211,7 @@ class EvaluationTest(test.TestCase):
 
     # Save initialized variables to a checkpoint directory:
     saver = saver_lib.Saver()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       init_op.run()
       saver.save(sess, os.path.join(chkpt_dir, 'chkpt'))
 
@@ -248,7 +248,7 @@ class SingleEvaluationTest(test.TestCase):
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
     saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(init_op)
       saver.save(sess, checkpoint_path)
 
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index 831c6e427a..d92a7fbb47 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -73,7 +73,7 @@ class ClipGradientNormsTest(test.TestCase):
     # Ensure the variable passed through.
     self.assertEqual(gradients_to_variables[1], variable)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_gradient = sess.run(gradients_to_variables[0])
     np_testing.assert_almost_equal(actual_gradient, self._clipped_grad_vec)
 
@@ -164,7 +164,7 @@ class MultiplyGradientsTest(test.TestCase):
     # Ensure the variable passed through.
     self.assertEqual(grad_to_var[1], variable)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_gradient = sess.run(grad_to_var[0])
     np_testing.assert_almost_equal(actual_gradient, self._multiplied_grad_vec,
                                    5)
@@ -188,7 +188,7 @@ class MultiplyGradientsTest(test.TestCase):
     self.assertEqual(grad_to_var[0].indices, indices)
     self.assertEqual(grad_to_var[0].dense_shape, dense_shape)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_gradient = sess.run(grad_to_var[0].values)
     np_testing.assert_almost_equal(actual_gradient, self._multiplied_grad_vec,
                                    5)
@@ -204,7 +204,7 @@ class MultiplyGradientsTest(test.TestCase):
     [grad_to_var] = learning.multiply_gradients([grad_to_var],
                                                 gradient_multipliers)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables_lib.global_variables_initializer())
       gradient_true_flag = sess.run(grad_to_var[0])
       sess.run(multiplier_flag.assign(False))
diff --git a/tensorflow/contrib/slim/python/slim/nets/alexnet_test.py b/tensorflow/contrib/slim/python/slim/nets/alexnet_test.py
index eb93f753ae..b6d1afd27d 100644
--- a/tensorflow/contrib/slim/python/slim/nets/alexnet_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/alexnet_test.py
@@ -33,7 +33,7 @@ class AlexnetV2Test(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = alexnet.alexnet_v2(inputs, num_classes)
       self.assertEquals(logits.op.name, 'alexnet_v2/fc8/squeezed')
@@ -44,7 +44,7 @@ class AlexnetV2Test(test.TestCase):
     batch_size = 1
     height, width = 300, 400
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = alexnet.alexnet_v2(inputs, num_classes, spatial_squeeze=False)
       self.assertEquals(logits.op.name, 'alexnet_v2/fc8/BiasAdd')
@@ -55,7 +55,7 @@ class AlexnetV2Test(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       _, end_points = alexnet.alexnet_v2(inputs, num_classes)
       expected_names = [
@@ -70,7 +70,7 @@ class AlexnetV2Test(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       alexnet.alexnet_v2(inputs, num_classes)
       expected_names = [
@@ -98,7 +98,7 @@ class AlexnetV2Test(test.TestCase):
     batch_size = 2
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       eval_inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = alexnet.alexnet_v2(eval_inputs, is_training=False)
       self.assertListEqual(logits.get_shape().as_list(),
@@ -112,7 +112,7 @@ class AlexnetV2Test(test.TestCase):
     train_height, train_width = 224, 224
     eval_height, eval_width = 300, 400
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       train_inputs = random_ops.random_uniform(
           (train_batch_size, train_height, train_width, 3))
       logits, _ = alexnet.alexnet_v2(train_inputs)
@@ -132,7 +132,7 @@ class AlexnetV2Test(test.TestCase):
   def testForward(self):
     batch_size = 1
     height, width = 224, 224
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = alexnet.alexnet_v2(inputs)
       sess.run(variables.global_variables_initializer())
diff --git a/tensorflow/contrib/slim/python/slim/nets/inception_v1_test.py b/tensorflow/contrib/slim/python/slim/nets/inception_v1_test.py
index 7a3d1c9703..34f12d7591 100644
--- a/tensorflow/contrib/slim/python/slim/nets/inception_v1_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/inception_v1_test.py
@@ -143,7 +143,7 @@ class InceptionV1Test(test.TestCase):
     height, width = 224, 224
     num_classes = 1000
     input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(batch_size, None, None, 3))
       logits, end_points = inception_v1.inception_v1(inputs, num_classes)
@@ -167,7 +167,7 @@ class InceptionV1Test(test.TestCase):
     self.assertListEqual(logits.get_shape().as_list(), [None, num_classes])
     images = random_ops.random_uniform((batch_size, height, width, 3))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(logits, {inputs: images.eval()})
       self.assertEquals(output.shape, (batch_size, num_classes))
@@ -182,7 +182,7 @@ class InceptionV1Test(test.TestCase):
         eval_inputs, num_classes, is_training=False)
     predictions = math_ops.argmax(logits, 1)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(predictions)
       self.assertEquals(output.shape, (batch_size,))
@@ -200,7 +200,7 @@ class InceptionV1Test(test.TestCase):
     logits, _ = inception_v1.inception_v1(eval_inputs, num_classes, reuse=True)
     predictions = math_ops.argmax(logits, 1)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(predictions)
       self.assertEquals(output.shape, (eval_batch_size,))
@@ -211,7 +211,7 @@ class InceptionV1Test(test.TestCase):
     logits, _ = inception_v1.inception_v1(
         images, num_classes=num_classes, spatial_squeeze=False)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       variables.global_variables_initializer().run()
       logits_out = sess.run(logits)
       self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
diff --git a/tensorflow/contrib/slim/python/slim/nets/inception_v2_test.py b/tensorflow/contrib/slim/python/slim/nets/inception_v2_test.py
index 5fbc9e5aa3..66effba944 100644
--- a/tensorflow/contrib/slim/python/slim/nets/inception_v2_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/inception_v2_test.py
@@ -196,7 +196,7 @@ class InceptionV2Test(test.TestCase):
     height, width = 224, 224
     num_classes = 1000
     input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(batch_size, None, None, 3))
       logits, end_points = inception_v2.inception_v2(inputs, num_classes)
@@ -220,7 +220,7 @@ class InceptionV2Test(test.TestCase):
     self.assertListEqual(logits.get_shape().as_list(), [None, num_classes])
     images = random_ops.random_uniform((batch_size, height, width, 3))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(logits, {inputs: images.eval()})
       self.assertEquals(output.shape, (batch_size, num_classes))
@@ -235,7 +235,7 @@ class InceptionV2Test(test.TestCase):
         eval_inputs, num_classes, is_training=False)
     predictions = math_ops.argmax(logits, 1)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(predictions)
       self.assertEquals(output.shape, (batch_size,))
@@ -253,7 +253,7 @@ class InceptionV2Test(test.TestCase):
     logits, _ = inception_v2.inception_v2(eval_inputs, num_classes, reuse=True)
     predictions = math_ops.argmax(logits, 1)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(predictions)
       self.assertEquals(output.shape, (eval_batch_size,))
@@ -264,7 +264,7 @@ class InceptionV2Test(test.TestCase):
     logits, _ = inception_v2.inception_v2(
         images, num_classes=num_classes, spatial_squeeze=False)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       variables.global_variables_initializer().run()
       logits_out = sess.run(logits)
       self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
diff --git a/tensorflow/contrib/slim/python/slim/nets/inception_v3_test.py b/tensorflow/contrib/slim/python/slim/nets/inception_v3_test.py
index 6ba02318ed..0f9cca7bbd 100644
--- a/tensorflow/contrib/slim/python/slim/nets/inception_v3_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/inception_v3_test.py
@@ -226,7 +226,7 @@ class InceptionV3Test(test.TestCase):
     height, width = 299, 299
     num_classes = 1000
     input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(batch_size, None, None, 3))
       logits, end_points = inception_v3.inception_v3(inputs, num_classes)
@@ -249,7 +249,7 @@ class InceptionV3Test(test.TestCase):
     self.assertListEqual(logits.get_shape().as_list(), [None, num_classes])
     images = random_ops.random_uniform((batch_size, height, width, 3))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(logits, {inputs: images.eval()})
       self.assertEquals(output.shape, (batch_size, num_classes))
@@ -264,7 +264,7 @@ class InceptionV3Test(test.TestCase):
         eval_inputs, num_classes, is_training=False)
     predictions = math_ops.argmax(logits, 1)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(predictions)
       self.assertEquals(output.shape, (batch_size,))
@@ -283,7 +283,7 @@ class InceptionV3Test(test.TestCase):
         eval_inputs, num_classes, is_training=False, reuse=True)
     predictions = math_ops.argmax(logits, 1)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(predictions)
       self.assertEquals(output.shape, (eval_batch_size,))
@@ -294,7 +294,7 @@ class InceptionV3Test(test.TestCase):
     logits, _ = inception_v3.inception_v3(
         images, num_classes=num_classes, spatial_squeeze=False)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       variables.global_variables_initializer().run()
       logits_out = sess.run(logits)
       self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
diff --git a/tensorflow/contrib/slim/python/slim/nets/overfeat_test.py b/tensorflow/contrib/slim/python/slim/nets/overfeat_test.py
index 317af3cb29..44fa35ad14 100644
--- a/tensorflow/contrib/slim/python/slim/nets/overfeat_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/overfeat_test.py
@@ -33,7 +33,7 @@ class OverFeatTest(test.TestCase):
     batch_size = 5
     height, width = 231, 231
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = overfeat.overfeat(inputs, num_classes)
       self.assertEquals(logits.op.name, 'overfeat/fc8/squeezed')
@@ -44,7 +44,7 @@ class OverFeatTest(test.TestCase):
     batch_size = 1
     height, width = 281, 281
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = overfeat.overfeat(inputs, num_classes, spatial_squeeze=False)
       self.assertEquals(logits.op.name, 'overfeat/fc8/BiasAdd')
@@ -55,7 +55,7 @@ class OverFeatTest(test.TestCase):
     batch_size = 5
     height, width = 231, 231
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       _, end_points = overfeat.overfeat(inputs, num_classes)
       expected_names = [
@@ -70,7 +70,7 @@ class OverFeatTest(test.TestCase):
     batch_size = 5
     height, width = 231, 231
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       overfeat.overfeat(inputs, num_classes)
       expected_names = [
@@ -98,7 +98,7 @@ class OverFeatTest(test.TestCase):
     batch_size = 2
     height, width = 231, 231
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       eval_inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = overfeat.overfeat(eval_inputs, is_training=False)
       self.assertListEqual(logits.get_shape().as_list(),
@@ -112,7 +112,7 @@ class OverFeatTest(test.TestCase):
     train_height, train_width = 231, 231
     eval_height, eval_width = 281, 281
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       train_inputs = random_ops.random_uniform(
           (train_batch_size, train_height, train_width, 3))
       logits, _ = overfeat.overfeat(train_inputs)
@@ -132,7 +132,7 @@ class OverFeatTest(test.TestCase):
   def testForward(self):
     batch_size = 1
     height, width = 231, 231
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = overfeat.overfeat(inputs)
       sess.run(variables.global_variables_initializer())
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
index 576444214d..8ff44fe4b5 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
@@ -69,7 +69,7 @@ class ResnetUtilsTest(test.TestCase):
     x = resnet_utils.subsample(x, 2)
     expected = array_ops.reshape(
         constant_op.constant([0, 2, 6, 8]), [1, 2, 2, 1])
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(x.eval(), expected.eval())
 
   def testSubsampleFourByFour(self):
@@ -77,7 +77,7 @@ class ResnetUtilsTest(test.TestCase):
     x = resnet_utils.subsample(x, 2)
     expected = array_ops.reshape(
         constant_op.constant([0, 2, 8, 10]), [1, 2, 2, 1])
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(x.eval(), expected.eval())
 
   def testConv2DSameEven(self):
@@ -110,7 +110,7 @@ class ResnetUtilsTest(test.TestCase):
     y4_expected = math_ops.to_float([[48, 37], [37, 22]])
     y4_expected = array_ops.reshape(y4_expected, [1, n2, n2, 1])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       self.assertAllClose(y1.eval(), y1_expected.eval())
       self.assertAllClose(y2.eval(), y2_expected.eval())
@@ -148,7 +148,7 @@ class ResnetUtilsTest(test.TestCase):
     y4 = layers.conv2d(x, 1, [3, 3], stride=2, scope='Conv')
     y4_expected = y2_expected
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       self.assertAllClose(y1.eval(), y1_expected.eval())
       self.assertAllClose(y2.eval(), y2_expected.eval())
@@ -223,7 +223,7 @@ class ResnetUtilsTest(test.TestCase):
       with arg_scope([layers.batch_norm], is_training=False):
         for output_stride in [1, 2, 4, 8, None]:
           with ops.Graph().as_default():
-            with self.test_session() as sess:
+            with self.cached_session() as sess:
               random_seed.set_random_seed(0)
               inputs = create_test_input(1, height, width, 3)
               # Dense feature extraction followed by subsampling.
@@ -364,7 +364,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     for output_stride in [4, 8, 16, 32, None]:
       with arg_scope(resnet_utils.resnet_arg_scope()):
         with ops.Graph().as_default():
-          with self.test_session() as sess:
+          with self.cached_session() as sess:
             random_seed.set_random_seed(0)
             inputs = create_test_input(2, 81, 81, 3)
             # Dense feature extraction followed by subsampling.
@@ -401,7 +401,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     self.assertListEqual(logits.get_shape().as_list(),
                          [None, 1, 1, num_classes])
     images = create_test_input(batch, height, width, 3)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(logits, {inputs: images.eval()})
       self.assertEqual(output.shape, (batch, 1, 1, num_classes))
@@ -415,7 +415,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
       output, _ = self._resnet_small(inputs, None, global_pool=global_pool)
     self.assertListEqual(output.get_shape().as_list(), [batch, None, None, 32])
     images = create_test_input(batch, height, width, 3)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(output, {inputs: images.eval()})
       self.assertEqual(output.shape, (batch, 3, 3, 32))
@@ -431,7 +431,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
           inputs, None, global_pool=global_pool, output_stride=output_stride)
     self.assertListEqual(output.get_shape().as_list(), [batch, None, None, 32])
     images = create_test_input(batch, height, width, 3)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(output, {inputs: images.eval()})
       self.assertEqual(output.shape, (batch, 9, 9, 32))
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
index 6bdda18c5b..055ecff1c3 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
@@ -69,7 +69,7 @@ class ResnetUtilsTest(test.TestCase):
     x = resnet_utils.subsample(x, 2)
     expected = array_ops.reshape(
         constant_op.constant([0, 2, 6, 8]), [1, 2, 2, 1])
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(x.eval(), expected.eval())
 
   def testSubsampleFourByFour(self):
@@ -77,7 +77,7 @@ class ResnetUtilsTest(test.TestCase):
     x = resnet_utils.subsample(x, 2)
     expected = array_ops.reshape(
         constant_op.constant([0, 2, 8, 10]), [1, 2, 2, 1])
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(x.eval(), expected.eval())
 
   def testConv2DSameEven(self):
@@ -110,7 +110,7 @@ class ResnetUtilsTest(test.TestCase):
     y4_expected = math_ops.to_float([[48, 37], [37, 22]])
     y4_expected = array_ops.reshape(y4_expected, [1, n2, n2, 1])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       self.assertAllClose(y1.eval(), y1_expected.eval())
       self.assertAllClose(y2.eval(), y2_expected.eval())
@@ -151,7 +151,7 @@ class ResnetUtilsTest(test.TestCase):
     y4 = layers.conv2d(x, 1, [3, 3], stride=2, scope='Conv')
     y4_expected = y2_expected
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       self.assertAllClose(y1.eval(), y1_expected.eval())
       self.assertAllClose(y2.eval(), y2_expected.eval())
@@ -227,7 +227,7 @@ class ResnetUtilsTest(test.TestCase):
       with arg_scope([layers.batch_norm], is_training=False):
         for output_stride in [1, 2, 4, 8, None]:
           with ops.Graph().as_default():
-            with self.test_session() as sess:
+            with self.cached_session() as sess:
               random_seed.set_random_seed(0)
               inputs = create_test_input(1, height, width, 3)
               # Dense feature extraction followed by subsampling.
@@ -368,7 +368,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     for output_stride in [4, 8, 16, 32, None]:
       with arg_scope(resnet_utils.resnet_arg_scope()):
         with ops.Graph().as_default():
-          with self.test_session() as sess:
+          with self.cached_session() as sess:
             random_seed.set_random_seed(0)
             inputs = create_test_input(2, 81, 81, 3)
             # Dense feature extraction followed by subsampling.
@@ -405,7 +405,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     self.assertListEqual(logits.get_shape().as_list(),
                          [None, 1, 1, num_classes])
     images = create_test_input(batch, height, width, 3)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(logits, {inputs: images.eval()})
       self.assertEqual(output.shape, (batch, 1, 1, num_classes))
@@ -419,7 +419,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
       output, _ = self._resnet_small(inputs, None, global_pool=global_pool)
     self.assertListEqual(output.get_shape().as_list(), [batch, None, None, 32])
     images = create_test_input(batch, height, width, 3)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(output, {inputs: images.eval()})
       self.assertEqual(output.shape, (batch, 3, 3, 32))
@@ -435,7 +435,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
           inputs, None, global_pool=global_pool, output_stride=output_stride)
     self.assertListEqual(output.get_shape().as_list(), [batch, None, None, 32])
     images = create_test_input(batch, height, width, 3)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       sess.run(variables.global_variables_initializer())
       output = sess.run(output, {inputs: images.eval()})
       self.assertEqual(output.shape, (batch, 9, 9, 32))
diff --git a/tensorflow/contrib/slim/python/slim/nets/vgg_test.py b/tensorflow/contrib/slim/python/slim/nets/vgg_test.py
index 36628b32d1..71ce4b89cd 100644
--- a/tensorflow/contrib/slim/python/slim/nets/vgg_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/vgg_test.py
@@ -34,7 +34,7 @@ class VGGATest(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_a(inputs, num_classes)
       self.assertEquals(logits.op.name, 'vgg_a/fc8/squeezed')
@@ -45,7 +45,7 @@ class VGGATest(test.TestCase):
     batch_size = 1
     height, width = 256, 256
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_a(inputs, num_classes, spatial_squeeze=False)
       self.assertEquals(logits.op.name, 'vgg_a/fc8/BiasAdd')
@@ -73,7 +73,7 @@ class VGGATest(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       vgg.vgg_a(inputs, num_classes)
       expected_names = [
@@ -107,7 +107,7 @@ class VGGATest(test.TestCase):
     batch_size = 2
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       eval_inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_a(eval_inputs, is_training=False)
       self.assertListEqual(logits.get_shape().as_list(),
@@ -121,7 +121,7 @@ class VGGATest(test.TestCase):
     train_height, train_width = 224, 224
     eval_height, eval_width = 256, 256
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       train_inputs = random_ops.random_uniform(
           (train_batch_size, train_height, train_width, 3))
       logits, _ = vgg.vgg_a(train_inputs)
@@ -141,7 +141,7 @@ class VGGATest(test.TestCase):
   def testForward(self):
     batch_size = 1
     height, width = 224, 224
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_a(inputs)
       sess.run(variables.global_variables_initializer())
@@ -155,7 +155,7 @@ class VGG16Test(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_16(inputs, num_classes)
       self.assertEquals(logits.op.name, 'vgg_16/fc8/squeezed')
@@ -166,7 +166,7 @@ class VGG16Test(test.TestCase):
     batch_size = 1
     height, width = 256, 256
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_16(inputs, num_classes, spatial_squeeze=False)
       self.assertEquals(logits.op.name, 'vgg_16/fc8/BiasAdd')
@@ -197,7 +197,7 @@ class VGG16Test(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       vgg.vgg_16(inputs, num_classes)
       expected_names = [
@@ -241,7 +241,7 @@ class VGG16Test(test.TestCase):
     batch_size = 2
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       eval_inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_16(eval_inputs, is_training=False)
       self.assertListEqual(logits.get_shape().as_list(),
@@ -255,7 +255,7 @@ class VGG16Test(test.TestCase):
     train_height, train_width = 224, 224
     eval_height, eval_width = 256, 256
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       train_inputs = random_ops.random_uniform(
           (train_batch_size, train_height, train_width, 3))
       logits, _ = vgg.vgg_16(train_inputs)
@@ -275,7 +275,7 @@ class VGG16Test(test.TestCase):
   def testForward(self):
     batch_size = 1
     height, width = 224, 224
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_16(inputs)
       sess.run(variables.global_variables_initializer())
@@ -289,7 +289,7 @@ class VGG19Test(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_19(inputs, num_classes)
       self.assertEquals(logits.op.name, 'vgg_19/fc8/squeezed')
@@ -300,7 +300,7 @@ class VGG19Test(test.TestCase):
     batch_size = 1
     height, width = 256, 256
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_19(inputs, num_classes, spatial_squeeze=False)
       self.assertEquals(logits.op.name, 'vgg_19/fc8/BiasAdd')
@@ -332,7 +332,7 @@ class VGG19Test(test.TestCase):
     batch_size = 5
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       vgg.vgg_19(inputs, num_classes)
       expected_names = [
@@ -382,7 +382,7 @@ class VGG19Test(test.TestCase):
     batch_size = 2
     height, width = 224, 224
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       eval_inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_19(eval_inputs, is_training=False)
       self.assertListEqual(logits.get_shape().as_list(),
@@ -396,7 +396,7 @@ class VGG19Test(test.TestCase):
     train_height, train_width = 224, 224
     eval_height, eval_width = 256, 256
     num_classes = 1000
-    with self.test_session():
+    with self.cached_session():
       train_inputs = random_ops.random_uniform(
           (train_batch_size, train_height, train_width, 3))
       logits, _ = vgg.vgg_19(train_inputs)
@@ -416,7 +416,7 @@ class VGG19Test(test.TestCase):
   def testForward(self):
     batch_size = 1
     height, width = 224, 224
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs = random_ops.random_uniform((batch_size, height, width, 3))
       logits, _ = vgg.vgg_19(inputs)
       sess.run(variables.global_variables_initializer())
diff --git a/tensorflow/contrib/slim/python/slim/summaries_test.py b/tensorflow/contrib/slim/python/slim/summaries_test.py
index 873ee78de2..c6017f073e 100644
--- a/tensorflow/contrib/slim/python/slim/summaries_test.py
+++ b/tensorflow/contrib/slim/python/slim/summaries_test.py
@@ -88,7 +88,7 @@ class SummariesTest(test.TestCase):
     summary_op = summary.merge_all()
 
     summary_writer = summary.FileWriter(output_dir)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       new_summary = sess.run(summary_op)
       summary_writer.add_summary(new_summary, 1)
       summary_writer.flush()
diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD
index 0b8fc0cdc6..412a2c81a1 100644
--- a/tensorflow/contrib/stat_summarizer/BUILD
+++ b/tensorflow/contrib/stat_summarizer/BUILD
@@ -31,8 +31,5 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "no_windows",
-        "notap",  # TODO(b/80546574): test is flaky
-    ],
+    tags = ["notap"],  # TODO(b/80546574): test is flaky
 )
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 164f3e58e6..cf55fec488 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -515,6 +515,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":client_lib",
+        "//tensorflow/contrib/estimator:head",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/python:array_ops",
@@ -537,6 +538,7 @@ py_test(
     srcs = ["client/random_forest_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "noasan",
         "nomac",  # b/63258195
         "notsan",
     ],
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 35e8c92aba..db970deff5 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -18,14 +18,16 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import layers
+from tensorflow.contrib.estimator.python.estimator import head as core_head_lib
 from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
-
 from tensorflow.contrib.tensor_forest.client import eval_metrics
 from tensorflow.contrib.tensor_forest.python import tensor_forest
-
+from tensorflow.python.estimator import estimator as core_estimator
+from tensorflow.python.estimator.export.export_output import PredictOutput
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
@@ -34,12 +36,12 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 
-
 KEYS_NAME = 'keys'
 LOSS_NAME = 'rf_training_loss'
 TREE_PATHS_PREDICTION_KEY = 'tree_paths'
@@ -48,6 +50,11 @@ ALL_SERVING_KEY = 'tensorforest_all'
 EPSILON = 0.000001
 
 
+class ModelBuilderOutputType(object):
+  MODEL_FN_OPS = 0
+  ESTIMATOR_SPEC = 1
+
+
 class TensorForestRunOpAtEndHook(session_run_hook.SessionRunHook):
 
   def __init__(self, op_dict):
@@ -106,20 +113,40 @@ class TensorForestLossHook(session_run_hook.SessionRunHook):
       run_context.request_stop()
 
 
-def get_default_head(params, weights_name, name=None):
-  if params.regression:
-    return head_lib.regression_head(
-        weight_column_name=weights_name,
-        label_dimension=params.num_outputs,
-        enable_centered_bias=False,
-        head_name=name)
+def _get_default_head(params, weights_name, output_type, name=None):
+  """Creates a default head based on a type of a problem."""
+  if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+    if params.regression:
+      return head_lib.regression_head(
+          weight_column_name=weights_name,
+          label_dimension=params.num_outputs,
+          enable_centered_bias=False,
+          head_name=name)
+    else:
+      return head_lib.multi_class_head(
+          params.num_classes,
+          weight_column_name=weights_name,
+          enable_centered_bias=False,
+          head_name=name)
   else:
-    return head_lib.multi_class_head(
-        params.num_classes,
-        weight_column_name=weights_name,
-        enable_centered_bias=False,
-        head_name=name)
-
+    if params.regression:
+      return core_head_lib.regression_head(
+          weight_column=weights_name,
+          label_dimension=params.num_outputs,
+          name=name,
+          loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+    else:
+      if params.num_classes == 2:
+        return core_head_lib.binary_classification_head(
+            weight_column=weights_name,
+            name=name,
+            loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+      else:
+        return core_head_lib.multi_class_head(
+            n_classes=params.num_classes,
+            weight_column=weights_name,
+            name=name,
+            loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
 
 def get_model_fn(params,
                  graph_builder_class,
@@ -135,19 +162,27 @@ def get_model_fn(params,
                  report_feature_importances=False,
                  local_eval=False,
                  head_scope=None,
-                 include_all_in_serving=False):
+                 include_all_in_serving=False,
+                 output_type=ModelBuilderOutputType.MODEL_FN_OPS):
   """Return a model function given a way to construct a graph builder."""
   if model_head is None:
-    model_head = get_default_head(params, weights_name)
+    model_head = _get_default_head(params, weights_name, output_type)
 
   def _model_fn(features, labels, mode):
     """Function that returns predictions, training loss, and training op."""
+
     if (isinstance(features, ops.Tensor) or
         isinstance(features, sparse_tensor.SparseTensor)):
       features = {'features': features}
     if feature_columns:
       features = features.copy()
-      features.update(layers.transform_features(features, feature_columns))
+
+      if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+        features.update(layers.transform_features(features, feature_columns))
+      else:
+        for fc in feature_columns:
+          tensor = fc_core._transform_features(features, [fc])[fc]  # pylint: disable=protected-access
+          features[fc.name] = tensor
 
     weights = None
     if weights_name and weights_name in features:
@@ -201,52 +236,95 @@ def get_model_fn(params,
     def _train_fn(unused_loss):
       return training_graph
 
-    model_ops = model_head.create_model_fn_ops(
-        features=features,
-        labels=labels,
-        mode=mode,
-        train_op_fn=_train_fn,
-        logits=logits,
-        scope=head_scope)
 
     # Ops are run in lexigraphical order of their keys. Run the resource
     # clean-up op last.
     all_handles = graph_builder.get_all_resource_handles()
     ops_at_end = {
-        '9: clean up resources': control_flow_ops.group(
-            *[resource_variable_ops.destroy_resource_op(handle)
-              for handle in all_handles])}
+        '9: clean up resources':
+            control_flow_ops.group(*[
+                resource_variable_ops.destroy_resource_op(handle)
+                for handle in all_handles
+            ])
+    }
 
     if report_feature_importances:
       ops_at_end['1: feature_importances'] = (
           graph_builder.feature_importances())
 
-    training_hooks.append(TensorForestRunOpAtEndHook(ops_at_end))
-
-    if early_stopping_rounds:
-      training_hooks.append(
-          TensorForestLossHook(
-              early_stopping_rounds,
-              early_stopping_loss_threshold=early_stopping_loss_threshold,
-              loss_op=model_ops.loss))
-
-    model_ops.training_hooks.extend(training_hooks)
-
-    if keys is not None:
-      model_ops.predictions[keys_name] = keys
-
-    if params.inference_tree_paths:
-      model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
-
-    model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
-    if include_all_in_serving:
-      # In order to serve the variance we need to add the prediction dict
-      # to output_alternatives dict.
-      if not model_ops.output_alternatives:
-        model_ops.output_alternatives = {}
-      model_ops.output_alternatives[ALL_SERVING_KEY] = (
-          constants.ProblemType.UNSPECIFIED, model_ops.predictions)
-    return model_ops
+    training_hooks = [TensorForestRunOpAtEndHook(ops_at_end)]
+
+    if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+      model_ops = model_head.create_model_fn_ops(
+          features=features,
+          labels=labels,
+          mode=mode,
+          train_op_fn=_train_fn,
+          logits=logits,
+          scope=head_scope)
+
+      if early_stopping_rounds:
+        training_hooks.append(
+            TensorForestLossHook(
+                early_stopping_rounds,
+                early_stopping_loss_threshold=early_stopping_loss_threshold,
+                loss_op=model_ops.loss))
+
+      model_ops.training_hooks.extend(training_hooks)
+
+      if keys is not None:
+        model_ops.predictions[keys_name] = keys
+
+      if params.inference_tree_paths:
+        model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
+
+      model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
+
+      if include_all_in_serving:
+        # In order to serve the variance we need to add the prediction dict
+        # to output_alternatives dict.
+        if not model_ops.output_alternatives:
+          model_ops.output_alternatives = {}
+        model_ops.output_alternatives[ALL_SERVING_KEY] = (
+            constants.ProblemType.UNSPECIFIED, model_ops.predictions)
+
+      return model_ops
+
+    else:
+      # Estimator spec
+      estimator_spec = model_head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_train_fn,
+          logits=logits)
+
+      if early_stopping_rounds:
+        training_hooks.append(
+            TensorForestLossHook(
+                early_stopping_rounds,
+                early_stopping_loss_threshold=early_stopping_loss_threshold,
+                loss_op=estimator_spec.loss))
+
+      estimator_spec = estimator_spec._replace(
+          training_hooks=training_hooks + list(estimator_spec.training_hooks))
+      if keys is not None:
+        estimator_spec.predictions[keys_name] = keys
+      if params.inference_tree_paths:
+        estimator_spec.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
+      estimator_spec.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
+
+      if include_all_in_serving:
+        outputs = estimator_spec.export_outputs
+        if not outputs:
+          outputs = {}
+        outputs = {ALL_SERVING_KEY: PredictOutput(estimator_spec.predictions)}
+        print(estimator_spec.export_outputs)
+        # In order to serve the variance we need to add the prediction dict
+        # to output_alternatives dict.
+        estimator_spec = estimator_spec._replace(export_outputs=outputs)
+
+      return estimator_spec
 
   return _model_fn
 
@@ -493,8 +571,11 @@ class MultiForestMultiHeadEstimator(estimator.Estimator):
               params,
               graph_builder_class,
               device_assigner,
-              model_head=get_default_head(
-                  params, weight_column, name='head{0}'.format(i)),
+              model_head=_get_default_head(
+                  params,
+                  weight_column,
+                  name='head{0}'.format(i),
+                  output_type=ModelBuilderOutputType.MODEL_FN_OPS),
               weights_name=weight_column,
               keys_name=keys_column,
               early_stopping_rounds=early_stopping_rounds,
@@ -509,3 +590,142 @@ class MultiForestMultiHeadEstimator(estimator.Estimator):
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
+
+
+class CoreTensorForestEstimator(core_estimator.Estimator):
+  """A CORE estimator that can train and evaluate a random forest.
+
+  Example:
+
+  ```python
+  params = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams(
+      num_classes=2, num_features=40, num_trees=10, max_nodes=1000)
+
+  # Estimator using the default graph builder.
+  estimator = CoreTensorForestEstimator(params, model_dir=model_dir)
+
+  # Or estimator using TrainingLossForest as the graph builder.
+  estimator = CoreTensorForestEstimator(
+      params, graph_builder_class=tensor_forest.TrainingLossForest,
+      model_dir=model_dir)
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    ...
+  def input_fn_eval: # returns x, y
+    ...
+  estimator.train(input_fn=input_fn_train)
+  estimator.evaluate(input_fn=input_fn_eval)
+
+  # Predict returns an iterable of dicts.
+  results = list(estimator.predict(x=x))
+  prob0 = results[0][eval_metrics.INFERENCE_PROB_NAME]
+  prediction0 = results[0][eval_metrics.INFERENCE_PRED_NAME]
+  ```
+  """
+
+  def __init__(self,
+               params,
+               device_assigner=None,
+               model_dir=None,
+               feature_columns=None,
+               graph_builder_class=tensor_forest.RandomForestGraphs,
+               config=None,
+               weight_column=None,
+               keys_column=None,
+               feature_engineering_fn=None,
+               early_stopping_rounds=100,
+               early_stopping_loss_threshold=0.001,
+               num_trainers=1,
+               trainer_id=0,
+               report_feature_importances=False,
+               local_eval=False,
+               version=None,
+               head=None,
+               include_all_in_serving=False):
+    """Initializes a TensorForestEstimator instance.
+
+    Args:
+      params: ForestHParams object that holds random forest hyperparameters.
+        These parameters will be passed into `model_fn`.
+      device_assigner: An `object` instance that controls how trees get
+        assigned to devices. If `None`, will use
+        `tensor_forest.RandomForestDeviceAssigner`.
+      model_dir: Directory to save model parameters, graph, etc. To continue
+        training a previously saved model, load checkpoints saved to this
+        directory into an estimator.
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `_FeatureColumn`.
+      graph_builder_class: An `object` instance that defines how TF graphs for
+        random forest training and inference are built. By default will use
+        `tensor_forest.RandomForestGraphs`. Can be overridden by version
+        kwarg.
+      config: `RunConfig` object to configure the runtime settings.
+      weight_column: A string defining feature column name representing
+        weights. Will be multiplied by the loss of the example. Used to
+        downweight or boost examples during training.
+      keys_column: A string naming one of the features to strip out and
+        pass through into the inference/eval results dict.  Useful for
+        associating specific examples with their prediction.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      early_stopping_rounds: Allows training to terminate early if the forest is
+        no longer growing. 100 by default.  Set to a Falsy value to disable
+        the default training hook.
+      early_stopping_loss_threshold: Percentage (as fraction) that loss must
+        improve by within early_stopping_rounds steps, otherwise training will
+        terminate.
+      num_trainers: Number of training jobs, which will partition trees
+        among them.
+      trainer_id: Which trainer this instance is.
+      report_feature_importances: If True, print out feature importances
+        during evaluation.
+      local_eval: If True, don't use a device assigner for eval. This is to
+        support some common setups where eval is done on a single machine, even
+        though training might be distributed.
+      version: Unused.
+      head: A heads_lib.Head object that calculates losses and such. If None,
+        one will be automatically created based on params.
+      include_all_in_serving: if True, allow preparation of the complete
+        prediction dict including the variance to be exported for serving with
+        the Servo lib; and it also requires calling export_savedmodel with
+        default_output_alternative_key=ALL_SERVING_KEY, i.e.
+        estimator.export_savedmodel(export_dir_base=your_export_dir,
+          serving_input_fn=your_export_input_fn,
+          default_output_alternative_key=ALL_SERVING_KEY)
+        if False, resort to default behavior, i.e. export scores and
+          probabilities but no variances. In this case
+          default_output_alternative_key should be None while calling
+          export_savedmodel().
+        Note, that due to backward compatibility we cannot always set
+        include_all_in_serving to True because in this case calling
+        export_saved_model() without
+        default_output_alternative_key=ALL_SERVING_KEY (legacy behavior) the
+        saved_model_export_utils.get_output_alternatives() would raise
+        ValueError.
+
+    Returns:
+      A `TensorForestEstimator` instance.
+    """
+
+    super(CoreTensorForestEstimator, self).__init__(
+        model_fn=get_model_fn(
+            params.fill(),
+            graph_builder_class,
+            device_assigner,
+            feature_columns=feature_columns,
+            model_head=head,
+            weights_name=weight_column,
+            keys_name=keys_column,
+            early_stopping_rounds=early_stopping_rounds,
+            early_stopping_loss_threshold=early_stopping_loss_threshold,
+            num_trainers=num_trainers,
+            trainer_id=trainer_id,
+            report_feature_importances=report_feature_importances,
+            local_eval=local_eval,
+            include_all_in_serving=include_all_in_serving,
+            output_type=ModelBuilderOutputType.ESTIMATOR_SPEC),
+        model_dir=model_dir,
+        config=config)
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest_test.py b/tensorflow/contrib/tensor_forest/client/random_forest_test.py
index ac42364d25..aa0016b740 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest_test.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest_test.py
@@ -23,7 +23,39 @@ import numpy as np
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.tensor_forest.client import random_forest
 from tensorflow.contrib.tensor_forest.python import tensor_forest
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_utils
+
+
+def _get_classification_input_fns():
+  iris = base.load_iris()
+  data = iris.data.astype(np.float32)
+  labels = iris.target.astype(np.int32)
+
+  train_input_fn = numpy_io.numpy_input_fn(
+      x=data, y=labels, batch_size=150, num_epochs=None, shuffle=False)
+
+  predict_input_fn = numpy_io.numpy_input_fn(
+      x=data[:1,], y=None, batch_size=1, num_epochs=1, shuffle=False)
+  return train_input_fn, predict_input_fn
+
+
+def _get_regression_input_fns():
+  boston = base.load_boston()
+  data = boston.data.astype(np.float32)
+  labels = boston.target.astype(np.int32)
+
+  train_input_fn = numpy_io.numpy_input_fn(
+      x=data, y=labels, batch_size=506, num_epochs=None, shuffle=False)
+
+  predict_input_fn = numpy_io.numpy_input_fn(
+      x=data[:1,], y=None, batch_size=1, num_epochs=1, shuffle=False)
+  return train_input_fn, predict_input_fn
 
 
 class TensorForestTrainerTests(test.TestCase):
@@ -39,32 +71,287 @@ class TensorForestTrainerTests(test.TestCase):
         inference_tree_paths=True)
     classifier = random_forest.TensorForestEstimator(hparams.fill())
 
+    input_fn, predict_input_fn = _get_classification_input_fns()
+    classifier.fit(input_fn=input_fn, steps=100)
+    res = classifier.evaluate(input_fn=input_fn, steps=10)
+
+    self.assertEqual(1.0, res['accuracy'])
+    self.assertAllClose(0.55144483, res['loss'])
+
+    predictions = list(classifier.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0.576117, 0.211942, 0.211942]],
+                        [pred['probabilities'] for pred in predictions])
+
+  def testRegression(self):
+    """Tests regression using matrix data as input."""
+
+    hparams = tensor_forest.ForestHParams(
+        num_trees=5,
+        max_nodes=1000,
+        num_classes=1,
+        num_features=13,
+        regression=True,
+        split_after_samples=20)
+
+    regressor = random_forest.TensorForestEstimator(hparams.fill())
+
+    input_fn, predict_input_fn = _get_regression_input_fns()
+
+    regressor.fit(input_fn=input_fn, steps=100)
+    res = regressor.evaluate(input_fn=input_fn, steps=10)
+    self.assertGreaterEqual(0.1, res['loss'])
+
+    predictions = list(regressor.predict(input_fn=predict_input_fn))
+    self.assertAllClose([24.], [pred['scores'] for pred in predictions], atol=1)
+
+  def testAdditionalOutputs(self):
+    """Tests multi-class classification using matrix data as input."""
+    hparams = tensor_forest.ForestHParams(
+        num_trees=1,
+        max_nodes=100,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+    classifier = random_forest.TensorForestEstimator(
+        hparams.fill(), keys_column='keys', include_all_in_serving=True)
+
     iris = base.load_iris()
     data = iris.data.astype(np.float32)
     labels = iris.target.astype(np.int32)
 
-    classifier.fit(x=data, y=labels, steps=100, batch_size=50)
-    classifier.evaluate(x=data, y=labels, steps=10)
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'x': data,
+            'keys': np.arange(len(iris.data)).reshape(150, 1)
+        },
+        y=labels,
+        batch_size=10,
+        num_epochs=1,
+        shuffle=False)
 
-  def testRegression(self):
+    classifier.fit(input_fn=input_fn, steps=100)
+    predictions = list(classifier.predict(input_fn=input_fn))
+    # Check that there is a key column, tree paths and var.
+    for pred in predictions:
+      self.assertTrue('keys' in pred)
+      self.assertTrue('tree_paths' in pred)
+      self.assertTrue('prediction_variance' in pred)
+
+  def _assert_checkpoint(self, model_dir, global_step):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertLessEqual(
+        reader.get_tensor(ops.GraphKeys.GLOBAL_STEP), global_step)
+
+  def testEarlyStopping(self):
     """Tests multi-class classification using matrix data as input."""
+    hparams = tensor_forest.ForestHParams(
+        num_trees=100,
+        max_nodes=10000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+    classifier = random_forest.TensorForestEstimator(
+        hparams.fill(),
+        # Set a crazy threshold - 30% loss change.
+        early_stopping_loss_threshold=0.3,
+        early_stopping_rounds=2)
+
+    input_fn, _ = _get_classification_input_fns()
+    classifier.fit(input_fn=input_fn, steps=100)
+
+    # We stopped early.
+    self._assert_checkpoint(classifier.model_dir, global_step=5)
+
+
+class CoreTensorForestTests(test.TestCase):
+
+  def testTrainEvaluateInferDoesNotThrowErrorForClassifier(self):
+    head_fn = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
 
     hparams = tensor_forest.ForestHParams(
         num_trees=3,
         max_nodes=1000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+
+    est = random_forest.CoreTensorForestEstimator(hparams.fill(), head=head_fn)
+
+    input_fn, predict_input_fn = _get_classification_input_fns()
+
+    est.train(input_fn=input_fn, steps=100)
+    res = est.evaluate(input_fn=input_fn, steps=1)
+
+    self.assertEqual(1.0, res['accuracy'])
+    self.assertAllClose(0.55144483, res['loss'])
+
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0.576117, 0.211942, 0.211942]],
+                        [pred['probabilities'] for pred in predictions])
+
+  def testRegression(self):
+    """Tests regression using matrix data as input."""
+    head_fn = head_lib._regression_head(
+        label_dimension=1,
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    hparams = tensor_forest.ForestHParams(
+        num_trees=5,
+        max_nodes=1000,
         num_classes=1,
         num_features=13,
         regression=True,
         split_after_samples=20)
 
-    regressor = random_forest.TensorForestEstimator(hparams.fill())
+    regressor = random_forest.CoreTensorForestEstimator(
+        hparams.fill(), head=head_fn)
+
+    input_fn, predict_input_fn = _get_regression_input_fns()
+
+    regressor.train(input_fn=input_fn, steps=100)
+    res = regressor.evaluate(input_fn=input_fn, steps=10)
+    self.assertGreaterEqual(0.1, res['loss'])
+
+    predictions = list(regressor.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[24.]], [pred['predictions'] for pred in predictions], atol=1)
+
+  def testWithFeatureColumns(self):
+    head_fn = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    hparams = tensor_forest.ForestHParams(
+        num_trees=3,
+        max_nodes=1000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+
+    est = random_forest.CoreTensorForestEstimator(
+        hparams.fill(),
+        head=head_fn,
+        feature_columns=[core_feature_column.numeric_column('x')])
+
+    iris = base.load_iris()
+    data = {'x': iris.data.astype(np.float32)}
+    labels = iris.target.astype(np.int32)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x=data, y=labels, batch_size=150, num_epochs=None, shuffle=False)
+
+    est.train(input_fn=input_fn, steps=100)
+    res = est.evaluate(input_fn=input_fn, steps=1)
+
+    self.assertEqual(1.0, res['accuracy'])
+    self.assertAllClose(0.55144483, res['loss'])
+
+  def testAutofillsClassificationHead(self):
+    hparams = tensor_forest.ForestHParams(
+        num_trees=3,
+        max_nodes=1000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+
+    est = random_forest.CoreTensorForestEstimator(hparams.fill())
+
+    input_fn, _ = _get_classification_input_fns()
+
+    est.train(input_fn=input_fn, steps=100)
+    res = est.evaluate(input_fn=input_fn, steps=1)
+
+    self.assertEqual(1.0, res['accuracy'])
+    self.assertAllClose(0.55144483, res['loss'])
+
+  def testAutofillsRegressionHead(self):
+    hparams = tensor_forest.ForestHParams(
+        num_trees=5,
+        max_nodes=1000,
+        num_classes=1,
+        num_features=13,
+        regression=True,
+        split_after_samples=20)
+
+    regressor = random_forest.CoreTensorForestEstimator(hparams.fill())
+
+    input_fn, predict_input_fn = _get_regression_input_fns()
+
+    regressor.train(input_fn=input_fn, steps=100)
+    res = regressor.evaluate(input_fn=input_fn, steps=10)
+    self.assertGreaterEqual(0.1, res['loss'])
+
+    predictions = list(regressor.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[24.]], [pred['predictions'] for pred in predictions], atol=1)
+
+  def testAdditionalOutputs(self):
+    """Tests multi-class classification using matrix data as input."""
+    hparams = tensor_forest.ForestHParams(
+        num_trees=1,
+        max_nodes=100,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
+    classifier = random_forest.CoreTensorForestEstimator(
+        hparams.fill(), keys_column='keys', include_all_in_serving=True)
+
+    iris = base.load_iris()
+    data = iris.data.astype(np.float32)
+    labels = iris.target.astype(np.int32)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'x': data,
+            'keys': np.arange(len(iris.data)).reshape(150, 1)
+        },
+        y=labels,
+        batch_size=10,
+        num_epochs=1,
+        shuffle=False)
+
+    classifier.train(input_fn=input_fn, steps=100)
+    predictions = list(classifier.predict(input_fn=input_fn))
+    # Check that there is a key column, tree paths and var.
+    for pred in predictions:
+      self.assertTrue('keys' in pred)
+      self.assertTrue('tree_paths' in pred)
+      self.assertTrue('prediction_variance' in pred)
+
+  def _assert_checkpoint(self, model_dir, global_step):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertLessEqual(
+        reader.get_tensor(ops.GraphKeys.GLOBAL_STEP), global_step)
+
+  def testEarlyStopping(self):
+    head_fn = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    hparams = tensor_forest.ForestHParams(
+        num_trees=3,
+        max_nodes=1000,
+        num_classes=3,
+        num_features=4,
+        split_after_samples=20,
+        inference_tree_paths=True)
 
-    boston = base.load_boston()
-    data = boston.data.astype(np.float32)
-    labels = boston.target.astype(np.int32)
+    est = random_forest.CoreTensorForestEstimator(
+        hparams.fill(),
+        head=head_fn,
+        # Set a crazy threshold - 30% loss change.
+        early_stopping_loss_threshold=0.3,
+        early_stopping_rounds=2)
 
-    regressor.fit(x=data, y=labels, steps=100, batch_size=50)
-    regressor.evaluate(x=data, y=labels, steps=10)
+    input_fn, _ = _get_classification_input_fns()
+    est.train(input_fn=input_fn, steps=100)
+    # We stopped early.
+    self._assert_checkpoint(est.model_dir, global_step=8)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
index 69a0143a4e..1ed3d8ca2e 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef LEARNING_LIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
-#define LEARNING_LIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
 #include <vector>
 
 #include "tensorflow/core/framework/tensor.h"
@@ -43,4 +43,4 @@ void GetFeatureSet(int32 tree_num, int32 node_num, int32 random_seed,
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // LEARNING_LIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests/k_feature_routing_function_op_test.py b/tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests/k_feature_routing_function_op_test.py
index 980f53253d..cc053f3b94 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests/k_feature_routing_function_op_test.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests/k_feature_routing_function_op_test.py
@@ -58,7 +58,7 @@ class KFeatureRoutingFunctionTest(test_util.TensorFlowTestCase):
     self.assertEquals(self.params.num_features_per_node, 2)
 
   def testRoutingFunction(self):
-    with self.test_session():
+    with self.cached_session():
       route_tensor = gen_training_ops.k_feature_routing_function(
           self.input_data,
           self.tree_weights,
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests/routing_function_op_test.py b/tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests/routing_function_op_test.py
index a27fd49d32..554f7b0d7a 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests/routing_function_op_test.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests/routing_function_op_test.py
@@ -36,7 +36,7 @@ class RoutingFunctionTest(test_util.TensorFlowTestCase):
     self.ops = training_ops.Load()
 
   def testRoutingFunction(self):
-    with self.test_session():
+    with self.cached_session():
       route_tensor = gen_training_ops.routing_function(
           self.input_data, self.tree_weights, self.tree_thresholds, max_nodes=3)
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/data_spec.h b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
index bb33400214..336a7a3239 100644
--- a/tensorflow/contrib/tensor_forest/kernels/data_spec.h
+++ b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
@@ -15,8 +15,8 @@
 // This is a surrogate for using a proto, since it doesn't seem to be possible
 // to use protos in a dynamically-loaded/shared-linkage library, which is
 // what is used for custom ops in tensorflow/contrib.
-#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
-#define TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_DATA_SPEC_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_DATA_SPEC_H_
 #include <unordered_map>
 
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -139,4 +139,4 @@ class TensorForestDataSpec {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_DATA_SPEC_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index 03aab1b61e..e04eb60f9b 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_
 
 #include <limits>
 
@@ -302,4 +302,4 @@ void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc
index 6cb2c881e2..7716536ba4 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc
@@ -54,17 +54,24 @@ InequalityDecisionNodeEvaluator::InequalityDecisionNodeEvaluator(
   CHECK(safe_strto32(test.feature_id().id().value(), &feature_num_))
       << "Invalid feature ID: [" << test.feature_id().id().value() << "]";
   threshold_ = test.threshold().float_value();
-  include_equals_ =
-      test.type() == decision_trees::InequalityTest::LESS_OR_EQUAL;
+  _test_type = test.type();
 }
 
 int32 InequalityDecisionNodeEvaluator::Decide(
     const std::unique_ptr<TensorDataSet>& dataset, int example) const {
   const float val = dataset->GetExampleValue(example, feature_num_);
-  if (val < threshold_ || (include_equals_ && val == threshold_)) {
-    return left_child_id_;
-  } else {
-    return right_child_id_;
+  switch (_test_type) {
+    case decision_trees::InequalityTest::LESS_OR_EQUAL:
+      return val <= threshold_ ? left_child_id_ : right_child_id_;
+    case decision_trees::InequalityTest::LESS_THAN:
+      return val < threshold_ ? left_child_id_ : right_child_id_;
+    case decision_trees::InequalityTest::GREATER_OR_EQUAL:
+      return val >= threshold_ ? left_child_id_ : right_child_id_;
+    case decision_trees::InequalityTest::GREATER_THAN:
+      return val > threshold_ ? left_child_id_ : right_child_id_;
+    default:
+      LOG(ERROR) << "Unknown split test type: " << _test_type;
+      return -1;
   }
 }
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index 3db351c328..6497787f84 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -55,9 +55,7 @@ class InequalityDecisionNodeEvaluator : public BinaryDecisionNodeEvaluator {
  protected:
   int32 feature_num_;
   float threshold_;
-
-  // If decision is '<=' as opposed to '<'.
-  bool include_equals_;
+  ::tensorflow::decision_trees::InequalityTest_Type _test_type;
 };
 
 // Evaluator for splits with multiple weighted features.
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
index af5cf72a3c..3db1335563 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
@@ -60,6 +60,40 @@ TEST(InequalityDecisionNodeEvaluatorTest, TestStrictlyLess) {
   ASSERT_EQ(eval->Decide(dataset, 4), 1);
 }
 
+TEST(InequalityDecisionNodeEvaluatorTest, TestGreaterOrEqual) {
+  InequalityTest test;
+  test.mutable_feature_id()->mutable_id()->set_value("0");
+  test.mutable_threshold()->set_float_value(3.0);
+  test.set_type(InequalityTest::GREATER_OR_EQUAL);
+  std::unique_ptr<InequalityDecisionNodeEvaluator> eval(
+      new InequalityDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 1));
+
+  ASSERT_EQ(eval->Decide(dataset, 2), 1);
+  ASSERT_EQ(eval->Decide(dataset, 3), 0);
+  ASSERT_EQ(eval->Decide(dataset, 4), 0);
+}
+
+TEST(InequalityDecisionNodeEvaluatorTest, TestStrictlyGreater) {
+  InequalityTest test;
+  test.mutable_feature_id()->mutable_id()->set_value("0");
+  test.mutable_threshold()->set_float_value(3.0);
+  test.set_type(InequalityTest::GREATER_THAN);
+  std::unique_ptr<InequalityDecisionNodeEvaluator> eval(
+      new InequalityDecisionNodeEvaluator(test, 0, 1));
+
+  std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
+      new tensorflow::tensorforest::TestableDataSet(
+          {0.0, 1.0, 2.0, 3.0, 4.0, 5.0}, 1));
+
+  ASSERT_EQ(eval->Decide(dataset, 2), 1);
+  ASSERT_EQ(eval->Decide(dataset, 3), 1);
+  ASSERT_EQ(eval->Decide(dataset, 4), 0);
+}
+
 TEST(MatchingDecisionNodeEvaluatorTest, Basic) {
   MatchingValuesTest test;
   test.mutable_feature_id()->mutable_id()->set_value("0");
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
index d43884481a..99c5800391 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
@@ -130,7 +130,11 @@ void TensorDataSet::RandomSample(int example,
       num_total_features += num_sparse;
     }
   }
-  int rand_feature = rng_->Uniform(num_total_features);
+  int rand_feature = 0;
+  {
+    mutex_lock lock(mu_);
+    rand_feature = rng_->Uniform(num_total_features);
+  }
   if (rand_feature < available_features_.size()) {  // it's dense.
     *feature_id = available_features_[rand_feature];
     *type = input_spec_.GetDenseFeatureType(rand_feature);
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
index 95f75b4d7e..4945b53007 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
@@ -25,6 +25,7 @@
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 namespace tensorforest {
@@ -120,6 +121,8 @@ class TensorDataSet {
   int32 split_sampling_random_seed_;
   std::unique_ptr<random::PhiloxRandom> single_rand_;
   std::unique_ptr<random::SimplePhilox> rng_;
+  // Mutex for using random number generator.
+  mutable mutex mu_;
 };
 }  // namespace tensorforest
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index fc0d22d112..122a67a407 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -279,7 +279,9 @@ tf_cuda_library(
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -293,6 +295,31 @@ tf_cuda_library(
     ]) + tf_custom_op_library_additional_deps(),
 )
 
+tf_cuda_cc_test(
+    name = "convert_graph_test",
+    size = "medium",
+    srcs = ["convert/convert_graph_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_conversion",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
 # Library for the segmenting portion of TensorRT operation creation
 cc_library(
     name = "segment",
@@ -387,17 +414,19 @@ cuda_py_tests(
     name = "tf_trt_integration_test",
     srcs = [
         "test/base_test.py",
-        # "test/batch_matmul_test.py",
-        # "test/biasadd_matmul_test.py",
-        # "test/binary_tensor_weight_broadcast_test.py",  # Blocked by trt4 installation
-        # "test/concatenation_test.py",  # Blocked by trt4 installation
+        "test/batch_matmul_test.py",
+        "test/biasadd_matmul_test.py",
+        "test/binary_tensor_weight_broadcast_test.py",
+        "test/concatenation_test.py",
         "test/const_broadcast_test.py",
+        "test/manual_test.py",
+        "test/memory_alignment_test.py",
         "test/multi_connection_neighbor_engine_test.py",
         "test/neighboring_engine_test.py",
-        # "test/unary_test.py",  # Blocked by trt4 installation
-        # "test/vgg_block_nchw_test.py",
-        # "test/vgg_block_test.py",
-        "test/memory_alignment_test.py",
+        "test/rank_two_test.py",
+        "test/unary_test.py",
+        "test/vgg_block_nchw_test.py",
+        "test/vgg_block_test.py",
     ],
     additional_deps = [
         ":tf_trt_integration_test_base",
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 21ec8b0b30..b019c99882 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -31,6 +31,9 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -772,33 +775,55 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
     const ConversionParams& params, const EngineInfo& engine) {
   int cuda_device_id = -1;
   tensorflow::Allocator* dev_allocator = nullptr;
-  if (params.cluster) {
-    std::vector<tensorflow::Device*> devices;
-    if (!engine.device.empty() && params.cluster->GetDeviceSet()) {
-      DeviceNameUtils::ParsedName parsed_name;
-      if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) &&
-          parsed_name.has_id) {
-        params.cluster->GetDeviceSet()->FindMatchingDevices(parsed_name,
-                                                            &devices);
+  if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
+      engine.device.empty()) {
+    // If device is not set, use the first found GPU device for the conversion.
+    for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) {
+      TfGpuId tf_gpu_id(tf_gpu_id_value);
+      CudaGpuId cuda_gpu_id;
+      Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+      if (s.ok()) {
+        VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
+                << cuda_gpu_id.value();
+        cuda_device_id = cuda_gpu_id.value();
+        GPUOptions gpu_options;
+        // If the TF to Cuda gpu id mapping exist, the device and corresponding
+        // allocator must have been initialized already, so the
+        // GetGPUAllocator() call won't create a new allocator.
+        dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
+            gpu_options, tf_gpu_id, 1);
+        break;
       }
+      LOG(ERROR) << "TF GPU with id " << tf_gpu_id_value << " does not exist "
+                 << s;
     }
-    if (!devices.empty()) {
-      if (devices.size() > 1) {
-        string msg = "Found multiple matching devices using name '";
-        StrAppend(&msg, engine.device, "': ");
-        for (auto d : devices) StrAppend(&msg, d->name(), ", ");
-        StrAppend(&msg, ". Will get the allocator from first one.");
-        LOG(WARNING) << msg;
-      }
-      tensorflow::AllocatorAttributes alloc_attr;
-      cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
-      dev_allocator = devices[0]->GetAllocator(alloc_attr);
-      VLOG(1) << "Using allocator " << dev_allocator->Name()
-              << " and cuda_device_id " << cuda_device_id;
-    } else {
-      LOG(WARNING) << "Cluster is set but device '" << engine.device
-                   << "' is not found in the cluster";
+    return std::make_pair(cuda_device_id, dev_allocator);
+  }
+
+  // Use the device requested by the engine.
+  auto device_set = params.cluster->GetDeviceSet();
+  std::vector<tensorflow::Device*> devices;
+  DeviceNameUtils::ParsedName parsed_name;
+  if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) &&
+      parsed_name.has_id) {
+    device_set->FindMatchingDevices(parsed_name, &devices);
+  }
+  if (!devices.empty()) {
+    if (devices.size() > 1) {
+      string msg = "Found multiple matching devices using name '";
+      StrAppend(&msg, engine.device, "': ");
+      for (auto d : devices) StrAppend(&msg, d->name(), ", ");
+      StrAppend(&msg, ". Will get the allocator from first one.");
+      LOG(WARNING) << msg;
     }
+    tensorflow::AllocatorAttributes alloc_attr;
+    cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
+    dev_allocator = devices[0]->GetAllocator(alloc_attr);
+    VLOG(1) << "Using allocator " << dev_allocator->Name()
+            << " and cuda_device_id " << cuda_device_id;
+  } else {
+    LOG(WARNING) << "Cluster is set but device '" << engine.device
+                 << "' is not found in the cluster";
   }
   return std::make_pair(cuda_device_id, dev_allocator);
 }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 9d986e4890..3525202369 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -84,6 +85,11 @@ std::vector<int> GetLinkedTensorRTVersion();
 
 // Return runtime time TensorRT library version information.
 std::vector<int> GetLoadedTensorRTVersion();
+
+// Helper method for the conversion, expose for testing.
+std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
+    const ConversionParams& params, const EngineInfo& engine);
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
new file mode 100644
index 0000000000..8146bed4b0
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
+#include "tensorflow/core/public/session.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+class FakeCluster : public grappler::Cluster {
+ public:
+  FakeCluster() : Cluster(0) {}
+
+  void SetDeviceSet(const DeviceSet* device_set) { device_set_ = device_set; }
+
+  const DeviceSet* GetDeviceSet() const override { return device_set_; }
+
+  string type() const override { return ""; }
+  Status Provision() override { return Status::OK(); }
+  Status Initialize(const grappler::GrapplerItem& item) override {
+    return Status::OK();
+  }
+  Status Run(const GraphDef& graph_def,
+             const std::vector<std::pair<string, Tensor>>& feed,
+             const std::vector<string>& fetch,
+             RunMetadata* metadata) override {
+    return Status::OK();
+  }
+
+ private:
+  const DeviceSet* device_set_;
+};
+
+TEST(ConvertGraphTest, GetDeviceAndAllocator) {
+  ConversionParams params;
+  EngineInfo engine_info;
+  {
+    // params.cluster is not set, and no gpu device is available.
+    auto result = GetDeviceAndAllocator(params, engine_info);
+    EXPECT_EQ(-1, result.first);
+    EXPECT_EQ(nullptr, result.second);
+  }
+
+  // Create a session with two (virtual) gpu device.
+  SessionOptions options;
+  ConfigProto* config = &options.config;
+  GPUOptions* gpu_options = config->mutable_gpu_options();
+  auto virtual_devices =
+      gpu_options->mutable_experimental()->add_virtual_devices();
+  virtual_devices->add_memory_limit_mb(200);
+  virtual_devices->add_memory_limit_mb(200);
+  std::unique_ptr<Session> session(NewSession(options));
+
+  {
+    // params.cluster is not set, should find and return first gpu id and
+    // corresponding allocator.
+    auto result = GetDeviceAndAllocator(params, engine_info);
+    EXPECT_EQ(0, result.first);
+    EXPECT_NE(nullptr, result.second);
+    EXPECT_EQ("GPU_0_bfc", result.second->Name());
+  }
+
+  FakeCluster cluster;
+  params.cluster = &cluster;
+  {
+    // params.cluster->GetDeviceSet() returns null, should find and return first
+    // gpu id and corresponding allocator.
+    auto result = GetDeviceAndAllocator(params, engine_info);
+    EXPECT_EQ(0, result.first);
+    EXPECT_NE(nullptr, result.second);
+    EXPECT_EQ("GPU_0_bfc", result.second->Name());
+  }
+
+  // Build the DeviceSet.
+  DeviceSet device_set;
+  const DeviceMgr* device_mgr = nullptr;
+  TF_ASSERT_OK(session->LocalDeviceManager(&device_mgr));
+  for (auto d : device_mgr->ListDevices()) {
+    device_set.AddDevice(d);
+  }
+  cluster.SetDeviceSet(&device_set);
+  {
+    // engine_info.device is not set, should find and return first gpu id and
+    // corresponding allocator.
+    auto result = GetDeviceAndAllocator(params, engine_info);
+    EXPECT_EQ(0, result.first);
+    EXPECT_NE(nullptr, result.second);
+    EXPECT_EQ("GPU_0_bfc", result.second->Name());
+  }
+
+  engine_info.device = "/GPU:1";
+  {
+    // Set to use second device.
+    auto result = GetDeviceAndAllocator(params, engine_info);
+    EXPECT_EQ(0, result.first);
+    EXPECT_NE(nullptr, result.second);
+    EXPECT_EQ("GPU_1_bfc", result.second->Name());
+  }
+
+  engine_info.device = "/GPU:3";
+  {
+    // Set to use nonexistent device.
+    auto result = GetDeviceAndAllocator(params, engine_info);
+    EXPECT_EQ(-1, result.first);
+    EXPECT_EQ(nullptr, result.second);
+  }
+}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 35fa590254..c98b07ad8b 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
 #include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.pb.h"        // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"  // NOLINT
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -77,6 +78,10 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tensorrt {
+// TODO(aaroey): put these constants into some class.
+const char* const kInputPHName = "TensorRTInputPH_";
+const char* const kOutputPHName = "TensorRTOutputPH_";
+
 namespace convert {
 using ::tensorflow::str_util::Split;
 using ::tensorflow::strings::StrAppend;
@@ -155,12 +160,22 @@ tensorflow::Status ValidateInputProperties(const PartialTensorShape& shape,
   for (int d = 1; d < shape.dims(); ++d) {
     if (shape.dim_size(d) < 0) {
       return tensorflow::errors::InvalidArgument(
-          "Input tensor has a unknown non-batch dimemension at dim ", d);
+          "Input tensor with shape ", shape.DebugString(),
+          " has an unknown non-batch dimemension at dim ", d);
     }
   }
   return Status::OK();
 }
 
+string DebugString(const nvinfer1::Dims& dims) {
+  string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d=");
+  for (int i = 0; i < nvinfer1::Dims::MAX_DIMS; ++i) {
+    StrAppend(&out, dims.d[i], ",");
+  }
+  StrAppend(&out, ")");
+  return out;
+}
+
 // Return whether or not the broadcast is feasible;
 bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
                                const bool operand_l_is_tensor,
@@ -353,6 +368,13 @@ class TRT_ShapedWeights {
   // Default converter
   operator nvinfer1::Weights() const { return GetWeightsForTRT(); }
 
+  string DebugString() const {
+    return StrCat(
+        "TRT_ShapedWeights(shape=", convert::DebugString(shape_), ", type=",
+        type_, ", values=", reinterpret_cast<uintptr_t>(values_),
+        ", empty_weight_flag=", empty_weight_flag_, ")");
+  }
+
   // TODO(aaroey): make these private.
   nvinfer1::Dims shape_;
   tensorflow::DataType type_;
@@ -367,11 +389,14 @@ class TRT_TensorOrWeights {
  public:
   explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor)
       : tensor_(tensor), weights_(DT_FLOAT), variant_(TRT_NODE_TENSOR) {}
+
   explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
       : tensor_(nullptr), weights_(weights), variant_(TRT_NODE_WEIGHTS) {}
+
   // TODO(aaroey): use rvalue reference.
   TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
       : tensor_(rhs.tensor_), weights_(rhs.weights_), variant_(rhs.variant_) {}
+
   ~TRT_TensorOrWeights() {}
 
   bool is_tensor() const { return variant_ == TRT_NODE_TENSOR; }
@@ -381,18 +406,22 @@ class TRT_TensorOrWeights {
     CHECK(is_tensor());
     return tensor_;
   }
+
   const nvinfer1::ITensor* tensor() const {
     CHECK(is_tensor());
     return tensor_;
   }
+
   TRT_ShapedWeights& weights() {
     CHECK(is_weights());
     return weights_;
   }
+
   const TRT_ShapedWeights& weights() const {
     CHECK(is_weights());
     return weights_;
   }
+
   nvinfer1::Dims shape() const {
     if (is_tensor()) {
       return tensor()->getDimensions();
@@ -401,6 +430,18 @@ class TRT_TensorOrWeights {
     }
   }
 
+  string DebugString() const {
+    string output = "TRT_TensorOrWeights(type=";
+    if (is_tensor()) {
+      StrAppend(&output, "tensor @", reinterpret_cast<uintptr_t>(tensor_),
+                ", shape=", convert::DebugString(tensor_->getDimensions()));
+    } else {
+      StrAppend(&output, "weights=", weights_.DebugString());
+    }
+    StrAppend(&output, ")");
+    return output;
+  }
+
  private:
   nvinfer1::ITensor* tensor_;
   TRT_ShapedWeights weights_;
@@ -555,7 +596,7 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
 }
 
 void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
-                       TRT_ShapedWeights* oweights, int num_groups) {
+                       TRT_ShapedWeights* oweights, const int num_groups) {
   CHECK_EQ(iweights.type_, oweights->type_);
   CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
   // K indexes over output channels, C over input channels, and R and S over the
@@ -563,13 +604,13 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
   const int r = iweights.shape_.d[0];
   const int s = iweights.shape_.d[1];
   // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G
-  VLOG(2) << "num_groups: " << num_groups;
   const int c = iweights.shape_.d[2] / num_groups;
-  VLOG(2) << "c" << iweights.shape_.d[2] << " then " << c;
   const int k = iweights.shape_.d[3] * num_groups;
-  VLOG(2) << "k" << iweights.shape_.d[3] << " then " << k;
-  VLOG(2) << "r" << iweights.shape_.d[0] << " then " << r;
-  VLOG(2) << "s" << iweights.shape_.d[1] << " then " << s;
+  VLOG(2) << "num_groups: " << num_groups
+          << "c" << iweights.shape_.d[2] << " then " << c
+          << "k" << iweights.shape_.d[3] << " then " << k
+          << "r" << iweights.shape_.d[0] << " then " << r
+          << "s" << iweights.shape_.d[1] << " then " << s;
   oweights->shape_.d[0] = k / num_groups;
   oweights->shape_.d[1] = c * num_groups;
   oweights->shape_.d[2] = r;
@@ -607,63 +648,15 @@ using OpConverter =
                                      std::vector<TRT_TensorOrWeights>*)>;
 
 class Converter {
-  // TODO(aaroey): fix the order of members.
-  std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
-  std::unordered_map<string, OpConverter> op_registry_;
-  OpConverter plugin_converter_;
-  nvinfer1::INetworkDefinition* trt_network_;
-  std::list<std::vector<uint8_t>> temp_bufs_;
-  // TODO(aaroey): inline the definition of TRTWeightStore here, and add APIs to
-  // operate the stored weights instead of operating it directly.
-  TRTWeightStore* weight_store_;
-  bool fp16_;
-  void register_op_converters();
-  tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def,
-                                std::vector<TRT_TensorOrWeights>* inputs) {
-    for (auto const& input_name : node_def.input()) {
-      /*************************************************************************
-       * TODO(jie): handle case 1) here.
-       * Normalizes the inputs and extracts associated metadata:
-       * 1) Inputs can contain a colon followed by a suffix of characters.
-       *    That suffix may be a single number (e.g. inputName:1) or several
-       *    word characters separated from a number by a colon
-       *    (e.g. inputName:foo:1). The
-       *    latter case is used to denote inputs and outputs of functions.
-       * 2) Control dependency inputs contain caret at the beginning and we
-       *    remove this and annotate the edge as a control dependency.
-       ************************************************************************/
-      // skip control nodes
-      if (input_name[0] == '^') continue;
-      string name = input_name;
-      auto first = name.find_first_of(':');
-      // TODO(aaroey): why removing the colon but not the zero? A bug?
-      if (first != string::npos && first + 2 == name.size() &&
-          name[first + 1] == '0')
-        name.erase(first);
-
-      VLOG(2) << "retrieve input: " << name;
-      if (trt_tensors_.count(name)) {
-        inputs->push_back(trt_tensors_.at(name));
-      } else {
-        // TODO(aaroey): this should not happen, make it a CHECK.
-        // TODO(aaroey): use StrCat for pattern like this.
-        string msg("Node ");
-        StrAppend(&msg, node_def.name(), " should have an input named '", name,
-                  "' but it is not available");
-        LOG(ERROR) << msg;
-        return tensorflow::errors::InvalidArgument(msg);
-      }
-    }
-    return tensorflow::Status::OK();
-  }
-
  public:
   explicit Converter(nvinfer1::INetworkDefinition* trt_network,
                      TRTWeightStore* ws, bool fp16)
       : trt_network_(trt_network), weight_store_(ws), fp16_(fp16) {
     this->register_op_converters();
   }
+
   TRTWeightStore* weight_store() { return weight_store_; }
+
   TRT_ShapedWeights get_temp_weights(tensorflow::DataType type,
                                      nvinfer1::Dims shape) {
     TRT_ShapedWeights weights(type, nullptr, shape);
@@ -672,8 +665,10 @@ class Converter {
     weights.SetValues(weight_store_->store_.back().data());
     return weights;
   }
+
   // TODO(aaroey): fix all the namings.
   bool isFP16() { return fp16_; }
+
   TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
     return this->get_temp_weights(weights.type_, weights.shape_);
   }
@@ -684,7 +679,6 @@ class Converter {
     const string& op = node_def.op();
     std::vector<TRT_TensorOrWeights> outputs;
     if (PluginFactoryTensorRT::GetInstance()->IsPlugin(op)) {
-      // TODO(aaroey): plugin_converter_ is not set, fix it.
       TF_RETURN_IF_ERROR(plugin_converter_(*this, node_def, inputs, &outputs));
     } else {
       if (!op_registry_.count(op)) {
@@ -702,7 +696,8 @@ class Converter {
       if (output.is_tensor()) {
         output.tensor()->setName(output_name.c_str());
       }
-      VLOG(2) << "Write out tensor: " << output_name;
+      VLOG(2) << "Adding out tensor " << output_name << ": "
+              << output.DebugString();
       if (!trt_tensors_.insert({output_name, output}).second) {
         return tensorflow::errors::AlreadyExists(
             "Output tensor already exists for op: " + op);
@@ -751,6 +746,63 @@ class Converter {
     layer->setReshapeDimensions(reshape_dims);
     return layer->getOutput(0);
   }
+
+ private:
+  std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
+  std::unordered_map<string, OpConverter> op_registry_;
+  OpConverter plugin_converter_;
+  nvinfer1::INetworkDefinition* trt_network_;
+  std::list<std::vector<uint8_t>> temp_bufs_;
+
+  // TODO(aaroey): inline the definition of TRTWeightStore here, and add APIs to
+  // operate the stored weights instead of operating it directly.
+  TRTWeightStore* weight_store_;
+
+  bool fp16_;
+
+  void register_op_converters();
+
+  tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def,
+                                std::vector<TRT_TensorOrWeights>* inputs) {
+    for (auto const& input_name : node_def.input()) {
+      /*************************************************************************
+       * TODO(jie): handle case 1) here.
+       * Normalizes the inputs and extracts associated metadata:
+       * 1) Inputs can contain a colon followed by a suffix of characters.
+       *    That suffix may be a single number (e.g. inputName:1) or several
+       *    word characters separated from a number by a colon
+       *    (e.g. inputName:foo:1). The
+       *    latter case is used to denote inputs and outputs of functions.
+       * 2) Control dependency inputs contain caret at the beginning and we
+       *    remove this and annotate the edge as a control dependency.
+       ************************************************************************/
+      // skip control nodes
+      if (input_name[0] == '^') continue;
+      string name = input_name;
+      auto first = name.find_first_of(':');
+      // TODO(aaroey): why removing the colon but not the zero? A bug?
+      // TODO(aaroey): use TensorId
+      if (first != string::npos && first + 2 == name.size() &&
+          name[first + 1] == '0') {
+        name.erase(first);
+      }
+
+      if (trt_tensors_.count(name)) {
+        TRT_TensorOrWeights& input = trt_tensors_.at(name);
+        inputs->push_back(input);
+        VLOG(2) << "Retrieved input " << name << ": " << input.DebugString();
+      } else {
+        // TODO(aaroey): this should not happen, make it a CHECK.
+        // TODO(aaroey): use StrCat for pattern like this.
+        string msg("Node ");
+        StrAppend(&msg, node_def.name(), " should have an input named '", name,
+                  "' but it is not available");
+        LOG(ERROR) << msg;
+        return tensorflow::errors::InvalidArgument(msg);
+      }
+    }
+    return tensorflow::Status::OK();
+  }
 };
 
 TRT_ShapedWeights ConvertFP32ToFP16(Converter& ctx,
@@ -1187,17 +1239,11 @@ tensorflow::Status ConvertConv2DHelper(
   VLOG(2) << "groups count: " << num_groups;
 
   TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-
-  VLOG(2) << "weight shape: " << weights_rsck.shape_.nbDims;
-  for (int i = 0; i < weights_rsck.shape_.nbDims; i++) {
-    VLOG(2) << weights_rsck.shape_.d[i];
-  }
-
+  VLOG(2) << "weight shape: " << weights_rsck.DebugString();
   if (weights_rsck.shape_.nbDims != 4) {
     return tensorflow::errors::Internal(
         "Conv2D expects kernel of dimension 4, at: " + node_def.name());
   }
-
   if (ctx.isFP16()) {
     weights_rsck = ConvertFP32ToFP16(ctx, inputs.at(1).weights());
   }
@@ -1209,16 +1255,13 @@ tensorflow::Status ConvertConv2DHelper(
   nvinfer1::DimsHW kernel_size;
   kernel_size.h() = weights.shape_.d[2];
   kernel_size.w() = weights.shape_.d[3];
-  VLOG(2) << "RSCK: ";
-  for (int i = 0; i < 4; i++) {
-    VLOG(2) << "     " << weights.shape_.d[i];
-  }
+  VLOG(2) << "RSCK: " << weights.DebugString();
   VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
 
   // TODO(jie): stride. (NHWC/NCHW)
   const auto tf_stride = attrs.get<std::vector<int>>("strides");
   VLOG(2) << "h_INDEX" << h_index << ", w_index " << w_index;
-  VLOG(2) << "stride!!!: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
+  VLOG(2) << "stride: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
           << tf_stride[3];
   const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
@@ -1240,10 +1283,7 @@ tensorflow::Status ConvertConv2DHelper(
     // TODO(jie): handle asymmetric padding
     VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
             << padding[1].first << padding[1].second;
-
-    auto dim_before = tensor->getDimensions();
-    VLOG(2) << "TENSOR before: " << dim_before.d[0] << ", " << dim_before.d[1]
-            << dim_before.d[2] << ", " << dim_before.d[3];
+    VLOG(2) << "TENSOR before: " << DebugString(tensor->getDimensions());
     auto pad_layer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -1251,9 +1291,7 @@ tensorflow::Status ConvertConv2DHelper(
     TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name());
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
-    auto dim_after = tensor->getDimensions();
-    VLOG(2) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
-            << dim_after.d[2] << ", " << dim_after.d[3];
+    VLOG(2) << "TENSOR after: " << DebugString(tensor->getDimensions());
   }
 
   nvinfer1::IConvolutionLayer* layer =
@@ -1266,17 +1304,12 @@ tensorflow::Status ConvertConv2DHelper(
   layer->setName(node_def.name().c_str());
   layer->setNbGroups(num_groups);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-
-  auto dim_after = output_tensor->getDimensions();
-  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1] << ", "
-          << dim_after.d[2] << ", " << dim_after.d[3];
-
+  VLOG(2) << "TENSOR out: " << DebugString(output_tensor->getDimensions());
+  VLOG(2) << "data_format: " << data_format;
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
     output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
     TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
-  } else {
-    VLOG(2) << "NCHW !!!!";
   }
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
@@ -1990,22 +2023,22 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32");
   }
 
-  const auto keep_dims = attrs.get<bool>("keep_dims");
-  auto index_list_data =
-      static_cast<int*>(const_cast<void*>(index_list.GetValues()));
-
   int axes = 0;
   if (index_list.count() == 0) {
     return tensorflow::errors::InvalidArgument(
         "TRT cannot support reduce on all (batch) dimensions, at",
         node_def.name());
   } else {
+    auto index_list_data =
+        static_cast<int*>(const_cast<void*>(index_list.GetValues()));
     for (int i = 0; i < index_list.count(); i++) {
-      if (index_list_data[i] == 0) {
+      int axis = index_list_data[i];
+      if (axis < 0) axis += tensor->getDimensions().nbDims + 1;
+      if (axis == 0) {
         return tensorflow::errors::InvalidArgument(
             "TRT cannot reduce at batch dimension, at", node_def.name());
       }
-      axes |= (1 << (index_list_data[i] - 1));
+      axes |= (1 << (axis - 1));
     }
   }
 
@@ -2025,6 +2058,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
                                              " , at ", node_def.name());
   }
 
+  const auto keep_dims = attrs.get<bool>("keep_dims");
   nvinfer1::ILayer* layer =
       ctx.network()->addReduce(*const_cast<nvinfer1::ITensor*>(tensor),
                                reduce_operation, axes, keep_dims);
@@ -2694,8 +2728,6 @@ tensorflow::Status ConvertGraphDefToEngine(
     VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op();
     if (tensorflow::str_util::StartsWith(node_name, kInputPHName) &&
         (node_def.op() == "Placeholder")) {
-      nvinfer1::DimsCHW input_dim_pseudo_chw;
-      for (int i = 0; i < 8; i++) input_dim_pseudo_chw.d[i] = 0;
       int32 slot_number = -1;
       if (!tensorflow::strings::safe_strto32(
               node_name.c_str() + strlen(kInputPHName), &slot_number)) {
@@ -2713,28 +2745,25 @@ tensorflow::Status ConvertGraphDefToEngine(
         LOG(WARNING) << error_message;
         return Status(status.code(), error_message);
       }
-      if (VLOG_IS_ON(1)) {
-        string dim_str("dims=");
-        StrAppend(&dim_str, "[ ", shape.dim_size(0));
-        for (int i = 1; i < shape.dims(); i++) {
-          StrAppend(&dim_str, ", ", shape.dim_size(i));
-        }
-        StrAppend(&dim_str, " ]");
-        VLOG(1) << dim_str;
-      }
+
+#if NV_TENSORRT_MAJOR == 3
+      nvinfer1::DimsCHW input_dim;
+#elif NV_TENSORRT_MAJOR > 3
+      nvinfer1::Dims input_dim;
+#endif
       for (int i = 1; i < shape.dims(); i++) {
-        input_dim_pseudo_chw.d[i - 1] = shape.dim_size(i);
+        input_dim.d[i - 1] = shape.dim_size(i);
       }
-
-      input_dim_pseudo_chw.nbDims = shape.dims() - 1;
-      nvinfer1::ITensor* input_tensor = converter.network()->addInput(
-          node_name.c_str(), dtype, input_dim_pseudo_chw);
+      input_dim.nbDims = shape.dims() - 1;
+      nvinfer1::ITensor* input_tensor =
+          converter.network()->addInput(node_name.c_str(), dtype, input_dim);
       if (!input_tensor) {
         return tensorflow::errors::InvalidArgument(
             "Failed to create Input layer tensor ", node_name,
             " rank=", shape.dims() - 1);
       }
-      VLOG(1) << "Input tensor name :" << node_name;
+      VLOG(2) << "Adding engine input tensor " << node_name << " with shape "
+              << DebugString(input_dim);
       if (!converter.insert_input_tensor(node_name, input_tensor)) {
         return tensorflow::errors::AlreadyExists(
             "Output tensor already exists for op: " + node_name);
@@ -2937,10 +2966,25 @@ bool InputEdgeValidator::operator()(const tensorflow::Edge* in_edge) const {
             << ": " << status;
     return false;
   }
-  if (shape.dims() < 3 && in_edge->src()->type_string() != "Const") {
+
+
+  if (in_edge->src()->type_string() != "Const" &&
+#if NV_TENSORRT_MAJOR == 3
+      // TRT 3.x only support 4 dimensional input tensor.
+      shape.dims() != 4) {
+#else
+      // Single dimensional input tensor is not supported since the first
+      // dimension is treated as batch dimension.
+      shape.dims() < 2) {
+#endif
     VLOG(1) << "--> Need to remove input node " << in_edge->dst()->name()
-            << " which has an input at port " << in_edge->dst_input()
-            << " with #dim<3 and is not a const: " << shape;
+            << " which has an input at port " << in_edge->dst_input() << " with"
+#if NV_TENSORRT_MAJOR == 3
+            << " #dim!=4"
+#else
+            << " #dim<2"
+#endif
+            << " and is not a const: " << shape;
     return false;
   }
   return true;
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index a60253740f..9274027e63 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -36,8 +36,9 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tensorrt {
-static const char* kInputPHName = "TensorRTInputPH_";
-static const char* kOutputPHName = "TensorRTOutputPH_";
+extern const char* const kInputPHName;
+extern const char* const kOutputPHName;
+
 namespace convert {
 
 struct EngineConnection {
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
index f33f2cc4d6..ff4fba58bf 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
@@ -14,6 +14,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h"
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/contrib/tensorrt/convert/utils.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
@@ -37,7 +38,6 @@ tensorflow::Status TRTOptimizationPass::Init(
     const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
   VLOG(1) << "Called INIT for " << name_ << " with config = " << config;
   if (config == nullptr) {
-    maximum_workspace_size_ = 2 << 30;
     return tensorflow::Status::OK();
   }
   const auto params = config->parameter_map();
@@ -47,7 +47,6 @@ tensorflow::Status TRTOptimizationPass::Init(
   if (params.count("max_batch_size")) {
     maximum_batch_size_ = params.at("max_batch_size").i();
   }
-  is_dynamic_op_ = false;
   if (params.count("is_dynamic_op")) {
     is_dynamic_op_ = params.at("is_dynamic_op").b();
   }
@@ -58,27 +57,15 @@ tensorflow::Status TRTOptimizationPass::Init(
       batches_.push_back(i);
     }
   }
-  max_cached_batches_ = 1;
   if (params.count("maximum_cached_engines")) {
     max_cached_batches_ = params.at("maximum_cached_engines").i();
   }
   if (params.count("max_workspace_size_bytes")) {
-    maximum_workspace_size_ = params.at("max_workspace_size_bytes").i();
+    max_workspace_size_bytes_ = params.at("max_workspace_size_bytes").i();
   }
   if (params.count("precision_mode")) {
-    string pm = Uppercase(params.at("precision_mode").s());
-    if (pm == "FP32") {
-      precision_mode_ = 0;
-    } else if (pm == "FP16") {
-      precision_mode_ = 1;
-    } else if (pm == "INT8") {
-      precision_mode_ = 2;
-    } else {
-      LOG(ERROR) << "Unknown precision mode '" << pm << "'";
-      return tensorflow::errors::InvalidArgument(
-          "Unknown precision mode argument" + pm +
-          " Valid values are FP32, FP16, INT8");
-    }
+    TF_RETURN_IF_ERROR(GetPrecisionMode(
+        Uppercase(params.at("precision_mode").s()), &precision_mode_));
   }
   return tensorflow::Status::OK();
 }
@@ -255,7 +242,7 @@ tensorflow::Status TRTOptimizationPass::Optimize(
   cp.input_graph_def = &item.graph;
   cp.output_names = &nodes_to_preserve;
   cp.max_batch_size = maximum_batch_size_;
-  cp.max_workspace_size_bytes = maximum_workspace_size_;
+  cp.max_workspace_size_bytes = max_workspace_size_bytes_;
   cp.output_graph_def = optimized_graph;
   cp.precision_mode = precision_mode_;
   cp.minimum_segment_size = minimum_segment_size_;
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
index 463ed3883e..71b51d1368 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
@@ -36,7 +36,9 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
         minimum_segment_size_(3),
         precision_mode_(0),
         maximum_batch_size_(-1),
-        maximum_workspace_size_(-1) {
+        is_dynamic_op_(false),
+        max_cached_batches_(1),
+        max_workspace_size_bytes_(256LL << 20) {
     VLOG(1) << "Constructing " << name_;
   }
 
@@ -57,14 +59,14 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
                       const tensorflow::grappler::GrapplerItem& item);
 
  private:
-  string name_;
+  const string name_;
   int minimum_segment_size_;
   int precision_mode_;
   int maximum_batch_size_;
   bool is_dynamic_op_;
   std::vector<int> batches_;
   int max_cached_batches_;
-  int64_t maximum_workspace_size_;
+  int64_t max_workspace_size_bytes_;
 };
 
 }  // namespace convert
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
index 2de7973750..11335d7da6 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
 
 #include <vector>
 
+#define EIGEN_USE_GPU
 #include "tensorflow/core/framework/op_kernel.h"
-
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
 #include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
@@ -80,5 +81,5 @@ REGISTER_KERNEL_BUILDER(Name("IncPluginTRT").Device(DEVICE_GPU), IncPluginTRT);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
 #endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
index bc15b51e05..19f39e6d3d 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
@@ -42,4 +42,4 @@ class TRTResourceManager {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCE_TRT_RESOURCE_MANAGER_H_
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index b43f1b190f..c82d4a0183 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -74,6 +74,7 @@ class SimpleNode {
 
   const std::vector<SimpleEdge*>& in_edges() const { return in_edges_; }
   const std::vector<SimpleEdge*>& out_edges() const { return out_edges_; }
+
   std::vector<SimpleNode*> in_nodes() const {
     std::vector<SimpleNode*> res;
     res.reserve(in_edges_.size());
@@ -82,6 +83,16 @@ class SimpleNode {
     }
     return res;
   }
+
+  std::vector<SimpleNode*> out_nodes() const {
+    std::vector<SimpleNode*> res;
+    res.reserve(out_edges_.size());
+    for (const auto e : out_edges_) {
+      if (e) res.push_back(e->dst());
+    }
+    return res;
+  }
+
   const string& name() const { return node_->name(); }
   const tensorflow::Node* tf_node() const { return node_; }
   int id() const { return id_; }
@@ -215,45 +226,53 @@ SimpleGraph::~SimpleGraph() {
 
 namespace {
 
-bool CheckCycles(const std::unique_ptr<SimpleGraph>& g, const SimpleNode* src,
-                 const std::vector<SimpleNode*>& start) {
-  // Copied from TF ReverseDFS, which only works for tensorflow::Graph.
+// Copied from TF ReverseDFS, which only works for tensorflow::Graph.
+void StableDFS(const SimpleGraph& g, bool reverse,
+               const std::vector<const SimpleNode*>& start,
+               const std::function<bool(const SimpleNode*)>& enter,
+               const std::function<bool(const SimpleNode*)>& leave) {
+  // Stack of work to do.
   struct Work {
-    SimpleNode* node;
+    const SimpleNode* node;
     bool leave;  // Are we entering or leaving n?
   };
-
   std::vector<Work> stack(start.size());
   for (int i = 0; i < start.size(); ++i) {
     stack[i] = Work{start[i], false};
   }
 
-  std::vector<bool> visited(g->num_node_ids(), false);
+  auto get_nodes = reverse ? [](const SimpleNode* n) { return n->in_nodes(); }
+                           : [](const SimpleNode* n) { return n->out_nodes(); };
+  std::vector<bool> visited(g.num_node_ids(), false);
   while (!stack.empty()) {
     Work w = stack.back();
     stack.pop_back();
 
     auto n = w.node;
     if (w.leave) {
-      if (n == src) {
-        return true;
-      }
+      if (leave && !leave(n)) return;
       continue;
     }
 
     if (visited[n->id()]) continue;
     visited[n->id()] = true;
-    // Arrange to call leave(n) when all done with descendants.
-    stack.push_back(Work{n, true});
+    if (enter && !enter(n)) return;
 
-    auto nodes = n->in_nodes();
-    for (const auto node : nodes) {
+    // Arrange to call leave(n) when all done with descendants.
+    if (leave) stack.push_back(Work{n, true});
+
+    auto nodes = get_nodes(n);
+    std::vector<const SimpleNode*> nodes_sorted(nodes.begin(), nodes.end());
+    std::sort(nodes_sorted.begin(), nodes_sorted.end(),
+              [](const SimpleNode* lhs, const SimpleNode* rhs) {
+                return lhs->name() < rhs->name();
+              });
+    for (const SimpleNode* node : nodes_sorted) {
       if (!visited[node->id()]) {
         stack.push_back(Work{node, false});
       }
     }
   }
-  return false;
 }
 
 bool CanContractEdge(const SimpleEdge* edge,
@@ -289,14 +308,21 @@ bool CanContractEdge(const SimpleEdge* edge,
   // To achieve this goal, the correct way seems to be:
   // 1. remove any direct edge from src->dst;
   // 2. detect if src can reach dst, if so they cannot be merged.
-  std::vector<SimpleNode*> dfs_start_nodes;
-  for (SimpleNode* node : dst->in_nodes()) {
+  std::vector<const SimpleNode*> dfs_start_nodes;
+  for (const SimpleNode* node : dst->in_nodes()) {
     if (node != src) {
       dfs_start_nodes.push_back(node);
     }
   }
-
-  const bool has_cycle = CheckCycles(graph, src, dfs_start_nodes);
+  bool has_cycle = false;
+  StableDFS(*graph, /*reverse=*/true, dfs_start_nodes, /*enter=*/nullptr,
+            [&has_cycle, src](const SimpleNode* n) {
+              if (n == src) {
+                has_cycle = true;
+                return false;
+              }
+              return true;
+            });
   return !has_cycle;
 }
 }  // namespace
@@ -403,15 +429,13 @@ tensorflow::Status SegmentGraph(
   // In the future if we have a measure of how beneficial it is to include a
   // given node in a TRT subgraph then we can revisit this algorithm to take
   // advantage of that information.
-  std::vector<tensorflow::Node*> tforder;
-  tensorflow::GetPostOrder(*tf_graph, &tforder);
-  // use postorder implementation from tensorflow and construct mirror in
-  // internal format
-  std::vector<SimpleNode*> order;
-  order.reserve(tforder.size());
-  for (const auto tfnode : tforder) {
-    order.push_back(graph->FindNodeId(tfnode->id()));
-  }
+  std::vector<const SimpleNode*> order;
+  order.reserve(graph->num_node_ids());
+  StableDFS(*graph, /*reverse=*/false, {graph->source_node()},
+            /*enter=*/nullptr, [&order](const SimpleNode* n) {
+              order.push_back(n);
+              return true;
+            });
   for (const SimpleNode* node : order) {
     // All output nodes of 'node' have been visited...
     VLOG(3) << "Trying node " << node->name() << " id=" << node->id();
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py
index 8ea5a63735..e9ac833d55 100644
--- a/tensorflow/contrib/tensorrt/test/base_test.py
+++ b/tensorflow/contrib/tensorrt/test/base_test.py
@@ -40,6 +40,7 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [100, 24, 24, 2]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
@@ -62,19 +63,21 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
         identity = array_ops.identity(relu, "identity")
         pool = nn_ops.max_pool(
             identity, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
-      array_ops.squeeze(pool, name=self.output_name)
+      array_ops.squeeze(pool, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
-        # breaks the connection check, fix it.
-        # - my_trt_op_0 should have ["weights", "conv", "bias", "bias_add",
-        #   "relu", "identity", "max_pool"]
-        expected_engines=["my_trt_op_0"],
-        expected_output_dims=(100, 6, 6, 6),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(100, 6, 6, 6)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
+    # breaks the connection check, fix it.
+    # - my_trt_op_0 should have ["weights", "conv", "bias", "bias_add",
+    #   "relu", "identity", "max_pool"]
+    return ["my_trt_op_0"]
 
 
 class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
@@ -85,6 +88,7 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [100, 24, 24, 2]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
@@ -115,20 +119,22 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
         q = math_ops.mul(q, edge, name="mul1")
         s = math_ops.add(p, q, name="add1")
         s = math_ops.sub(s, r, name="sub1")
-      array_ops.squeeze(s, name=self.output_name)
+      array_ops.squeeze(s, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
-        # breaks the connection check, fix it.
-        # - my_trt_op_0 should have ["mul", "sub", "div1", "mul1", "add1",
-        #   "add", "sub1"];
-        # - my_trt_op_1 should have ["weights","conv", "div"]
-        expected_engines=["my_trt_op_0", "my_trt_op_1"],
-        expected_output_dims=(100, 12, 12, 6),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(100, 12, 12, 6)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
+    # breaks the connection check, fix it.
+    # - my_trt_op_0 should have ["mul", "sub", "div1", "mul1", "add1",
+    #   "add", "sub1"];
+    # - my_trt_op_1 should have ["weights","conv", "div"]
+    return ["my_trt_op_0", "my_trt_op_1"]
 
 
 class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
@@ -143,6 +149,7 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     """Create a graph containing two segment."""
     input_name = "input"
     input_dims = [2, 32, 32, 3]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
@@ -161,18 +168,20 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
         c = constant_op.constant(1.0, name="c3")
         n = math_ops.add(n, c, name="add3")
         n = math_ops.mul(n, n, name="mul3")
-      array_ops.squeeze(n, name=self.output_name)
+      array_ops.squeeze(n, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines={
-            # Only the first engine is built.
-            "my_trt_op_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"]
-        },
-        expected_output_dims=tuple(input_dims),
-        allclose_atol=1.e-06,
-        allclose_rtol=1.e-06)
+        output_names=[output_name],
+        expected_output_dims=[tuple(input_dims)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        # Only the first engine is built.
+        "my_trt_op_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"]
+    }
 
 
 class PartiallyConvertedTestB(PartiallyConvertedTestA):
@@ -184,13 +193,12 @@ class PartiallyConvertedTestB(PartiallyConvertedTestA):
     trt_convert.clear_test_values("")
     trt_convert.add_test_value("my_trt_op_0:CreateTRTNode", "fail")
 
-  def GetParams(self):
-    """Create a graph containing two segment."""
-    return super(PartiallyConvertedTestB, self).GetParams()._replace(
-        expected_engines={
-            # Only the second engine is built.
-            "my_trt_op_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"]
-        })
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        # Only the second engine is built.
+        "my_trt_op_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"]
+    }
 
 
 class ConstInputTest(trt_test.TfTrtIntegrationTestBase):
@@ -199,6 +207,7 @@ class ConstInputTest(trt_test.TfTrtIntegrationTestBase):
     """Create a graph containing multiple segment."""
     input_name = "input"
     input_dims = [2, 32, 32, 3]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
@@ -221,18 +230,20 @@ class ConstInputTest(trt_test.TfTrtIntegrationTestBase):
           n = math_ops.add(n, c, name="add2")
           n = math_ops.mul(n, n, name="mul1")
           n = math_ops.add(n, n, name="add3")
-      array_ops.squeeze(n, name=self.output_name)
+      array_ops.squeeze(n, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines={
-            "my_trt_op_0": ["add", "add1", "mul"],
-            "my_trt_op_1": ["add2", "add3", "mul1"]
-        },
-        expected_output_dims=tuple(input_dims),
-        allclose_atol=1.e-06,
-        allclose_rtol=1.e-06)
+        output_names=[output_name],
+        expected_output_dims=[tuple(input_dims)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        "my_trt_op_0": ["add", "add1", "mul"],
+        "my_trt_op_1": ["add2", "add3", "mul1"]
+    }
 
 
 class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
@@ -241,6 +252,7 @@ class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
     """Create a graph containing single segment."""
     input_name = "input"
     input_dims = [2, 32, 32, 3]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
@@ -251,15 +263,17 @@ class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
         n = math_ops.add(n, c, name="add")
         n = math_ops.mul(n, n, name="mul")
         n = math_ops.add(n, n, name="add1")
-      array_ops.squeeze(n, name=self.output_name)
+      array_ops.squeeze(n, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines={"my_trt_op_0": ["c", "add", "add1", "mul"]},
-        expected_output_dims=tuple(input_dims),
-        allclose_atol=1.e-06,
-        allclose_rtol=1.e-06)
+        output_names=[output_name],
+        expected_output_dims=[tuple(input_dims)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {"my_trt_op_0": ["c", "add", "add1", "mul"]}
 
 
 class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
@@ -268,6 +282,7 @@ class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
     """Create a graph containing multiple segment."""
     input_name = "input"
     input_dims = [2, 32, 32, 3]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
@@ -282,22 +297,24 @@ class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
         n = math_ops.add(n, c, name="add2")
         n = math_ops.mul(n, n, name="mul1")
         n = math_ops.add(n, n, name="add3")
-      array_ops.squeeze(n, name=self.output_name)
+      array_ops.squeeze(n, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines={
-            "my_trt_op_0": ["add2", "add3", "mul1"],
-            # Why segment ["add", "add1", "mul"] was assigned segment id 1
-            # instead of 0: the parent node of this segment is actually const
-            # node 'c', but it's removed later since it's const output of the
-            # segment which is not allowed.
-            "my_trt_op_1": ["add", "add1", "mul"]
-        },
-        expected_output_dims=tuple(input_dims),
-        allclose_atol=1.e-06,
-        allclose_rtol=1.e-06)
+        output_names=[output_name],
+        expected_output_dims=[tuple(input_dims)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        "my_trt_op_0": ["add2", "add3", "mul1"],
+        # Why segment ["add", "add1", "mul"] was assigned segment id 1
+        # instead of 0: the parent node of this segment is actually const
+        # node 'c', but it's removed later since it's const output of the
+        # segment which is not allowed.
+        "my_trt_op_1": ["add", "add1", "mul"]
+    }
 
 
 class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase):
@@ -306,6 +323,7 @@ class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase):
     """Create a graph containing multiple segment."""
     input_name = "input"
     input_dims = [2, 32, 32, 3]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
@@ -328,18 +346,20 @@ class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase):
           mul1 = math_ops.mul(add2, add2, name="mul1")
         with g.control_dependencies([d1, d2, add, add1]):
           add3 = math_ops.add(mul1, mul1, name="add3")
-      array_ops.squeeze(add3, name=self.output_name)
+      array_ops.squeeze(add3, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines={
-            "my_trt_op_0": ["c1", "add", "add1", "mul"],
-            "my_trt_op_1": ["c2", "add2", "add3", "mul1"]
-        },
-        expected_output_dims=tuple(input_dims),
-        allclose_atol=1.e-06,
-        allclose_rtol=1.e-06)
+        output_names=[output_name],
+        expected_output_dims=[tuple(input_dims)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        "my_trt_op_0": ["c1", "add", "add1", "mul"],
+        "my_trt_op_1": ["c2", "add2", "add3", "mul1"]
+    }
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
index 2e1107e303..2f153c6f2f 100644
--- a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
@@ -37,6 +37,7 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [12, 5, 8, 12]
+    output_name = "output"
     w1_name = "matmul_w1"
     w1_dims = [12, 5, 12, 7]
     w2_name = "matmul_w2"
@@ -61,15 +62,46 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
         x3 = x3 + f
         x3 = gen_array_ops.reshape(x3, [12, 5, 8, 7])
         out = x1 + x2 + x3
-      array_ops.squeeze(out, name=self.output_name)
+      array_ops.squeeze(out, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name, w1_name, w2_name],
         input_dims=[input_dims, w1_dims, w2_dims],
-        expected_engines=["my_trt_op_0"],
-        expected_output_dims=(12, 5, 8, 7),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(12, 5, 8, 7)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    if (run_params.dynamic_engine and
+        not trt_test.IsQuantizationMode(run_params.precision_mode)):
+      return ["my_trt_op_0", "my_trt_op_1"]
+    return ["my_trt_op_1"]
+
+  def ExpectedEnginesToRun(self, run_params):
+    """Return the expected engines to run."""
+    return ["my_trt_op_1"]
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    # TODO(aaroey): Trt library will fail like:
+    #
+    # ../builder/cudnnBuilder2.cpp:685:
+    # virtual std::vector<nvinfer1::query::Ports<
+    #     nvinfer1::query::TensorRequirements>>
+    # nvinfer1::builder::Node::getSupportedFormats(
+    #     const nvinfer1::query::Ports<nvinfer1::query::AbstractTensor>&,
+    #     const nvinfer1::cudnn::HardwareContext&,
+    #     nvinfer1::builder::Format::Type,
+    #     const nvinfer1::builder::FormatTypeHack&) const:
+    # Assertion `sf' failed.
+    #
+    # To reproduce, run:
+    # bazel test -c opt --copt=-mavx \
+    #   --test_arg=BatchMatMulTest.testTfTrt_ToolConversion_INT8_DynamicEngine \
+    #   tensorflow/contrib/tensorrt:batch_matmul_test
+    #
+    # Investigate and fix it.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
index 8be32f59b4..62f4e525f7 100644
--- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
@@ -38,6 +38,7 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [48, 12]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
@@ -97,18 +98,59 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
 
       out = array_ops.concat(
           [x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11], axis=-1)
-      out = array_ops.squeeze(out, name=self.output_name)
+      out = array_ops.squeeze(out, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines=[
-            "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3",
-            "my_trt_op_4", "my_trt_op_5", "my_trt_op_6"
-        ],
-        expected_output_dims=(48, 89),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(48, 89)])
+
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    return super(BiasaddMatMulTest,
+                 self).GetConversionParams(run_params)._replace(
+                     max_batch_size=48, maximum_cached_engines=2)
+
+  def _ValidEngines(self):
+    """Engines expected to build and run."""
+    return [
+        "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_6",
+        "my_trt_op_7", "my_trt_op_8", "my_trt_op_9"
+    ]
+
+  def _InvalidEngines(self):
+    """Engines that will cause conversion error at building time."""
+    return ["my_trt_op_3", "my_trt_op_4", "my_trt_op_5"]
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    # In dynamic engine mode the engines are built in execution time, not in
+    # conversion time, so build errors occurs later. Here three of the engines
+    # will be failed to built but the corresponding engine op are still created.
+    # TODO(aaroey, jjsjann123): fix this.
+    if (run_params.dynamic_engine and
+        not trt_test.IsQuantizationMode(run_params.precision_mode)):
+      return self._ValidEngines() + self._InvalidEngines()
+    return self._ValidEngines()
+
+  def ExpectedEnginesToRun(self, run_params):
+    """Return the expected engines to run."""
+    return self._ValidEngines()
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
+    # mode, which is a bug. Re-enable this when trt library is fixed.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-03
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-03
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
index 9316b14da0..f126ed4238 100644
--- a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
+++ b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -37,6 +37,7 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [10, 24, 24, 20]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
@@ -104,32 +105,34 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
       a = constant_op.constant(np.random.randn(24, 20), dtype=dtype)
       f = x + a
       x = math_ops.sigmoid(f)
-      gen_array_ops.reshape(x, [5, -1], name=self.output_name)
+      gen_array_ops.reshape(x, [5, -1], name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines=[
-            "my_trt_op_0",
-            "my_trt_op_1",
-            "my_trt_op_2",
-            "my_trt_op_3",
-            "my_trt_op_4",
-            "my_trt_op_5",
-            "my_trt_op_6",
-            "my_trt_op_7",
-            "my_trt_op_8",
-            "my_trt_op_9",
-            "my_trt_op_10",
-            "my_trt_op_11",
-            "my_trt_op_12",
-            "my_trt_op_13",
-            "my_trt_op_14",
-            "my_trt_op_15",
-        ],
-        expected_output_dims=(5, 23040),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(5, 23040)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return [
+        "my_trt_op_0",
+        "my_trt_op_1",
+        "my_trt_op_2",
+        "my_trt_op_3",
+        "my_trt_op_4",
+        "my_trt_op_5",
+        "my_trt_op_6",
+        "my_trt_op_7",
+        "my_trt_op_8",
+        "my_trt_op_9",
+        "my_trt_op_10",
+        "my_trt_op_11",
+        "my_trt_op_12",
+        "my_trt_op_13",
+        "my_trt_op_14",
+        "my_trt_op_15",
+    ]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/contrib/tensorrt/test/concatenation_test.py
index 1874b9dd45..465cb02296 100644
--- a/tensorflow/contrib/tensorrt/test/concatenation_test.py
+++ b/tensorflow/contrib/tensorrt/test/concatenation_test.py
@@ -37,6 +37,7 @@ class ConcatenationTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [2, 3, 3, 1]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
@@ -68,15 +69,17 @@ class ConcatenationTest(trt_test.TfTrtIntegrationTestBase):
       concat1 = array_ops.concat([r1, r2, r3, r4, r5, r6], axis=-1)
       concat2 = array_ops.concat([r7, r8, r9, r10, r11, r12], axis=3)
       x = array_ops.concat([concat1, concat2], axis=-1)
-      gen_array_ops.reshape(x, [2, -1], name=self.output_name)
+      gen_array_ops.reshape(x, [2, -1], name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines=["my_trt_op_0"],
-        expected_output_dims=(2, 126),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(2, 126)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["my_trt_op_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
index 8c59000b70..e32f047866 100644
--- a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
+++ b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
@@ -36,6 +36,7 @@ class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = 'input'
     input_dims = [5, 12, 12, 2]
+    output_name = 'output'
     g = ops.Graph()
     with g.as_default():
       x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
@@ -53,15 +54,25 @@ class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase):
           dtype=dtype,
           name='filt3')
       y3 = nn.conv2d(z2, filt3, strides=[1, 1, 1, 1], padding='SAME', name='y3')
-      nn.relu(y3, name='output')
+      nn.relu(y3, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines=['my_trt_op_0'],
-        expected_output_dims=(5, 12, 12, 1),
-        allclose_atol=1.e-02,
-        allclose_rtol=1.e-02)
+        output_names=[output_name],
+        expected_output_dims=[(5, 12, 12, 1)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ['my_trt_op_0']
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-04 if run_params.precision_mode == 'FP32' else 1.e-02
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-04 if run_params.precision_mode == 'FP32' else 1.e-02
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/tensorrt/test/manual_test.py b/tensorflow/contrib/tensorrt/test/manual_test.py
new file mode 100644
index 0000000000..1187c759b4
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/manual_test.py
@@ -0,0 +1,114 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Basic tests for TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ast
+import os
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+class ManualTest(trt_test.TfTrtIntegrationTestBase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(ManualTest, self).__init__(methodName)
+    self._params_map = None
+
+  def _GetEnv(self):
+    """Get an environment variable specifying the manual test parameters.
+
+    The value of the environment variable is the string representation of a dict
+    which should contain the following keys:
+    - 'graph_path': the file path to the serialized frozen graphdef
+    - 'input_names': TfTrtIntegrationTestParams.input_names
+    - 'input_dims': TfTrtIntegrationTestParams.input_dims
+    - 'expected_output_dims': TfTrtIntegrationTestParams.expected_output_dims
+    - 'output_name': the name of op to fetch
+    - 'expected_engines_to_run': ExpectedEnginesToRun() will return this
+    - 'expected_engines_to_build': ExpectedEnginesToBuild() will return this
+    - 'max_batch_size': ConversionParams.max_batch_size
+
+    Returns:
+      The value of the environment variable.
+    """
+    return os.getenv('TRT_MANUAL_TEST_PARAMS', '')
+
+  def _GetParamsMap(self):
+    """Parse the environment variable as a dict and return it."""
+    if self._params_map is None:
+      self._params_map = ast.literal_eval(self._GetEnv())
+    return self._params_map
+
+  def GetParams(self):
+    """Testing conversion of manually provided frozen graph."""
+    params_map = self._GetParamsMap()
+    gdef = graph_pb2.GraphDef()
+    with gfile.Open(params_map['graph_path'], 'rb') as f:
+      gdef.ParseFromString(f.read())
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=gdef,
+        input_names=params_map['input_names'],
+        input_dims=params_map['input_dims'],
+        output_names=params_map['output_names'],
+        expected_output_dims=params_map['expected_output_dims'])
+
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    conversion_params = super(ManualTest, self).GetConversionParams(run_params)
+    params_map = self._GetParamsMap()
+    if 'max_batch_size' in params_map:
+      conversion_params = conversion_params._replace(
+          max_batch_size=params_map['max_batch_size'])
+    return conversion_params
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return self._GetParamsMap()['expected_engines_to_build']
+
+  def ExpectedEnginesToRun(self, run_params):
+    """Return the expected engines to run."""
+    params_map = self._GetParamsMap()
+    if 'expected_engines_to_run' in params_map:
+      return params_map['expected_engines_to_run']
+    return self.ExpectedEnginesToBuild(run_params)
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    params_map = self._GetParamsMap()
+    if 'atol' in params_map:
+      return params_map['atol']
+    return 1.e-3
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    params_map = self._GetParamsMap()
+    if 'rtol' in params_map:
+      return params_map['rtol']
+    return 1.e-3
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    return len(self._GetEnv())
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
index 66eb6be757..bc7c90081f 100644
--- a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
+++ b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
@@ -36,6 +36,7 @@ class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [2, 15, 15, 3]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
@@ -57,15 +58,25 @@ class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase):
             strides=[1, 1, 1, 1],
             padding="VALID",
             name="conv_2")
-      array_ops.squeeze(out, name=self.output_name)
+      array_ops.squeeze(out, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines=["my_trt_op_0"],
-        expected_output_dims=(2, 15, 15, 10),
-        allclose_atol=1.e-02,
-        allclose_rtol=1.e-02)
+        output_names=[output_name],
+        expected_output_dims=[(2, 15, 15, 10)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["my_trt_op_0"]
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-06 if run_params.precision_mode == "FP32" else 1.e-02
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 0.1
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
index fd55b8cd99..11be4feaf7 100644
--- a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
+++ b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
@@ -38,6 +38,7 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [2, 3, 7, 5]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
@@ -72,15 +73,17 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
       t = t + q
       t = t + d
       t = t - edge3
-      array_ops.squeeze(t, name=self.output_name)
+      array_ops.squeeze(t, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines=["my_trt_op_0", "my_trt_op_1"],
-        expected_output_dims=(2, 4, 5, 4),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(2, 4, 5, 4)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["my_trt_op_0", "my_trt_op_1"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
index 51c905a50b..eddeafa38b 100644
--- a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
+++ b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
@@ -37,6 +37,7 @@ class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [2, 3, 7, 5]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
@@ -54,18 +55,20 @@ class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase):
       t = math_ops.mul(conv, b, name="mul")
       e = self.trt_incompatible_op(conv, name="incompatible")
       t = math_ops.sub(t, e, name="sub")
-      array_ops.squeeze(t, name=self.output_name)
+      array_ops.squeeze(t, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines={
-            "my_trt_op_0": ["bias", "mul", "sub"],
-            "my_trt_op_1": ["weights", "conv"]
-        },
-        expected_output_dims=(2, 4, 5, 4),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(2, 4, 5, 4)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        "my_trt_op_0": ["bias", "mul", "sub"],
+        "my_trt_op_1": ["weights", "conv"]
+    }
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/rank_two_test.py b/tensorflow/contrib/tensorrt/test/rank_two_test.py
new file mode 100644
index 0000000000..74a4a05925
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/rank_two_test.py
@@ -0,0 +1,89 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Test for rank 2 input in TF-TRT."""
+    input_names = ["input", "input2"]
+    # Two paths: first with rank 2 input, second with rank 4 input.
+    input_dims = [[12, 5], [12, 5, 2, 2]]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      outputs = []
+      for i in range(2):
+        x = array_ops.placeholder(
+            dtype=dtypes.float32, shape=input_dims[i], name=input_names[i])
+        c = constant_op.constant(1.0, name="c%d_1" % i)
+        q = math_ops.add(x, c, name="add%d_1" % i)
+        q = math_ops.abs(q, name="abs%d_1" % i)
+        c = constant_op.constant(2.2, name="c%d_2" % i)
+        q = math_ops.add(q, c, name="add%d_2" % i)
+        q = math_ops.abs(q, name="abs%d_2" % i)
+        c = constant_op.constant(3.0, name="c%d_3" % i)
+        q = math_ops.add(q, c, name="add%d_3" % i)
+        if i == 0:
+          for j in range(2):
+            q = array_ops.expand_dims(q, -1, name="expand%d_%d" % (i, j))
+        q = gen_math_ops.reciprocal(q, name="reciprocal%d" % i)
+        outputs.append(q)
+      # Combine both paths
+      q = math_ops.add(outputs[0], outputs[1], name="add")
+      array_ops.squeeze(q, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=input_names,
+        input_dims=input_dims,
+        output_names=[output_name],
+        expected_output_dims=[tuple(input_dims[1])])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        "my_trt_op_0": [
+            "add0_1", "add0_2", "add0_3", "c0_1", "c0_2", "c0_3", "abs0_1",
+            "abs0_2"
+        ],
+        "my_trt_op_1": [
+            "add", "add1_1", "add1_2", "add1_3", "c1_1", "c1_2", "c1_3",
+            "abs1_1", "abs1_2", "reciprocal0", "reciprocal1"
+        ],
+    }
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
+    # mode, which is a bug. Re-enable this when trt library is fixed.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
index 6f85ada464..65ca21cf37 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
@@ -31,6 +31,7 @@ from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
 # pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
@@ -39,18 +40,23 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 
 TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [
-    "gdef", "input_names", "input_dims", "expected_engines",
-    "expected_output_dims", "allclose_atol", "allclose_rtol"
+    "gdef", "input_names", "input_dims", "output_names", "expected_output_dims"
 ])
 
 RunParams = namedtuple(
     "RunParams",
     ["use_optimizer", "precision_mode", "dynamic_engine", "test_name"])
 
+ConversionParams = namedtuple("ConversionParams", [
+    "max_batch_size", "max_workspace_size_bytes", "precision_mode",
+    "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines",
+    "cached_engine_batches"
+])
+
 PRECISION_MODES = ["FP32", "FP16", "INT8"]
 
 
-def _IsQuantizationMode(mode):
+def IsQuantizationMode(mode):
   return mode == "INT8"
 
 
@@ -64,10 +70,6 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration."""
 
   @property
-  def output_name(self):
-    return "output"
-
-  @property
   def trt_incompatible_op(self):
     return math_ops.sin
 
@@ -112,6 +114,10 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     super(TfTrtIntegrationTestBase, cls).setUpClass()
     trt_convert.enable_test_value()
 
+  def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
+    super(TfTrtIntegrationTestBase, self).__init__(methodName)
+    self._trt_test_params = None
+
   def setUp(self):
     """Setup method."""
     super(TfTrtIntegrationTestBase, self).setUp()
@@ -122,43 +128,97 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     """Return a TfTrtIntegrationTestParams for test, implemented by subclass."""
     raise NotImplementedError()
 
-  def _PrepareRun(self, params, graph_state):
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    return ConversionParams(
+        max_batch_size=max([
+            dims[0] for dims in self._GetParamsCached().input_dims if len(dims)
+        ]),
+        max_workspace_size_bytes=1 << 25,
+        precision_mode=self._ToBytes(run_params.precision_mode),
+        minimum_segment_size=2,
+        is_dynamic_op=run_params.dynamic_engine,
+        maximum_cached_engines=1,
+        cached_engine_batches=None)
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    return True
+
+  def VerifyRunForEngine(self, engine_name, graph_state, expect_run=True):
+    """Verify the state of a particular engine after sess.run()."""
+    if graph_state == GraphState.ORIGINAL:
+      self._ExpectCalibration(engine_name, "")
+      self._ExpectNativeSegment(engine_name, "")
+      self._ExpectTrtEngine(engine_name, "")
+    elif graph_state == GraphState.CALIBRATE:
+      self._ExpectCalibration(engine_name, "done")
+      self._ExpectNativeSegment(engine_name, "done")
+      self._ExpectTrtEngine(engine_name, "")
+    elif graph_state == GraphState.INFERENCE:
+      self._ExpectCalibration(engine_name, "")
+      if expect_run:
+        self._ExpectNativeSegment(engine_name, "")
+        self._ExpectTrtEngine(engine_name, "done")
+      else:
+        self._ExpectNativeSegment(engine_name, "done")
+        self._ExpectTrtEngine(engine_name, "")
+
+  def VerifyRun(self, run_params, graph_state):
+    """Verify the state of all engines after sess.run()."""
+    for engine_name in self.ExpectedEnginesToBuild(run_params):
+      expect_run = (engine_name in self.ExpectedEnginesToRun(run_params))
+      self.VerifyRunForEngine(engine_name, graph_state, expect_run)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build, implemented by subclass."""
+    raise NotImplementedError()
+
+  def ExpectedEnginesToRun(self, run_params):
+    """Return the expected engines to run."""
+    return self.ExpectedEnginesToBuild(run_params)
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-06 if run_params.precision_mode == "FP32" else 1.e-03
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-06 if run_params.precision_mode == "FP32" else 1.e-03
+
+  def _GetParamsCached(self):
+    if self._trt_test_params is None:
+      self._trt_test_params = self.GetParams()
+    return self._trt_test_params
+
+  def _PrepareRun(self, graph_state):
     """Set up necessary testing environment before calling sess.run()."""
     # Clear test values added by TRTEngineOp.
     trt_convert.clear_test_values("my_trt_op_.*:ExecuteTrtEngine")
     trt_convert.clear_test_values("my_trt_op_.*:ExecuteCalibration")
     trt_convert.clear_test_values("my_trt_op_.*:ExecuteNativeSegment")
 
-  def _VerifyRun(self, params, graph_state):
-    """Verify the state after sess.run()."""
-    for engine_name in params.expected_engines:
-      if graph_state == GraphState.ORIGINAL:
-        self._ExpectCalibration(engine_name, "")
-        self._ExpectNativeSegment(engine_name, "")
-        self._ExpectTrtEngine(engine_name, "")
-      elif graph_state == GraphState.CALIBRATE:
-        self._ExpectCalibration(engine_name, "done")
-        self._ExpectNativeSegment(engine_name, "done")
-        self._ExpectTrtEngine(engine_name, "")
-      elif graph_state == GraphState.INFERENCE:
-        self._ExpectCalibration(engine_name, "")
-        self._ExpectNativeSegment(engine_name, "")
-        self._ExpectTrtEngine(engine_name, "done")
-
-  def _GetConfigProto(self, params, run_params, graph_state):
+  def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
     if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
       rewriter_cfg = rewriter_config_pb2.RewriterConfig()
       rewriter_cfg.optimizers.extend(["constfold", "layout"])
       custom_op = rewriter_cfg.custom_optimizers.add()
       custom_op.name = "TensorRTOptimizer"
-      custom_op.parameter_map["minimum_segment_size"].i = 2
-      custom_op.parameter_map["max_batch_size"].i = max(
-          [dims[0] for dims in params.input_dims])
-      custom_op.parameter_map["is_dynamic_op"].b = run_params.dynamic_engine
-      custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
-      custom_op.parameter_map["precision_mode"].s = self._ToBytes(
-          run_params.precision_mode)
+      trt_params = self.GetConversionParams(run_params)
+      custom_op.parameter_map["max_batch_size"].i = trt_params.max_batch_size
+      custom_op.parameter_map["max_workspace_size_bytes"].i = (
+          trt_params.max_workspace_size_bytes)
+      custom_op.parameter_map["precision_mode"].s = trt_params.precision_mode
+      custom_op.parameter_map["minimum_segment_size"].i = (
+          trt_params.minimum_segment_size)
+      custom_op.parameter_map["is_dynamic_op"].b = trt_params.is_dynamic_op
+      custom_op.parameter_map["maximum_cached_engines"].i = (
+          trt_params.maximum_cached_engines)
+      if trt_params.cached_engine_batches:
+        custom_op.parameter_map["cached_engine_batches"].list.i.extend(
+            trt_params.cached_engine_batches)
+
       graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
     else:
       graph_options = config_pb2.GraphOptions()
@@ -190,53 +250,67 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _ExpectNativeSegment(self, engine_name, value):
     self._ExpectTestValue(engine_name, "ExecuteNativeSegment", value)
 
-  def _RunGraph(self, params, gdef, input_data, config, graph_state,
+  def _RunGraph(self,
+                run_params,
+                gdef,
+                input_data,
+                config,
+                graph_state,
                 num_runs=2):
     """Run given graphdef multiple times."""
+    params = self._GetParamsCached()
     assert len(params.input_names) == len(input_data)
     g = ops.Graph()
     with g.as_default():
       io_ops = importer.import_graph_def(
           graph_def=gdef,
-          return_elements=params.input_names + [self.output_name],
+          return_elements=params.input_names + params.output_names,
           name="")
-      inp = [i.outputs[0] for i in io_ops[:-1]]
-      assert len(inp) == len(input_data)
-      out = io_ops[-1].outputs[0]
+      inputs = [op.outputs[0] for op in io_ops[:len(params.input_names)]]
+      assert len(inputs) == len(input_data)
+      outputs = [op.outputs[0] for op in io_ops[len(params.input_names):]]
     with self.test_session(
         graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
       val = None
       # Defaults to 2 runs to verify result across multiple runs is same.
       for _ in range(num_runs):
-        self._PrepareRun(params, graph_state)
-        new_val = sess.run(out,
-                           {inp[i]: input_data[i] for i in range(len(inp))})
-        self.assertEqual(params.expected_output_dims, new_val.shape)
+        self._PrepareRun(graph_state)
+        new_val = sess.run(
+            outputs, {inputs[i]: input_data[i] for i in range(len(inputs))})
+        output_len = len(params.expected_output_dims)
+        self.assertEqual(output_len, len(new_val))
+        for i in range(output_len):
+          self.assertEqual(params.expected_output_dims[i], new_val[i].shape)
         if val is not None:
-          self.assertAllEqual(val, new_val)
+          self.assertAllClose(val, new_val, atol=1.e-06, rtol=1.e-06)
         val = new_val
-        self._VerifyRun(params, graph_state)
+        self.VerifyRun(run_params, graph_state)
     return val
 
   # Use real data that is representative of the inference dataset
   # for calibration. For this test script it is random data.
-  def _RunCalibration(self, params, gdef, input_data, config):
+  def _RunCalibration(self, run_params, gdef, input_data, config):
     """Run calibration on given graph."""
     return self._RunGraph(
-        params, gdef, input_data, config, GraphState.CALIBRATE, num_runs=5)
+        run_params, gdef, input_data, config, GraphState.CALIBRATE, num_runs=5)
 
-  def _GetTrtGraphDef(self, params, run_params, gdef):
+  def _GetTrtGraphDef(self, run_params, gdef):
     """Return trt converted graphdef."""
+    params = self._GetParamsCached()
+    trt_params = self.GetConversionParams(run_params)
+    logging.info(trt_params)
     return trt_convert.create_inference_graph(
         input_graph_def=gdef,
-        outputs=[self.output_name],
-        max_batch_size=max([dims[0] for dims in params.input_dims]),
-        max_workspace_size_bytes=1 << 25,
-        precision_mode=run_params.precision_mode,
-        minimum_segment_size=2,
-        is_dynamic_op=run_params.dynamic_engine)
-
-  def _WriteGraph(self, params, run_params, gdef, graph_state):
+        outputs=params.input_names + params.output_names,
+        max_batch_size=trt_params.max_batch_size,
+        max_workspace_size_bytes=trt_params.max_workspace_size_bytes,
+        precision_mode=trt_params.precision_mode,
+        minimum_segment_size=trt_params.minimum_segment_size,
+        is_dynamic_op=trt_params.is_dynamic_op,
+        maximum_cached_engines=trt_params.maximum_cached_engines,
+        cached_engine_batches=trt_params.cached_engine_batches)
+
+  def _WriteGraph(self, run_params, gdef, graph_state):
     if graph_state == GraphState.ORIGINAL:
       label = "Original"
     elif graph_state == GraphState.CALIBRATE:
@@ -247,15 +321,17 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         self.__class__.__name__ + "_" + run_params.test_name + "_" + label +
         ".pbtxt")
     temp_dir = os.getenv("TRT_TEST_TMPDIR", self.get_temp_dir())
-    logging.info("Writing graph to %s/%s", temp_dir, graph_name)
-    graph_io.write_graph(gdef, temp_dir, graph_name)
+    if temp_dir:
+      logging.info("Writing graph to %s/%s", temp_dir, graph_name)
+      graph_io.write_graph(gdef, temp_dir, graph_name)
 
-  def _VerifyConnections(self, params, converted_gdef):
+  def _VerifyConnections(self, expected_engines, converted_gdef):
+    params = self._GetParamsCached()
     old_to_new_node_map = {
         self._ToString(node.name): self._ToString(node.name)
         for node in params.gdef.node
     }
-    for engine_name, node_names in params.expected_engines.items():
+    for engine_name, node_names in expected_engines.items():
       for node_name in node_names:
         old_to_new_node_map[node_name] = engine_name
     name_to_node_map = {
@@ -310,97 +386,114 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         msg="expected:\n%s\nvs actual:\n%s" % (sorted(
             expected_input_map.items()), sorted(actual_input_map.items())))
 
-  def _VerifyGraphDef(self, params, run_params, gdef, graph_state):
-    self._WriteGraph(params, run_params, gdef, graph_state)
+  def _VerifyGraphDef(self, run_params, gdef, graph_state):
+    self._WriteGraph(run_params, gdef, graph_state)
 
+    expected_engines = self.ExpectedEnginesToBuild(run_params)
     num_engines = 0
     for node in gdef.node:
       if node.op == "TRTEngineOp":
+        logging.info("Found TRTEngineOp: " + node.name)
+    for node in gdef.node:
+      if node.op == "TRTEngineOp":
         num_engines += 1
-        self.assertTrue(node.name in params.expected_engines)
-        self.assertTrue(len(node.attr["serialized_segment"].s))
-        self.assertTrue(len(node.attr["segment_funcdef_name"].s))
+        self.assertTrue(node.name in expected_engines, node.name)
+        self.assertTrue(len(node.attr["serialized_segment"].s), node.name)
+        self.assertTrue(len(node.attr["segment_funcdef_name"].s), node.name)
         self.assertEqual(
             self._ToBytes(run_params.precision_mode),
-            node.attr["precision_mode"].s)
+            node.attr["precision_mode"].s, node.name)
 
         is_dynamic_engine = not node.attr["static_engine"].b
-        self.assertEqual(run_params.dynamic_engine, is_dynamic_engine)
+        self.assertEqual(run_params.dynamic_engine, is_dynamic_engine,
+                         node.name)
 
         has_calibration_data = len(node.attr["calibration_data"].s)
-        if (_IsQuantizationMode(run_params.precision_mode) and
+        if (IsQuantizationMode(run_params.precision_mode) and
             graph_state == GraphState.INFERENCE):
-          self.assertTrue(has_calibration_data)
+          self.assertTrue(has_calibration_data, node.name)
         else:
-          self.assertFalse(has_calibration_data)
+          self.assertFalse(has_calibration_data, node.name)
     if graph_state == GraphState.ORIGINAL:
       self.assertEqual(0, num_engines)
     else:
-      self.assertEqual(num_engines, len(params.expected_engines))
-      if isinstance(params.expected_engines, dict):
-        self._VerifyConnections(params, gdef)
+      self.assertEqual(num_engines, len(expected_engines))
+      if isinstance(expected_engines, dict):
+        self._VerifyConnections(expected_engines, gdef)
       # TODO(aaroey): consider verifying the corresponding TF function.
 
-  def RunTest(self, params, run_params):
+  def RunTest(self, run_params):
+    if not self.ShouldRunTest(run_params):
+      return
     assert run_params.precision_mode in PRECISION_MODES
-    input_data = [np.random.random_sample(dims) for dims in params.input_dims]
+
+    params = self._GetParamsCached()
     input_gdef = params.gdef
-    self._VerifyGraphDef(params, run_params, input_gdef, GraphState.ORIGINAL)
+    input_dtypes = {}
+    for node in input_gdef.node:
+      if self._ToString(node.name) in params.input_names:
+        assert self._ToString(node.op) == "Placeholder"
+        input_dtypes[self._ToString(node.name)] = (
+            dtypes.as_dtype(node.attr["dtype"].type).as_numpy_dtype())
+    assert len(params.input_names) == len(input_dtypes)
+
+    input_data = []
+    for i in range(len(params.input_names)):
+      dtype = input_dtypes[params.input_names[i]]
+      # Multiply the input by some constant to avoid all zeros input for integer
+      # types.
+      scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0
+      dims = params.input_dims[i]
+      input_data.append((scale * np.random.random_sample(dims)).astype(dtype))
+    self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL)
 
     # Get reference result without running trt.
-    config_no_trt = self._GetConfigProto(params, run_params,
-                                         GraphState.ORIGINAL)
+    config_no_trt = self._GetConfigProto(run_params, GraphState.ORIGINAL)
     logging.info("Running original graph w/o trt, config:\n%s",
                  str(config_no_trt))
-    ref_result = self._RunGraph(params, input_gdef, input_data, config_no_trt,
-                                GraphState.ORIGINAL)
+    ref_result = self._RunGraph(run_params, input_gdef, input_data,
+                                config_no_trt, GraphState.ORIGINAL)
 
     # Run calibration if necessary.
-    if _IsQuantizationMode(run_params.precision_mode):
+    if IsQuantizationMode(run_params.precision_mode):
 
-      calib_config = self._GetConfigProto(params, run_params,
-                                          GraphState.CALIBRATE)
+      calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
       logging.info("Running calibration graph, config:\n%s", str(calib_config))
       if run_params.use_optimizer:
-        result = self._RunCalibration(params, input_gdef, input_data,
+        result = self._RunCalibration(run_params, input_gdef, input_data,
                                       calib_config)
       else:
-        calib_gdef = self._GetTrtGraphDef(params, run_params, input_gdef)
-        self._VerifyGraphDef(params, run_params, calib_gdef,
-                             GraphState.CALIBRATE)
-        result = self._RunCalibration(params, calib_gdef, input_data,
+        calib_gdef = self._GetTrtGraphDef(run_params, input_gdef)
+        self._VerifyGraphDef(run_params, calib_gdef, GraphState.CALIBRATE)
+        result = self._RunCalibration(run_params, calib_gdef, input_data,
                                       calib_config)
-      infer_gdef = trt_convert.calib_graph_to_infer_graph(calib_gdef)
-      self._VerifyGraphDef(params, run_params, infer_gdef, GraphState.INFERENCE)
+      infer_gdef = trt_convert.calib_graph_to_infer_graph(
+          calib_gdef, run_params.dynamic_engine)
+      self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
 
       self.assertAllClose(
           ref_result,
           result,
-          atol=params.allclose_atol,
-          rtol=params.allclose_rtol)
+          atol=self.ExpectedAbsoluteTolerance(run_params),
+          rtol=self.ExpectedRelativeTolerance(run_params))
     else:
       infer_gdef = input_gdef
 
     # Run inference.
-    infer_config = self._GetConfigProto(params, run_params,
-                                        GraphState.INFERENCE)
+    infer_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
     logging.info("Running final inference graph, config:\n%s",
                  str(infer_config))
-    if run_params.use_optimizer:
-      result = self._RunGraph(params, infer_gdef, input_data, infer_config,
-                              GraphState.INFERENCE)
-    else:
-      trt_infer_gdef = self._GetTrtGraphDef(params, run_params, infer_gdef)
-      self._VerifyGraphDef(params, run_params, trt_infer_gdef,
-                           GraphState.INFERENCE)
-      result = self._RunGraph(params, trt_infer_gdef, input_data, infer_config,
-                              GraphState.INFERENCE)
+    if not run_params.use_optimizer:
+      infer_gdef = self._GetTrtGraphDef(run_params, infer_gdef)
+      self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
 
+    result = self._RunGraph(run_params, infer_gdef, input_data, infer_config,
+                            GraphState.INFERENCE)
     self.assertAllClose(
         ref_result,
         result,
-        atol=params.allclose_atol,
-        rtol=params.allclose_rtol)
+        atol=self.ExpectedAbsoluteTolerance(run_params),
+        rtol=self.ExpectedRelativeTolerance(run_params))
 
   def testIdempotence(self):
     # Test that applying tensorrt optimizer or offline conversion tools multiple
@@ -421,13 +514,12 @@ def _AddTests(test_class):
     """Gets a single test method based on the parameters."""
 
     def _Test(self):
-      params = self.GetParams()
       logging.info(
           "Running test %s with parameters: use_optimizer=%s, "
           "precision_mode=%s, dynamic_engine=%s",
           "testTfTrt_" + run_params.test_name, run_params.use_optimizer,
           run_params.precision_mode, run_params.dynamic_engine)
-      self.RunTest(params, run_params)
+      self.RunTest(run_params)
 
     return _Test
 
@@ -435,7 +527,7 @@ def _AddTests(test_class):
   dynamic_engine_options = [False, True]
   for (use_optimizer, precision_mode, dynamic_engine) in itertools.product(
       use_optimizer_options, PRECISION_MODES, dynamic_engine_options):
-    if _IsQuantizationMode(precision_mode):
+    if IsQuantizationMode(precision_mode):
       if use_optimizer:
         # TODO(aaroey): if use_optimizer is True we need to get the inference
         # graphdef using custom python wrapper class, which is not currently
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py
index 500057a36d..8736bfb644 100644
--- a/tensorflow/contrib/tensorrt/test/unary_test.py
+++ b/tensorflow/contrib/tensorrt/test/unary_test.py
@@ -38,6 +38,7 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [12, 5, 8, 1, 1, 12]
+    output_name = "output"
     input2_name = "input_2"
     input2_dims = [12, 5, 8, 1, 12, 1, 1]
     g = ops.Graph()
@@ -95,18 +96,20 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase):
 
       q = a * b
       q = q / c
-      array_ops.squeeze(q, name=self.output_name)
+      array_ops.squeeze(q, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name, input2_name],
         input_dims=[input_dims, input2_dims],
-        expected_engines=[
-            "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3",
-            "my_trt_op_4"
-        ],
-        expected_output_dims=(12, 5, 8, 12),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(12, 5, 8, 12)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return [
+        "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3",
+        "my_trt_op_4"
+    ]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
index ab4d224db4..b0271a04b3 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
@@ -38,15 +38,14 @@ class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [5, 2, 8, 8]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
       x, _, _ = nn_impl.fused_batch_norm(
-          x,
-          np.random.randn(2).astype(np.float32),
-          np.random.randn(2).astype(np.float32),
-          mean=np.random.randn(2).astype(np.float32),
-          variance=np.random.randn(2).astype(np.float32),
+          x, [1.0, 1.0], [0.0, 0.0],
+          mean=[0.5, 0.5],
+          variance=[1.0, 1.0],
           data_format="NCHW",
           is_training=False)
       e = constant_op.constant(
@@ -67,15 +66,17 @@ class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase):
           "VALID",
           data_format="NCHW",
           name="max_pool")
-      array_ops.squeeze(v, name="output")
+      array_ops.squeeze(v, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines=["my_trt_op_0"],
-        expected_output_dims=(5, 6, 2, 2),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(5, 6, 2, 2)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["my_trt_op_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
index 56bdf848ea..d7c165784b 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_test.py
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
@@ -38,15 +38,14 @@ class VGGBlockTest(trt_test.TfTrtIntegrationTestBase):
     dtype = dtypes.float32
     input_name = "input"
     input_dims = [5, 8, 8, 2]
+    output_name = "output"
     g = ops.Graph()
     with g.as_default():
       x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
       x, _, _ = nn_impl.fused_batch_norm(
-          x,
-          np.random.randn(2).astype(np.float32),
-          np.random.randn(2).astype(np.float32),
-          mean=np.random.randn(2).astype(np.float32),
-          variance=np.random.randn(2).astype(np.float32),
+          x, [1.0, 1.0], [0.0, 0.0],
+          mean=[0.5, 0.5],
+          variance=[1.0, 1.0],
           is_training=False)
       e = constant_op.constant(
           np.random.randn(1, 1, 2, 6), name="weights", dtype=dtype)
@@ -58,15 +57,17 @@ class VGGBlockTest(trt_test.TfTrtIntegrationTestBase):
       idty = array_ops.identity(relu, "ID")
       v = nn_ops.max_pool(
           idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
-      array_ops.squeeze(v, name="output")
+      array_ops.squeeze(v, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
-        expected_engines=["my_trt_op_0"],
-        expected_output_dims=(5, 2, 2, 6),
-        allclose_atol=1.e-03,
-        allclose_rtol=1.e-03)
+        output_names=[output_name],
+        expected_output_dims=[(5, 2, 2, 6)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["my_trt_op_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 355303acf6..21c0c30c19 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -16,6 +16,7 @@ config_setting(
 py_binary(
     name = "predict",
     srcs = ["predict.py"],
+    data = ["data/period_trend.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = select({
diff --git a/tensorflow/contrib/timeseries/examples/predict.py b/tensorflow/contrib/timeseries/examples/predict.py
index 8147d40caa..b036911314 100644
--- a/tensorflow/contrib/timeseries/examples/predict.py
+++ b/tensorflow/contrib/timeseries/examples/predict.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import os
 import sys
 
 import numpy as np
@@ -40,6 +41,10 @@ except ImportError:
 FLAGS = None
 
 
+_MODULE_PATH = os.path.dirname(__file__)
+_DEFAULT_DATA_FILE = os.path.join(_MODULE_PATH, "data/period_trend.csv")
+
+
 def structural_ensemble_train_and_predict(csv_file_name):
   # Cycle between 5 latent values over a period of 100. This leads to a very
   # smooth periodic component (and a small model), which is a good fit for our
@@ -115,9 +120,12 @@ def main(unused_argv):
   if not HAS_MATPLOTLIB:
     raise ImportError(
         "Please install matplotlib to generate a plot from this example.")
+  input_filename = FLAGS.input_filename
+  if input_filename is None:
+    input_filename = _DEFAULT_DATA_FILE
   make_plot("Structural ensemble",
-            *structural_ensemble_train_and_predict(FLAGS.input_filename))
-  make_plot("AR", *ar_train_and_predict(FLAGS.input_filename))
+            *structural_ensemble_train_and_predict(input_filename))
+  make_plot("AR", *ar_train_and_predict(input_filename))
   pyplot.show()
 
 
@@ -126,7 +134,7 @@ if __name__ == "__main__":
   parser.add_argument(
       "--input_filename",
       type=str,
-      required=True,
-      help="Input csv file.")
+      required=False,
+      help="Input csv file (omit to use the data/period_trend.csv).")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
index 5eb4deefb9..de547f835d 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py
@@ -195,7 +195,7 @@ class ARModelTest(test.TestCase):
     self.train_helper(input_window_size=10,
                       loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS,
                       train_steps=300,
-                      max_loss=2.5,
+                      max_loss=50.,  # Just make sure there are no exceptions.
                       anomaly_distribution=None)
 
   def test_autoregression_normal_multiple_periods(self):
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index 983455f63d..461fe22210 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -69,8 +69,10 @@ class TimeSeriesRegressorTest(test.TestCase):
         input_pipeline.NumpyReader(features), shuffle_seed=3, num_threads=1,
         batch_size=16, window_size=16)
     first_estimator.train(input_fn=train_input_fn, steps=1)
-    first_loss_before_fit = first_estimator.evaluate(
-        input_fn=eval_input_fn, steps=1)["loss"]
+    first_evaluation = first_estimator.evaluate(
+        input_fn=eval_input_fn, steps=1)
+    first_loss_before_fit = first_evaluation["loss"]
+    self.assertAllEqual(first_loss_before_fit, first_evaluation["average_loss"])
     self.assertAllEqual([], first_loss_before_fit.shape)
     first_estimator.train(input_fn=train_input_fn, steps=1)
     first_loss_after_fit = first_estimator.evaluate(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 32194e400e..1f9f9b7aa6 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.summary import summary
@@ -123,6 +124,8 @@ class TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acce
     metrics[feature_keys.FilteringResults.STATE_TUPLE] = (
         _identity_metric_nested(feature_keys.FilteringResults.STATE_TUPLE,
                                 model_outputs.end_state))
+    metrics[metric_keys.MetricKeys.LOSS_MEAN] = metrics_impl.mean(
+        model_outputs.loss, name="average_loss")
     return estimator_lib.EstimatorSpec(
         loss=model_outputs.loss,
         mode=mode,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index bda3b53aca..e65e7b74d4 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -172,6 +172,7 @@ class EvaluationMetricsTests(test.TestCase):
     evaluation = estimator.evaluate(input_fn, steps=1)
     self.assertIn("plain_boring_metric386", evaluation)
     self.assertIn("fun_metric101", evaluation)
+    self.assertIn("average_loss", evaluation)
     # The values are deterministic because of fixed tf_random_seed.
     # However if they become flaky, remove such exacts comparisons.
     self.assertAllClose(evaluation["plain_boring_metric386"], 1.130380)
@@ -398,6 +399,7 @@ class OneShotTests(parameterized.TestCase):
         num_threads=1, batch_size=16, window_size=16)
     estimator.train(input_fn=train_input_fn, steps=5)
     result = estimator.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertIn("average_loss", result)
     self.assertNotIn(feature_keys.State.STATE_TUPLE, result)
     input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
     export_location = estimator.export_savedmodel(_new_temp_dir(),
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py
index b9f8620fd8..02d2524b66 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py
@@ -290,7 +290,7 @@ class InputStatisticsTests(test.TestCase):
           time_series_reader=input_pipeline.NumpyReader(features))
       statistics = stat_object.initialize_graph(
           features=input_fn()[0])
-      with self.test_session(graph=graph) as session:
+      with self.session(graph=graph) as session:
         variables.global_variables_initializer().run()
         coordinator = coordinator_lib.Coordinator()
         queue_runner_impl.start_queue_runners(session, coord=coordinator)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
index 1fb4a3c121..c2eaa78493 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
@@ -190,13 +190,13 @@ class StateSpaceEquivalenceTests(test.TestCase):
         estimator.build_raw_serving_input_receiver_fn())
     with ops.Graph().as_default() as graph:
       random_model.initialize_graph()
-      with self.test_session(graph=graph) as session:
+      with self.session(graph=graph) as session:
         variables.global_variables_initializer().run()
         evaled_start_state = session.run(random_model.get_start_state())
     evaled_start_state = [
         state_element[None, ...] for state_element in evaled_start_state]
     with ops.Graph().as_default() as graph:
-      with self.test_session(graph=graph) as session:
+      with self.session(graph=graph) as session:
         signatures = loader.load(
             session, [tag_constants.SERVING], export_location)
         first_split_filtering = saved_model_utils.filter_continuation(
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 2abf402e6c..56e451e2e3 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -265,7 +265,6 @@ tf_py_test(
         ":datasets",
     ],
     grpc_enabled = True,
-    tags = ["no_windows"],
 )
 
 tf_py_test(
diff --git a/tensorflow/contrib/tpu/profiler/op_profile.proto b/tensorflow/contrib/tpu/profiler/op_profile.proto
index 1f249de314..feb177a7da 100644
--- a/tensorflow/contrib/tpu/profiler/op_profile.proto
+++ b/tensorflow/contrib/tpu/profiler/op_profile.proto
@@ -8,6 +8,8 @@ message Profile {
   Node by_category = 1;
   // Root of a profile broken down by program structure.
   Node by_program_structure = 2;
+  // Per program profile, indexed by hlo module name of the program.
+  map<string, Node> per_program = 3;
 }
 
 // An entry in the profile tree. (An instruction, or set of instructions).
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index 2cc17d6d92..bf807af68b 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -119,7 +119,9 @@ message OptimizationParameters {
 
   // Whether to use gradient accumulation (do two passes over the input
   // gradients: one to accumulate them into a temporary array and another to
-  // apply them using the actual optimization algorithm).
+  // apply them using the actual optimization algorithm). This feature is
+  // experimental -- it has not been fully verified and may cause training
+  // crashes and/or failures.
   bool use_gradient_accumulation = 15;
 
   // Optimization algorithm parameters; which field is selected determines which
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index ff893a722f..1d1cb48e8e 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -54,7 +54,7 @@ import time
 
 import numpy as np
 
-from tensorflow.contrib.cluster_resolver.python.training import tpu_cluster_resolver
+from tensorflow.contrib.cluster_resolver.python.training import tpu_cluster_resolver as tpu_cluster_resolver_lib
 from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.contrib.tpu.python.ops import tpu_ops
@@ -80,12 +80,61 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_inspect
+
+
+_SESSIONS = {}
+
+
+def tpu_session(cluster_resolver):
+  """Construct or return a `tf.Session` connected to the given cluster."""
+  global _SESSIONS
+  master = cluster_resolver.master()
+  if master not in _SESSIONS:
+    cluster_spec = cluster_resolver.cluster_spec()
+    config = config_pb2.ConfigProto(isolate_session_state=True)
+    if cluster_spec:
+      config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
+
+    graph = ops.Graph()
+    session = tf_session.Session(graph=graph, target=master, config=config)
+
+    with graph.as_default():
+      session.run(tpu.initialize_system())
+
+    _SESSIONS[master] = session
+  return _SESSIONS[master]
+
+
+def reset_tpu_sessions():
+  _SESSIONS.clear()
 
 
 # Work-around dependency cycle between DistributionStrategy and TPU lib.
-def TPUDistributionStrategy(*args, **kw):  # pylint: disable=invalid-name
+def TPUDistributionStrategy(tpu_cluster_resolver=None, num_cores=None):  # pylint: disable=invalid-name
+  """Construct a TPUDistributionStrategy."""
   from tensorflow.contrib.distribute.python import tpu_strategy  # pylint: disable=g-import-not-at-top
-  return tpu_strategy.TPUStrategy(*args, **kw)
+  # TODO(b/112705069): Remove this when TPUStrategy API is consistent.
+  # We are including this for (a) backwards compatibility for open sourced
+  # releases of TensorFlow and (b) to work around a circular dependency
+  # where keras_support and tpu_strategy depends on each other. Once we release
+  # a final version and remove support for the old API, this will be deleted.
+  # (See bug above for more details)
+  if tpu_cluster_resolver is None:
+    tpu_cluster_resolver = tpu_cluster_resolver_lib.TPUClusterResolver('')
+
+  args, _, _, _ = tf_inspect.getargspec(tpu_strategy.TPUStrategy.__init__)
+  if len(args) == 4:
+    logging.info('Detected new TPUStrategy API.')
+    return tpu_strategy.TPUStrategy(tpu_cluster_resolver,
+                                    steps_per_run=1,
+                                    num_cores=num_cores)
+  else:
+    logging.info('Detected old TPUStrategy API.')
+    strategy = tpu_strategy.TPUStrategy(num_cores_per_host=8)
+    strategy._tpu_cluster_resolver = tpu_cluster_resolver
+
+  return strategy
 
 
 class TPUEmbedding(embeddings.Embedding):
@@ -666,9 +715,10 @@ class TPUFunction(object):
 
       # Clone our CPU model, running within the TPU device context.
       with TPURewriteContext(tpu_input_map):
-        # TODO(power): Replicate variables.
-        with ops.device('/device:TPU:0'):
-          self._cloned_model = models.clone_model(self.model)
+        with variable_scope.variable_scope('tpu_model_%s' % id(self.model)):
+          # TODO(power): Replicate variables.
+          with ops.device('/device:TPU:0'):
+            self._cloned_model = models.clone_model(self.model)
 
       # Create a copy of the optimizer for this graph.
       if isinstance(self.model.optimizer, keras_optimizers.TFOptimizer):
@@ -845,7 +895,7 @@ class TPUFunction(object):
 class KerasTPUModel(models.Model):
   """TPU compatible Keras model wrapper."""
 
-  def __init__(self, cpu_model, tpu_name_or_address, strategy):
+  def __init__(self, cpu_model, strategy):
     super(models.Model, self).__init__(  # pylint: disable=bad-super-call
         inputs=cpu_model.inputs,
         outputs=cpu_model.outputs,
@@ -862,27 +912,14 @@ class KerasTPUModel(models.Model):
     self.train_function = None
     self._strategy = strategy
 
-    self._tpu_name_or_address = tpu_name_or_address
+    cluster_resolver = self._strategy._tpu_cluster_resolver
+    self._tpu_name_or_address = cluster_resolver.get_master()
     self._cpu_model = cpu_model
     self._tpu_model = None
     self._tpu_weights_initialized = False
-    self._graph = ops.Graph()
-
-    self._cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
-        tpu_name_or_address)
-    master = self._cluster_resolver.master()
-    cluster_spec = self._cluster_resolver.cluster_spec()
-    self._session = tf_session.Session(
-        graph=self._graph,
-        target=master,
-        config=config_pb2.ConfigProto(isolate_session_state=True))
-
-    # TODO(saeta): Confirm the lines below work in ClusterSpec propagation env.
-    if cluster_spec:
-      self._session.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
 
-    with self._graph.as_default():
-      self._session.run(tpu.initialize_system())
+    self._session = tpu_session(cluster_resolver)
+    self._graph = self._session.graph
 
     # If the input CPU model has already been compiled, compile our TPU model
     # immediately.
@@ -1137,7 +1174,7 @@ Output shape: %(output_shape)s
 
 
 @experimental
-def tpu_model(model, tpu_name_or_address=None, strategy=None):
+def tpu_model(model, strategy=None):
   """Copy `model` along with weights to the TPU.  Returns a TPU model.
 
   Usage:
@@ -1148,7 +1185,7 @@ def tpu_model(model, tpu_name_or_address=None, strategy=None):
 
   # If `num_cores_per_host` is greater than one, batch parallelism will be used
   # to run on multiple TPU cores.
-  strategy = keras_support.TPUDistributionStrategy(num_cores_per_host=8)
+  strategy = keras_support.TPUDistributionStrategy(tpu_cluster_resolver)
   model = keras_support.tpu_model(model, strategy)
   model.compile(
       optimizer=tf.train.GradientDescentOptimizer(learning_rate=1.0),
@@ -1158,10 +1195,6 @@ def tpu_model(model, tpu_name_or_address=None, strategy=None):
 
   Args:
     model: A `KerasTPUModel`.
-    tpu_name_or_address: A string that is either the name of the Cloud TPU,
-      the grpc address of the Cloud TPU, or (Googlers only) the BNS name of the
-      Cloud TPU. If tpu_name_or_address is None, the TPUClusterResolver will
-      examine the environment to determine a potential Cloud TPU to use.
     strategy: `TPUDistributionStrategy`.  The strategy to use for replicating
               model across multiple TPU cores.
 
@@ -1176,9 +1209,8 @@ def tpu_model(model, tpu_name_or_address=None, strategy=None):
   # TODO(xiejw): Validate TPU model. TPUModel only?
   # TODO(xiejw): Validate replicas. Full or 1. Shall we allow subset?
   # TODO(xiejw): Adds reduction option.
+
   if strategy is None:
-    strategy = TPUDistributionStrategy(num_cores_per_host=1)
-  return KerasTPUModel(
-      cpu_model=model,
-      tpu_name_or_address=tpu_name_or_address,
-      strategy=strategy)
+    strategy = TPUDistributionStrategy()
+
+  return KerasTPUModel(cpu_model=model, strategy=strategy)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 8d05e081a7..18e0abdda2 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -65,7 +65,7 @@ class TPUConfig(
       The number of model replicas in the system. For non-model-parallelism
       case, this number equals the total number of TPU cores. For
       model-parallelism, the total number of TPU cores equals
-      product(computation_shape) * num_shards.
+      num_cores_per_replica * num_shards.
     num_cores_per_replica: Defaults to `None`, which disables model parallelism.
       An integer which describes the number of TPU cores per model replica. This
       is required by model-parallelism which enables partitioning
@@ -103,7 +103,7 @@ class TPUConfig(
       input mode.
 
     Raises:
-      ValueError: If `computation_shape` or `computation_shape` are invalid.
+      ValueError: If `num_cores_per_replica` is not 1, 2, 4 or 8.
   """
 
   def __new__(cls,
@@ -137,7 +137,7 @@ class TPUConfig(
         raise ValueError(
             'input_partition_dims requires setting num_cores_per_replica.')
 
-    # Parse computation_shape
+    # Check num_cores_per_replica
     if num_cores_per_replica is not None:
       if num_cores_per_replica not in [1, 2, 4, 8]:
         raise ValueError(
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 806ae1c4c9..19359cb612 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -390,12 +390,6 @@ class _InternalTPUContext(object):
       logging.info('_is_running_on_cpu: eval_on_tpu disabled')
       return True
 
-    if mode != model_fn_lib.ModeKeys.PREDICT:
-      return False
-
-    # There are actually 2 use cases when running with mode.PREDICT: prediction
-    # and saving the model.  We run actual predictions on the TPU, but
-    # model export is run on the CPU.
     if is_export_mode:
       return True
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 029492b489..1ff04f5c26 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -45,6 +45,7 @@ from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest as data_nest
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import util as estimator_util
@@ -204,6 +205,12 @@ def _increase_eval_step_op(iterations_per_loop):
       use_locking=True)
 
 
+def _extract_key_names(tensor_or_dict):
+  if isinstance(tensor_or_dict, dict):
+    return sorted(tensor_or_dict.keys())
+  return []
+
+
 class _SIGNAL(object):
   """Signal used to control the thread of infeed/outfeed.
 
@@ -224,7 +231,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
   `metric_fn` runs on CPU to generate metrics and `tensors` represents the
   `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
   To be precise, TPU evaluation expects a slightly different signature from the
-  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
+  @{tf.estimator.Estimator}. While `EstimatorSpec.eval_metric_ops` expects a
   dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
   The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
   `tensors` usually specify the model logits, which are transferred back from
@@ -247,7 +254,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
   sending tensors from TPU to CPU. To reduce the overhead, try reducing the
   size of the tensors. The `tensors` are concatenated along their major (batch)
   dimension, and so must be >= rank 1. The `host_call` is useful for writing
-  summaries with `tf.contrib.summary.create_file_writer`.
+  summaries with @{tf.contrib.summary.create_file_writer}.
   """
 
   def __new__(cls,
@@ -711,8 +718,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
       features, labels = inputs.features_and_labels()
       signals = inputs.signals()
 
-      inputs_structure_recorder.validate_and_record_structure(
-          features, labels, signals)
+      inputs_structure_recorder.validate_and_record_structure(features, labels)
       unsharded_tensor_list = (
           inputs_structure_recorder.flatten_features_and_labels(
               features, labels, signals))
@@ -756,9 +762,13 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
     if not is_dataset:
       raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
                       'input pipeline configuration.')
+
     if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      # TODO(b/XXX): Add predict support for PER_HOST_V2
-      raise TypeError('Most PREDICT not yet supported in PER_HOST_V2 mode.')
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True,
+          num_invocations_per_step=ctx.num_of_replicas_per_host)
 
     hooks.append(inputs.dataset_initializer_hook())
     tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
@@ -768,6 +778,7 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
     control_deps = []
     per_host_sharded_inputs = []
     num_replicas_per_host = ctx.num_of_replicas_per_host
+    cached_signals = None
     with ops.device(device):
       if not inputs.is_dataset:
         raise TypeError('`input_fn` must return a `Dataset` for this mode.')
@@ -775,21 +786,32 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
         # Use control dependencies to ensure a deterministic ordering.
         with ops.control_dependencies(control_deps):
           features, labels = inputs.features_and_labels()  # Calls get_next()
+          signals = inputs.signals()
+
+          # All the replicas share the replica 0's stopping singal.
+          # This avoids inconsistent state among different model replcias.
+          if cached_signals:
+            signals['stopping'] = cached_signals['stopping']
+          else:
+            cached_signals = signals
 
         inputs_structure_recorder.validate_and_record_structure(
             features, labels)
         flattened_inputs = (
             inputs_structure_recorder.flatten_features_and_labels(
-                features, labels))
+                features, labels, signals))
         control_deps.extend(flattened_inputs)
         per_host_sharded_inputs.append(flattened_inputs)
 
       if inputs_structure_recorder.flattened_input_dims:
+        input_partition_dims = inputs_structure_recorder.flattened_input_dims
+        if signals:
+          input_partition_dims += [None] * len(signals)
         # pylint: disable=protected-access
         infeed_queue = tpu_feed._PartitionedInfeedQueue(
             number_of_tuple_elements=len(per_host_sharded_inputs[0]),
             host_id=host_id,
-            input_partition_dims=inputs_structure_recorder.flattened_input_dims,
+            input_partition_dims=input_partition_dims,
             device_assignment=ctx.device_assignment)
         per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
             per_host_sharded_inputs)
@@ -801,7 +823,13 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
             tpu_ordinal_function=tpu_ordinal_function_impl)
       captured_infeed_queue.capture(infeed_queue)
 
-    return per_host_enqueue_ops
+    if signals is None:
+      return per_host_enqueue_ops
+    else:
+      return {
+          'ops': per_host_enqueue_ops,
+          'signals': signals,
+      }
 
   return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
@@ -859,7 +887,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
             signals = inputs.signals()
 
             inputs_structure_recorder.validate_and_record_structure(
-                features, labels, signals)
+                features, labels)
             flattened_inputs = (
                 inputs_structure_recorder.flatten_features_and_labels(
                     features, labels, signals))
@@ -901,17 +929,19 @@ class _InputPipeline(object):
   inputs returned by the `input_fn` can have one of the following forms:
   1. features
   2. (features, labels)
+  3. ((arbitrarily nested structure of features), labels)
 
   Internally, form 1 is reformed to `(features, None)` as features and labels
   are passed separately to underlying methods. For TPU training, TPUEstimator
   may expect multiple `features` and `labels` tuples one for each core.
 
   TPUEstimator allows various different structures for inputs (namely `features`
-  and `labels`).  `features` can be `Tensor` or dict of string name to `Tensor`,
-  and `labels` could be `None`, `Tensor`, or dict of string name to `Tensor`.
-  TPU infeed/outfeed library expects flattened tensor list. So, `features` and
-  `labels` need to be flattened, before infeed enqueue, and the structure of
-  them needs to be recorded, in order to restore them after infeed dequeue.
+  and `labels`).  `features` can be `Tensor`, dict of string name to `Tensor`,
+  or nested tuples and `labels` could be `None`, `Tensor`, or dict of string
+  name to `Tensor`. TPU infeed/outfeed library expects flattened tensor list.
+  So, `features` and `labels` need to be flattened, before infeed enqueue, and
+  the structure of them needs to be recorded, in order to restore them after
+  infeed dequeue.
   """
 
   class InputsStructureRecorder(object):
@@ -919,10 +949,7 @@ class _InputPipeline(object):
 
     def __init__(self, input_partition_dims=None):
       # Holds the structure of inputs
-      self._feature_names = []
-      self._label_names = []
-      self._has_labels = False
-      self._signals_helper = None
+      self._feature_structure = {}
       self._flattened_input_dims = None
 
       if input_partition_dims:
@@ -949,7 +976,7 @@ class _InputPipeline(object):
       return self._flattened_input_dims
 
     def has_labels(self):
-      return self._has_labels
+      return 'labels' in self._feature_structure
 
     def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
                             label_dims_names, label_names, has_labels):
@@ -977,35 +1004,16 @@ class _InputPipeline(object):
 
       return flattened_input_dims
 
-    def validate_and_record_structure(self, features, labels, signals=None):
+    def validate_and_record_structure(self, features, labels):
       """Validates and records the structure of `features` and `labels`."""
-
-      def _extract_key_names(tensor_or_dict):
-        if tensor_or_dict is None:
-          return []
-        return sorted(tensor_or_dict.keys()) if isinstance(
-            tensor_or_dict, dict) else []
-
       # Extract structure.
       has_labels = labels is not None
       feature_names = _extract_key_names(features)
       label_names = _extract_key_names(labels)
 
-      if signals is not None and self._signals_helper is None:
-        # Record signals helper.
-        self._signals_helper = _SignalsHelper(signals)
-
-      if self._initialized:
-        # Verify the structure is same. The following should never happen.
-        assert feature_names == self._feature_names, 'feature keys mismatched'
-        assert label_names == self._label_names, 'label keys mismatched'
-        assert has_labels == self._has_labels, 'label presence mismatched'
-      else:
+      if not self._initialized:
         # Record structure.
         self._initialized = True
-        self._feature_names = feature_names
-        self._label_names = label_names
-        self._has_labels = has_labels
         if self._feature_dims is not None:
           feature_dims_names = _extract_key_names(self._feature_dims)
           if feature_dims_names != feature_names:
@@ -1027,24 +1035,12 @@ class _InputPipeline(object):
 
     def flatten_features_and_labels(self, features, labels, signals=None):
       """Flattens the `features` and `labels` to a single tensor list."""
-      flattened_inputs = []
-      if self._feature_names:
-        # We need a fixed ordering for enqueueing and dequeueing.
-        flattened_inputs.extend(
-            [features[name] for name in self._feature_names])
-      else:
-        flattened_inputs.append(features)
-
+      self._feature_structure['features'] = features
       if labels is not None:
-        if self._label_names:
-          # We need a fixed ordering for enqueueing and dequeueing.
-          flattened_inputs.extend([labels[name] for name in self._label_names])
-        else:
-          flattened_inputs.append(labels)
-
+        self._feature_structure['labels'] = labels
       if signals is not None:
-        flattened_inputs.extend(_SignalsHelper.as_tensor_list(signals))
-      return flattened_inputs
+        self._feature_structure['signals'] = signals
+      return data_nest.flatten(self._feature_structure)
 
     def unflatten_features_and_labels(self, flattened_inputs):
       """Restores the flattened inputs to original features and labels form.
@@ -1061,49 +1057,13 @@ class _InputPipeline(object):
         ValueError: If the number of expected tensors from `flattened_inputs`
           mismatches the recorded structure.
       """
-      expected_num_features = (
-          len(self._feature_names) if self._feature_names else 1)
-      if self._has_labels:
-        expected_num_labels = (
-            len(self._label_names) if self._label_names else 1)
-      else:
-        expected_num_labels = 0
-
-      expected_num_signals = (
-          self._signals_helper.num_signals if self._signals_helper else 0)
 
-      expected_num_tensors = (
-          expected_num_features + expected_num_labels + expected_num_signals)
-
-      if expected_num_tensors != len(flattened_inputs):
-        raise ValueError(
-            'The number of flattened tensors mismatches expected num. '
-            'Expected {}, got {}'.format(expected_num_tensors,
-                                         len(flattened_inputs)))
-      if self._feature_names:
-        unflattened_features = dict(
-            zip(self._feature_names, flattened_inputs[:expected_num_features]))
-      else:
-        # Single tensor case
-        unflattened_features = flattened_inputs[0]
-
-      if expected_num_labels == 0:
-        unflattened_label = None
-      elif self._label_names:
-        label_list = flattened_inputs[
-            expected_num_features:expected_num_features + expected_num_labels]
-        unflattened_label = dict(zip(self._label_names, label_list))
-      else:
-        # Single tensor case.
-        unflattened_label = flattened_inputs[expected_num_features]
-
-      signals = None
-      if expected_num_signals != 0:
-        tensor_list_for_signals = flattened_inputs[
-            expected_num_features + expected_num_labels:]
-        signals = self._signals_helper.unflatten(tensor_list_for_signals)
-
-      return _Inputs(unflattened_features, unflattened_label, signals=signals)
+      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
+                                                      flattened_inputs)
+      return _Inputs(
+          unflattened_inputs['features'],
+          unflattened_inputs.get('labels'),
+          signals=unflattened_inputs.get('signals'))
 
   def __init__(self, input_fn, batch_axis, ctx):
     """Constructor.
@@ -1505,12 +1465,14 @@ class _ModelFnWrapper(object):
               'The {} to the model returned by input_fn must have static shape.'
               ' Tensor: {}'.format(obj_name, obj))
       else:
-        for (key, tensor) in obj.items():
-          if not tensor.get_shape().is_fully_defined():
-            raise ValueError(
-                'The {} to the model returned by input_fn must have static '
-                'shape. Key: \'{}\', Tensor: {}'.format(
-                    obj_name, key, tensor))
+        for (key, value) in obj.items():
+          flattened_tensors = data_nest.flatten(value)
+          for tensor in flattened_tensors:
+            if not tensor.get_shape().is_fully_defined():
+              raise ValueError(
+                  'The {} to the model returned by input_fn must have static '
+                  'shape. Key: \'{}\', Tensor: {}'.format(
+                      obj_name, key, tensor))
 
     validate(features, 'features')
     if labels is not None:
@@ -2184,9 +2146,10 @@ class TPUEstimator(estimator_lib.Estimator):
                                mode=model_fn_lib.ModeKeys.PREDICT,
                                export_tags=None,
                                check_variables=True):
-    if mode != model_fn_lib.ModeKeys.PREDICT:
+    if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
       raise NotImplementedError(
-          'TPUEstimator only handles mode PREDICT for export_savedmodel(); '
+          'TPUEstimator only handles mode PREDICT for exporting '
+          'when `export_to_tpu` is `True`; '
           'got {}.'.format(mode))
 
     (super(TPUEstimator, self).
@@ -2484,16 +2447,12 @@ class TPUEstimator(estimator_lib.Estimator):
       with self._ctx.with_mode(mode) as ctx:
         model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
 
-        if mode != model_fn_lib.ModeKeys.PREDICT:
+        # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
+        # but not in `export_savedmodel()`.
+        if self._is_input_fn_invoked:
           is_export_mode = False
         else:
-          # For export_savedmodel, input_fn is never passed to Estimator. So, by
-          # checking the self._is_input_fn_invoked bit, we can know, given the
-          # mode == PREDICT, it is the .predict API, not export_savedmodel API.
-          if self._is_input_fn_invoked:
-            is_export_mode = False
-          else:
-            is_export_mode = True
+          is_export_mode = True
 
         # Clear the bit.
         self._is_input_fn_invoked = None
@@ -2865,8 +2824,6 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
 
 def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  num_cores = ctx.num_cores
-
   (single_tpu_predict_step, host_calls, captured_scaffold_fn,
    captured_predict_hooks
   ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
@@ -2885,7 +2842,7 @@ def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   (dummy_predict_op,) = tpu.shard(
       multi_tpu_predict_steps_on_single_shard,
       inputs=[],
-      num_shards=num_cores,
+      num_shards=ctx.num_replicas,
       outputs_from_all_shards=False,
       device_assignment=ctx.device_assignment)
 
@@ -3103,16 +3060,48 @@ class _Inputs(object):
 class _InputsWithStoppingSignals(_Inputs):
   """Inputs with `_StopSignals` inserted into the dataset."""
 
-  def __init__(self, dataset, batch_size, add_padding=False):
+  def __init__(self,
+               dataset,
+               batch_size,
+               add_padding=False,
+               num_invocations_per_step=1):
 
     assert dataset is not None
-
     user_provided_dataset = dataset.map(
         _InputsWithStoppingSignals.insert_stopping_signal(
             stop=False, batch_size=batch_size, add_padding=add_padding))
-    final_batch_dataset = dataset.take(1).map(
-        _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=True, batch_size=batch_size, add_padding=add_padding))
+    if num_invocations_per_step == 1:
+      final_batch_dataset = dataset.take(1).map(
+          _InputsWithStoppingSignals.insert_stopping_signal(
+              stop=True, batch_size=batch_size, add_padding=add_padding))
+    else:
+      # We append (2 * num_invocations_per_step - 1) batches for exhausting the
+      # user_provided_dataset and stop properly.
+      # For example, if num_invocations_per_step is 2, we append 3 additional
+      # padding batches: b1, b2, b3.
+      # If user_provided_dataset contains two batches: a1, a2
+      # Step 1: [a1, a2]
+      # Step 2: [b1, b2] -> STOP
+      # If user_provided_dataset contains three batches: a1, a2, a3.
+      # The training loops:
+      # Step 1: [a1, a2]
+      # Step 2: [a3, b1]
+      # Step 3: [b2, b3] -> STOP.
+      final_batch_dataset = dataset.take(1).map(
+          _InputsWithStoppingSignals.insert_stopping_signal(
+              stop=True, batch_size=batch_size, add_padding=add_padding))
+      final_batch_dataset = final_batch_dataset.repeat(
+          2 * num_invocations_per_step - 1)
+
+      def _set_mask(data_dict):
+        signals = data_dict['signals']
+        signals['padding_mask'] = array_ops.ones_like(signals['padding_mask'])
+        data_dict['signals'] = signals
+        return data_dict
+
+      # Mask out the extra batch.
+      final_batch_dataset = final_batch_dataset.map(_set_mask)
+
     dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
 
     super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
@@ -3338,26 +3327,6 @@ class _PaddingSignals(object):
     return padding_mask
 
 
-class _SignalsHelper(object):
-  """A general helper class to handle common signals manipulation."""
-
-  def __init__(self, signals):
-    self._signal_keys = []
-    for key in sorted(iter(signals.keys())):
-      self._signal_keys.append(key)
-
-  @property
-  def num_signals(self):
-    return len(self._signal_keys)
-
-  def unflatten(self, tensor_list):
-    return dict(zip(self._signal_keys, tensor_list))
-
-  @staticmethod
-  def as_tensor_list(signals):
-    return [signals[key] for key in sorted(iter(signals.keys()))]
-
-
 def _verify_cross_hosts_transfer_size(tensor_dict, message):
   total_size = 0
   tensor_structure = {}
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
index 3e90957e6d..bd530fdc3a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
@@ -286,6 +286,59 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(sliced_features)
 
+  def test_slice_with_multi_invocations_per_step(self):
+    num_samples = 3
+    batch_size = 2
+
+    params = {'batch_size': batch_size}
+    input_fn, (a, b) = make_input_fn(num_samples=num_samples)
+
+    with ops.Graph().as_default():
+      dataset = input_fn(params)
+      inputs = tpu_estimator._InputsWithStoppingSignals(
+          dataset, batch_size, add_padding=True, num_invocations_per_step=2)
+      hook = inputs.dataset_initializer_hook()
+      features, _ = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      sliced_features = (
+          tpu_estimator._PaddingSignals.slice_tensor_or_dict(features, signals))
+
+      with session.Session() as sess:
+        hook.begin()
+        hook.after_create_session(sess, coord=None)
+
+        result, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertAllEqual(a[:batch_size], result['a'])
+        self.assertAllEqual(b[:batch_size], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # This is the final partial batch.
+        result, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertEqual(1, len(result['a']))
+        self.assertAllEqual(a[batch_size:num_samples], result['a'])
+        self.assertAllEqual(b[batch_size:num_samples], result['b'])
+        self.assertAllEqual([[0.]] * batch_size, evaluated_signals['stopping'])
+
+        # We should see 3 continuous batches with STOP ('1') as signals and all
+        # of them have mask 1.
+        _, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([1.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        _, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([1.] * batch_size,
+                            evaluated_signals['padding_mask'])
+
+        _, evaluated_signals = sess.run([sliced_features, signals])
+        self.assertAllEqual([[1.]] * batch_size, evaluated_signals['stopping'])
+        self.assertAllEqual([1.] * batch_size,
+                            evaluated_signals['padding_mask'])
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(sliced_features)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
index 53d33f4077..74a675b645 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
@@ -186,3 +186,7 @@ class CrossShardOptimizer(optimizer.Optimizer):
       A list of strings.
     """
     return self._opt.get_slot_names(*args, **kwargs)
+
+  def variables(self):
+    """Forwarding the variables from the underlying optimizer."""
+    return self._opt.variables()
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index edd71fb250..3547e71184 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -14,7 +14,9 @@
 # ==============================================================================
 """Training and input utilities.
 
-See @{$python/contrib.training} guide.
+See
+[Contrib Training](https://tensorflow.org/api_guides/python/contrib.training)
+guide.
 
 @@batch_sequences_with_states
 @@NextQueuedSequenceBatch
diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index df07ff44ee..afeef978f3 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -108,7 +108,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
                   expected_seq4_batch1, expected_seq4_batch2,
                   key=None, make_keys_unique=False):
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       next_batch = sqss.batch_sequences_with_states(
           input_key=key if key is not None else self.key,
           input_sequences=self.sequences,
@@ -332,7 +332,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
         "seq4": self.sequences["seq4"],
     }
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    ".*should be a multiple of: 3, but saw "
                                    "value: 4. Consider setting pad=True."):
@@ -508,7 +508,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
-    with ops.Graph().as_default() as g, self.test_session(graph=g):
+    with ops.Graph().as_default() as g, self.session(graph=g):
       sequences = {
           "key_1": constant_op.constant([1, 2, 3]),  # length 3
           "key_2": constant_op.constant([1.5, 2.5])  # length 2
@@ -520,7 +520,7 @@ class PaddingTest(test.TestCase):
         padded_seq["key_1"].eval()
 
   def testPadding(self):
-    with ops.Graph().as_default() as g, self.test_session(graph=g):
+    with ops.Graph().as_default() as g, self.session(graph=g):
       sequences = {
           "key_1": constant_op.constant([1, 2]),
           "key_2": constant_op.constant([0.5, -1.0]),
@@ -549,7 +549,7 @@ class PaddingTest(test.TestCase):
     val2 = np.array([9, 12])
     shape2 = np.array([5])
 
-    with ops.Graph().as_default() as g, self.test_session(graph=g):
+    with ops.Graph().as_default() as g, self.session(graph=g):
       sp_tensor1 = sparse_tensor.SparseTensor(
           indices=array_ops.constant(ind1, dtypes.int64),
           values=array_ops.constant(val1, dtypes.int64),
diff --git a/tensorflow/contrib/training/python/training/bucket_ops_test.py b/tensorflow/contrib/training/python/training/bucket_ops_test.py
index 504f1fcd41..b259e0ee83 100644
--- a/tensorflow/contrib/training/python/training/bucket_ops_test.py
+++ b/tensorflow/contrib/training/python/training/bucket_ops_test.py
@@ -112,7 +112,7 @@ class BucketTest(test.TestCase):
     self.assertAllEqual(
         [[32], [32, None], [32, 3], [None, None]],
         [out.get_shape().as_list() for out in bucketed_dynamic[1]])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for v in range(32):
         self.enqueue_inputs(sess, {
             self.scalar_int_feed: v,
@@ -162,7 +162,7 @@ class BucketTest(test.TestCase):
     self.assertAllEqual(
         [[None], [None, None], [None, 3], [None, None]],
         [out.get_shape().as_list() for out in bucketed_dynamic[1]])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for v in range(15):
         self.enqueue_inputs(sess, {
             self.scalar_int_feed: v,
@@ -204,7 +204,7 @@ class BucketTest(test.TestCase):
     self.assertAllEqual(
         [[32], [32, None], [32, 3], [None, None]],
         [out.get_shape().as_list() for out in bucketed_dynamic[1]])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for v in range(64):
         self.enqueue_inputs(sess, {
             self.scalar_int_feed: v,
@@ -286,7 +286,7 @@ class BucketTest(test.TestCase):
     self.assertAllEqual(
         [[32], [32, None], [32, 3]],
         [out.get_shape().as_list() for out in bucketed_dynamic[1]])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for v in range(128):
         self.enqueue_inputs(sess, {
             self.scalar_int_feed: v,
@@ -405,7 +405,7 @@ class BucketBySequenceLengthTest(test.TestCase):
               num_pairs_to_enqueue - (batch_size - 1) * num_buckets,
               num_pairs_dequeued)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       coord = coordinator.Coordinator()
 
       # Feed the inputs, then close the input thread.
diff --git a/tensorflow/contrib/training/python/training/evaluation_test.py b/tensorflow/contrib/training/python/training/evaluation_test.py
index c36d00e842..ec47fe5d97 100644
--- a/tensorflow/contrib/training/python/training/evaluation_test.py
+++ b/tensorflow/contrib/training/python/training/evaluation_test.py
@@ -67,7 +67,7 @@ class CheckpointIteratorTest(test.TestCase):
     global_step = variables.get_or_create_global_step()
     saver = saver_lib.Saver()  # Saves the global step.
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       session.run(variables_lib.global_variables_initializer())
       save_path = os.path.join(checkpoint_dir, 'model.ckpt')
       saver.save(session, save_path, global_step=global_step)
diff --git a/tensorflow/contrib/training/python/training/resample_test.py b/tensorflow/contrib/training/python/training/resample_test.py
index 774241a816..8665a24883 100644
--- a/tensorflow/contrib/training/python/training/resample_test.py
+++ b/tensorflow/contrib/training/python/training/resample_test.py
@@ -44,7 +44,7 @@ class ResampleTest(test.TestCase):
         ([3], [0, 0, 0]),
         ([0, 1, 2, 3], [1, 2, 2, 3, 3, 3]),
     ]
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for inputs, expected in cases:
         array_inputs = numpy.array(inputs, dtype=numpy.int32)
         actual = sess.run(resample._repeat_range(array_inputs))
@@ -65,7 +65,7 @@ class ResampleTest(test.TestCase):
 
     init = control_flow_ops.group(variables.local_variables_initializer(),
                                   variables.global_variables_initializer())
-    with self.test_session() as s:
+    with self.cached_session() as s:
       s.run(init)  # initialize
 
       # outputs
@@ -112,7 +112,7 @@ class ResampleTest(test.TestCase):
     init = control_flow_ops.group(variables.local_variables_initializer(),
                                   variables.global_variables_initializer())
     expected_sum_op = math_ops.reduce_sum(vals)
-    with self.test_session() as s:
+    with self.cached_session() as s:
       s.run(init)
       expected_sum = n * s.run(expected_sum_op)
 
@@ -147,7 +147,7 @@ class ResampleTest(test.TestCase):
 
     resampled = resample.resample_at_rate([vals], rates)
 
-    with self.test_session() as s:
+    with self.cached_session() as s:
       rs, = s.run(resampled, {
           vals: list(range(count)),
           rates: numpy.zeros(
diff --git a/tensorflow/contrib/training/python/training/sampling_ops_test.py b/tensorflow/contrib/training/python/training/sampling_ops_test.py
index bf7fb4fd48..1aeff7dc80 100644
--- a/tensorflow/contrib/training/python/training/sampling_ops_test.py
+++ b/tensorflow/contrib/training/python/training/sampling_ops_test.py
@@ -146,7 +146,7 @@ class StratifiedSampleTest(test.TestCase):
 
     for illegal_label in illegal_labels:
       # Run session that should fail.
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.assertRaises(errors_impl.InvalidArgumentError):
           sess.run([val_tf, lbl_tf],
                    feed_dict={label_ph: illegal_label,
@@ -154,7 +154,7 @@ class StratifiedSampleTest(test.TestCase):
 
     for illegal_prob in illegal_probs:
       # Run session that should fail.
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         with self.assertRaises(errors_impl.InvalidArgumentError):
           sess.run([prob_tf],
                    feed_dict={label_ph: valid_labels,
@@ -172,7 +172,7 @@ class StratifiedSampleTest(test.TestCase):
     summary_op = logging_ops.merge_summary(
         ops.get_collection(ops.GraphKeys.SUMMARIES))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
@@ -197,7 +197,7 @@ class StratifiedSampleTest(test.TestCase):
         batch_size,
         init_probs=[0, .3, 0, .7, 0],
         enqueue_many=True)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
@@ -228,7 +228,7 @@ class StratifiedSampleTest(test.TestCase):
 
     # Run graph to make sure there are no shape-related runtime errors.
     for vals, labels in legal_input_pairs:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         sess.run([val_tf, labels_tf],
                  feed_dict={vals_ph: vals,
                             labels_ph: labels})
@@ -253,7 +253,7 @@ class StratifiedSampleTest(test.TestCase):
     self.assertEqual(len(val_list), len(val_input_batch))
     self.assertTrue(isinstance(lbls, ops.Tensor))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
@@ -283,7 +283,7 @@ class StratifiedSampleTest(test.TestCase):
     # Run session and keep track of how frequently the labels and values appear.
     data_l = []
     label_l = []
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Need to initialize variables that keep running total of classes seen.
       variables.global_variables_initializer().run()
 
@@ -374,7 +374,7 @@ class RejectionSampleTest(test.TestCase):
         'rejection_sample/prob_with_checks:0')
 
     # Run session that should fail.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for illegal_prob in [-0.1, 1.1]:
         with self.assertRaises(errors_impl.InvalidArgumentError):
           sess.run(prob_tensor, feed_dict={prob_ph: illegal_prob})
@@ -393,7 +393,7 @@ class RejectionSampleTest(test.TestCase):
     sample = sampling_ops.rejection_sample(tensor_list, accept_prob_fn,
                                            batch_size)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
diff --git a/tensorflow/contrib/training/python/training/sampling_ops_threading_test.py b/tensorflow/contrib/training/python/training/sampling_ops_threading_test.py
index ca78c0029e..73ad859ab3 100644
--- a/tensorflow/contrib/training/python/training/sampling_ops_threading_test.py
+++ b/tensorflow/contrib/training/python/training/sampling_ops_threading_test.py
@@ -59,7 +59,7 @@ class SamplingOpsThreadingTest(test.TestCase):
     out_tensor = queue.dequeue()
 
     # Run the multi-threaded session.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Need to initialize variables that keep running total of classes seen.
       variables.global_variables_initializer().run()
 
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver_test.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver_test.py
index 7aebd9d9fe..8932b905c9 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver_test.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver_test.py
@@ -36,7 +36,7 @@ from tensorflow.python.platform import test
 class SequenceQueueingStateSaverTest(test.TestCase):
 
   def testSequenceInputWrapper(self):
-    with self.test_session():
+    with self.cached_session():
       length = 3
       key = "key"
       padded_length = 4
@@ -54,7 +54,7 @@ class SequenceQueueingStateSaverTest(test.TestCase):
       self.assertTrue(isinstance(input_wrapper.context["context1"], ops.Tensor))
 
   def testStateSaverWithTwoSimpleSteps(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_size_value = 2
       batch_size = constant_op.constant(batch_size_value)
       num_unroll = 2
@@ -159,7 +159,7 @@ class SequenceQueueingStateSaverTest(test.TestCase):
       self.assertEqual(0, state_saver.barrier.ready_size().eval())
 
   def testStateSaverFailsIfPaddedLengthIsNotMultipleOfNumUnroll(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_size = constant_op.constant(32)
       num_unroll = 17
       bad_padded_length = 3
@@ -194,7 +194,7 @@ class SequenceQueueingStateSaverTest(test.TestCase):
                  })
 
   def _testStateSaverFailsIfCapacityTooSmall(self, batch_size):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       num_unroll = 2
       length = array_ops.placeholder(dtypes.int32)
       key = array_ops.placeholder(dtypes.string)
@@ -243,7 +243,7 @@ class SequenceQueueingStateSaverTest(test.TestCase):
       self._testStateSaverFailsIfCapacityTooSmall(batch_size)
 
   def testStateSaverFailsIfInconsistentPaddedLength(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_size = constant_op.constant(32)
       num_unroll = 17
       length = array_ops.placeholder(dtypes.int32)
@@ -282,7 +282,7 @@ class SequenceQueueingStateSaverTest(test.TestCase):
 
   def testStateSaverFailsIfInconsistentWriteState(self):
     # TODO(b/26910386): Identify why this infrequently causes timeouts.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_size = constant_op.constant(1)
       num_unroll = 17
       length = array_ops.placeholder(dtypes.int32)
@@ -326,7 +326,7 @@ class SequenceQueueingStateSaverTest(test.TestCase):
   def testStateSaverWithManyInputsReadWriteThread(self):
     batch_size_value = 32
     num_proc_threads = 100
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_size = constant_op.constant(batch_size_value)
       num_unroll = 17
       length = array_ops.placeholder(dtypes.int32)
@@ -490,7 +490,7 @@ class SequenceQueueingStateSaverTest(test.TestCase):
       self.assertGreater(processed_count[0], 2 * 20 * batch_size_value)
 
   def testStateSaverProcessesExamplesInOrder(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_size_value = 32
       batch_size = constant_op.constant(batch_size_value)
       num_unroll = 17
@@ -563,7 +563,7 @@ class SequenceQueueingStateSaverTest(test.TestCase):
       self.assertEqual(get_ready_size.eval(), 0)
 
   def testStateSaverCanHandleVariableBatchsize(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       batch_size = array_ops.placeholder(dtypes.int32)
       num_unroll = 17
       length = array_ops.placeholder(dtypes.int32)
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
index 4a46e9a49e..3269d5fef2 100644
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
+++ b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
@@ -62,7 +62,7 @@ class SGDRDecayTest(test_util.TensorFlowTestCase):
   def get_sgdr_values(self, lr, initial_period_steps, t_mul, iters):
     """Get an array with learning rate values from the consecutive steps
     using current tensorflow implementation."""
-    with self.test_session():
+    with self.cached_session():
       step = placeholder(dtypes.int32)
 
       decay = sgdr_decay(lr, step, initial_period_steps, t_mul)
@@ -76,7 +76,7 @@ class SGDRDecayTest(test_util.TensorFlowTestCase):
     """Compare values generated by tensorflow implementation to the values
     generated by the original implementation
     (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py)."""
-    with self.test_session():
+    with self.cached_session():
       lr = 10.0
       init_steps = 2
       t_mul = 3
@@ -92,7 +92,7 @@ class SGDRDecayTest(test_util.TensorFlowTestCase):
   def testMDecay(self):
     """Test m_mul argument. Check values for learning rate at the beginning
     of the first, second, third and fourth period. """
-    with self.test_session():
+    with self.cached_session():
       step = placeholder(dtypes.int32)
 
       lr = 0.1
@@ -121,7 +121,7 @@ class SGDRDecayTest(test_util.TensorFlowTestCase):
   def testCos(self):
     """Check learning rate values at the beginning, in the middle
     and at the end of the period."""
-    with self.test_session():
+    with self.cached_session():
       step = placeholder(dtypes.int32)
       lr = 0.2
       t_e = 1000
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
index df0a186f4f..d9b0511a98 100644
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
@@ -79,7 +79,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
     iterator = dataset.make_one_shot_iterator()
     queue_handle, value = iterator.get_next()
     enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual([[0, 0, 0]], sess.run(value))
       value_1, _ = sess.run([value, enqueue_negative])
       self.assertAllEqual([[1, 0, 0]], value_1)
@@ -101,7 +101,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
     iterator = dataset.make_one_shot_iterator()
     queue_handle, value = iterator.get_next()
     enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual([0], sess.run(value))
       value_1, _ = sess.run([value, enqueue_negative])
       self.assertEqual([1], value_1)
@@ -126,7 +126,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
     enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]],
                                                   array_ops.expand_dims(
                                                       value[0], axis=0))
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       value_0, _ = sess.run([value, enqueue_negative])
       self.assertAllEqual([0, 1], value_0)
       value_1, _ = sess.run([value, enqueue_zeroth])
@@ -147,7 +147,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
         tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i)
         for i in range(1000)
     ]
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       value_0, _ = sess.run((value, enqueue_many_more))
       self.assertEqual([0], value_0)
       rest = []
@@ -174,7 +174,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
     iterator = dataset.make_one_shot_iterator()
     queue_handle, value = iterator.get_next()
     enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       i = 0
       while i < 4:
         received, _ = sess.run((value, enqueue))
@@ -199,7 +199,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
             batch_size=1, padded_shapes=[2]))
     iterator = dataset.make_one_shot_iterator()
     _, value = iterator.get_next()
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.assertRaisesOpError(
           r"Incompatible input shapes at component 0 between "
           r"input dataset this dataset: \[3\] vs. \[2\]"):
@@ -224,7 +224,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
                                                      np.array(
                                                          [[1]], dtype=np.int32))
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with self.assertRaisesOpError(
           "mismatched number of tensors.  Queue expects 1 tensors but "
           "tried to insert 2"):
@@ -274,7 +274,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
     with ops.control_dependencies([enqueue_rest_op]):
       calc = array_ops.identity(value_head)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc))
       self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc))
       self.assertAllEqual([[6, 6]], sess.run(calc))
@@ -304,7 +304,7 @@ class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
     iterator = dataset.make_one_shot_iterator()
     _, (unused_count, padded_value) = iterator.get_next()
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]],
                           sess.run(padded_value))
       self.assertAllEqual([[6] * 6], sess.run(padded_value))
diff --git a/tensorflow/contrib/training/python/training/training_test.py b/tensorflow/contrib/training/python/training/training_test.py
index 94cf7788b2..3b524ac8c7 100644
--- a/tensorflow/contrib/training/python/training/training_test.py
+++ b/tensorflow/contrib/training/python/training/training_test.py
@@ -62,7 +62,7 @@ class ClipGradsTest(test.TestCase):
     clipped_gradients_to_variables = training.clip_gradient_norms(
         gradients_to_variables, 3.0)
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       session.run(variables_lib2.global_variables_initializer())
       self.assertAlmostEqual(4.0, gradients_to_variables[0][0].eval())
       self.assertAlmostEqual(3.0, clipped_gradients_to_variables[0][0].eval())
@@ -75,7 +75,7 @@ class ClipGradsTest(test.TestCase):
     clipped_gradients_to_variables = training.clip_gradient_norms_fn(3.0)(
         gradients_to_variables)
 
-    with self.test_session() as session:
+    with self.cached_session() as session:
       session.run(variables_lib2.global_variables_initializer())
       self.assertAlmostEqual(4.0, gradients_to_variables[0][0].eval())
       self.assertAlmostEqual(3.0, clipped_gradients_to_variables[0][0].eval())
@@ -122,7 +122,7 @@ class CreateTrainOpTest(test.TestCase):
       moving_variance = variables_lib.get_variables_by_name('moving_variance')[
           0]
 
-      with self.test_session() as session:
+      with self.cached_session() as session:
         # Initialize all variables
         session.run(variables_lib2.global_variables_initializer())
         mean, variance = session.run([moving_mean, moving_variance])
@@ -155,7 +155,7 @@ class CreateTrainOpTest(test.TestCase):
       moving_variance = variables_lib.get_variables_by_name('moving_variance')[
           0]
 
-      with self.test_session() as session:
+      with self.cached_session() as session:
         # Initialize all variables
         session.run(variables_lib2.global_variables_initializer())
         mean, variance = session.run([moving_mean, moving_variance])
@@ -186,7 +186,7 @@ class CreateTrainOpTest(test.TestCase):
 
       global_step = variables_lib.get_or_create_global_step()
 
-      with self.test_session() as session:
+      with self.cached_session() as session:
         # Initialize all variables
         session.run(variables_lib2.global_variables_initializer())
 
@@ -209,7 +209,7 @@ class CreateTrainOpTest(test.TestCase):
 
       global_step = variables_lib.get_or_create_global_step()
 
-      with self.test_session() as session:
+      with self.cached_session() as session:
         # Initialize all variables
         session.run(variables_lib2.global_variables_initializer())
 
@@ -535,7 +535,7 @@ class TrainTest(test.TestCase):
       train_biases = training.create_train_op(
           total_loss, optimizer, variables_to_train=[biases])
 
-      with self.test_session() as session:
+      with self.cached_session() as session:
         # Initialize the variables.
         session.run(variables_lib2.global_variables_initializer())
 
diff --git a/tensorflow/contrib/util/__init__.py b/tensorflow/contrib/util/__init__.py
index 08741cf8ca..338acef63f 100644
--- a/tensorflow/contrib/util/__init__.py
+++ b/tensorflow/contrib/util/__init__.py
@@ -15,7 +15,7 @@
 
 """Utilities for dealing with Tensors.
 
-See @{$python/contrib.util} guide.
+See [Contrib Util](https://tensorflow.org/api_guides/python/contrib.util) guide.
 
 @@constant_value
 @@make_tensor_proto
diff --git a/tensorflow/contrib/verbs/grpc_verbs_client.h b/tensorflow/contrib/verbs/grpc_verbs_client.h
index 2cfaa4986c..e07085502f 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_client.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_client.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
-#define TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_CLIENT_H_
+#define TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_CLIENT_H_
 
 #include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
@@ -47,4 +47,4 @@ class GrpcVerbsClient {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_CLIENT_H_
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
index abe5e08b07..cfb9b7ddd7 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
-#define TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_IMPL_H_
+#define TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_IMPL_H_
 
 #include "grpcpp/impl/codegen/async_stream.h"
 #include "grpcpp/impl/codegen/async_unary_call.h"
@@ -86,4 +86,4 @@ class VerbsService GRPC_FINAL {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_IMPL_H_
diff --git a/tensorflow/contrib/verbs/verbs_util.h b/tensorflow/contrib/verbs/verbs_util.h
index 5cd0a3533a..6277bc4b41 100644
--- a/tensorflow/contrib/verbs/verbs_util.h
+++ b/tensorflow/contrib/verbs/verbs_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_RDMA_UTIL_H_
-#define TENSORFLOW_CONTRIB_RDMA_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_VERBS_UTIL_H_
+#define TENSORFLOW_CONTRIB_VERBS_VERBS_UTIL_H_
 
 #include <string>
 
@@ -30,4 +30,4 @@ class VerbsUtil {
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_RDMA_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_VERBS_UTIL_H_
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 0af8627290..0882cc3c8b 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -121,6 +121,7 @@ load(
     "tf_additional_minimal_lib_srcs",
     "tf_additional_mpi_lib_defines",
     "tf_additional_proto_hdrs",
+    "tf_additional_proto_compiler_hdrs",
     "tf_additional_proto_srcs",
     "tf_additional_test_deps",
     "tf_additional_test_srcs",
@@ -128,6 +129,7 @@ load(
     "tf_jspb_proto_library",
     "tf_kernel_tests_linkstatic",
     "tf_lib_proto_parsing_deps",
+    "tf_lib_proto_compiler_deps",
     "tf_nano_proto_library",
     "tf_platform_hdrs",
     "tf_platform_srcs",
@@ -149,6 +151,7 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
+    "mkl_deps",
 )
 
 exports_files(["ops/ops.pbtxt"])
@@ -372,6 +375,7 @@ cc_library(
         ":lib_platform",
         ":platform_base",
         "//tensorflow/core/platform/default/build_config:port",
+        "@com_google_absl//absl/base",
         "@snappy",
     ],
 )
@@ -612,6 +616,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "lib_proto_compiler",
+    hdrs = [
+        "platform/protobuf_compiler.h",
+    ] + tf_additional_proto_compiler_hdrs(),
+    copts = tf_copts(),
+    deps = tf_lib_proto_compiler_deps() + [
+        ":lib_proto_parsing",
+    ],
+)
+
 # This build rule (along with :lib_internal, :framework, and
 # :framework_internal) purposefully omits the definitions of many declared
 # symbols, which are included in //tensorflow:libtensorflow_framework.so. Using
@@ -654,8 +669,11 @@ cc_library(
         "lib/io/table_builder.h",
         "lib/io/table_options.h",
         "lib/math/math_util.h",
+        "lib/monitoring/collected_metrics.h",
+        "lib/monitoring/collection_registry.h",
         "lib/monitoring/counter.h",
         "lib/monitoring/gauge.h",
+        "lib/monitoring/metric_def.h",
         "lib/monitoring/sampler.h",
         "lib/random/distribution_sampler.h",
         "lib/random/philox_random.h",
@@ -1558,6 +1576,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":mobile_additional_lib_deps",
         ":protos_all_cc_impl",
         ":stats_calculator_portable",
         "//third_party/eigen3",
@@ -1568,6 +1587,11 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "mobile_additional_lib_deps",
+    deps = tf_additional_lib_deps(),
+)
+
 # Native library support for iOS applications.
 #
 # bazel  build --config=ios_x86_64 \
@@ -1599,6 +1623,7 @@ cc_library(
     copts = tf_copts() + ["-Os"] + ["-std=c++11"],
     visibility = ["//visibility:public"],
     deps = [
+        ":mobile_additional_lib_deps",
         ":protos_all_cc_impl",
         ":stats_calculator_portable",
         "//third_party/eigen3",
@@ -1995,9 +2020,6 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
     "lib/io/zlib_compression_options.h",
     "lib/io/zlib_inputstream.h",
     "lib/io/zlib_outputbuffer.h",
-    "lib/monitoring/collected_metrics.h",
-    "lib/monitoring/collection_registry.h",
-    "lib/monitoring/metric_def.h",
     "lib/monitoring/mobile_counter.h",
     "lib/monitoring/mobile_gauge.h",
     "lib/monitoring/mobile_sampler.h",
@@ -2246,6 +2268,8 @@ cc_library(
     srcs = if_android([
         "lib/gif/gif_io.cc",
         "platform/gif.h",
+        "lib/strings/strcat.h",
+        "lib/strings/numbers.h",
     ]),
     hdrs = [
         "lib/bfloat16/bfloat16.h",
@@ -2336,6 +2360,7 @@ tf_generate_proto_text_sources(
     srcs = COMMON_PROTO_SRCS,
     protodeps = ERROR_CODES_PROTO_SRCS,
     srcs_relative_dir = "tensorflow/core/",
+    visibility = ["//visibility:public"],
     deps = [
         ":error_codes_proto_text",
         ":lib_internal",
@@ -2448,6 +2473,7 @@ cc_header_only_library(
 
 cc_header_only_library(
     name = "core_cpu_headers_lib",
+    visibility = ["//visibility:public"],
     deps = [
         ":core_cpu_lib",
     ],
@@ -2514,12 +2540,7 @@ tf_cuda_library(
     ] + if_static(
         extra_deps = ["@protobuf_archive//:protobuf"],
         otherwise = ["@protobuf_archive//:protobuf_headers"],
-    ) + if_mkl(
-        [
-            "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn",
-        ],
-    ),
+    ) + mkl_deps(),
     alwayslink = 1,
 )
 
@@ -2576,6 +2597,7 @@ tf_cuda_library(
 # TODO(josh11b): Is this needed, or can we just use ":protos_all_cc"?
 cc_library(
     name = "protos_cc",
+    visibility = ["//visibility:public"],
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
@@ -2800,12 +2822,7 @@ tf_cuda_library(
         ":protos_all_cc",
         "//third_party/eigen3",
         "//tensorflow/core/grappler:grappler_item",
-    ] + if_mkl(
-        [
-            "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn",
-        ],
-    ),
+    ] + mkl_deps(),
     alwayslink = 1,
 )
 
@@ -2845,12 +2862,7 @@ tf_cuda_library(
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//third_party/eigen3",
         "//tensorflow/core/kernels:required",
-    ] + if_mkl(
-        [
-            "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn",
-        ],
-    ) + tf_additional_core_deps() + if_static([":core_cpu_impl"]),
+    ] + mkl_deps() + tf_additional_core_deps() + if_static([":core_cpu_impl"]),
     alwayslink = 1,
 )
 
@@ -3857,11 +3869,7 @@ tf_cuda_only_cc_test(
         ":test",
         ":test_main",
         "//third_party/eigen3",
-    ] + if_mkl(
-        [
-            "//third_party/mkl:intel_binary_blob",
-        ],
-    ),
+    ] + mkl_deps(),
 )
 
 tf_cc_test_gpu(
diff --git a/tensorflow/core/api_def/base_api/api_def_DivNoNan.pbtxt b/tensorflow/core/api_def/base_api/api_def_DivNoNan.pbtxt
new file mode 100644
index 0000000000..5604a1a89e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DivNoNan.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "DivNoNan"
+  summary: "Returns 0 if the denominator is zero."
+  description: <<END
+
+*NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt b/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt
index 58262a385c..37d1a9dcbf 100644
--- a/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt
@@ -27,5 +27,15 @@ For example:
 fill([2, 3], 9) ==> [[9, 9, 9]
                      [9, 9, 9]]
 ```
+
+`tf.fill` differs from `tf.constant` in a few ways:
+
+*   `tf.fill` only supports scalar contents, whereas `tf.constant` supports
+    Tensor values.
+*   `tf.fill` creates an Op in the computation graph that constructs the actual
+    Tensor value at runtime. This is in contrast to `tf.constant` which embeds
+    the entire Tensor into the graph with a `Const` node.
+*   Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes
+    based on other runtime Tensors, unlike `tf.constant`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
index a0e42dd02c..9f3f9b276b 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
@@ -123,5 +123,7 @@ Batched indexing into a 3-tensor:
               [['a1', 'b1'], ['c1', 'd1']]]
     output = [['b0', 'b1'], ['d0', 'c1']]
 ```
+
+See also `tf.gather` and `tf.batch_gather`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
index 162ef2b033..c6104da4a6 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
@@ -54,5 +54,7 @@ params.shape[axis + 1:]` where:
 Note that on CPU, if an out of bound index is found, an error is returned.
 On GPU, if an out of bound index is found, a 0 is stored in the
 corresponding output value.
+
+See also `tf.batch_gather` and `tf.gather_nd`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_HostConst.pbtxt b/tensorflow/core/api_def/base_api/api_def_HostConst.pbtxt
new file mode 100644
index 0000000000..9d04a01f6f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_HostConst.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "HostConst"
+  attr {
+    name: "value"
+    description: <<END
+Attr `value` is the tensor to return.
+END
+  }
+  visibility: SKIP
+  summary: "Returns a constant tensor on the host. Only for writing C++ tests."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt
index d7b56aec87..46da1de1c3 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt
@@ -1,32 +1,5 @@
 op {
   graph_op_name: "MatrixExponential"
-  in_arg {
-    name: "input"
-    description: <<END
-Shape is `[..., M, M]`.
-END
-  }
-  out_arg {
-    name: "output"
-    description: <<END
-Shape is `[..., M, M]`.
-
-@compatibility(scipy)
-Equivalent to scipy.linalg.expm
-@end_compatibility
-END
-  }
-  summary: "Computes the matrix exponential of one or more square matrices:"
-  description: <<END
-\\(exp(A) = \sum_{n=0}^\infty A^n/n!\\)
-
-The exponential is computed using a combination of the scaling and squaring
-method and the Pade approximation. Details can be founds in:
-Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
-revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
-
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices. The output is a tensor of the same shape as the input
-containing the exponential for all input submatrices `[..., :, :]`.
-END
+  visibility: SKIP
+  summary: "Deprecated, use python implementation tf.linalg.matrix_exponential."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt
new file mode 100644
index 0000000000..3de2f18fc2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "ParseExampleDataset"
+  in_arg {
+    name: "dense_defaults"
+    description: <<END
+A dict mapping string keys to `Tensor`s.
+The keys of the dict must match the dense_keys of the feature.
+END
+  }
+  attr {
+    name: "sparse_keys"
+    description: <<END
+A list of string keys in the examples features.
+The results for these keys will be returned as `SparseTensor` objects.
+END
+  }
+  attr {
+    name: "dense_keys"
+    description: <<END
+A list of Ndense string Tensors (scalars).
+The keys expected in the Examples features associated with dense values.
+END
+  }
+  attr {
+    name: "sparse_types"
+    description: <<END
+A list of `DTypes` of the same length as `sparse_keys`.
+Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+and `tf.string` (`BytesList`) are supported.
+END
+  }
+    attr {
+    name: "Tdense"
+    description: <<END
+A list of DTypes of the same length as `dense_keys`.
+Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+and `tf.string` (`BytesList`) are supported.
+
+END
+  }
+  attr {
+    name: "dense_shapes"
+    description: <<END
+List of tuples with the same length as `dense_keys`.
+The shape of the data for each dense feature referenced by `dense_keys`.
+Required for any input tensors identified by `dense_keys`.  Must be
+either fully defined, or may contain an unknown first dimension.
+An unknown first dimension means the feature is treated as having
+a variable number of blocks, and the output shape along this dimension
+is considered unknown at graph build time.  Padding is applied for
+minibatch elements smaller than the maximum number of blocks for the
+given feature along this dimension.
+END
+  }
+    attr {
+    name: "output_types"
+    description: <<END
+The type list for the return values.
+END
+  }
+    attr {
+    name: "output_shapes"
+    description: <<END
+The list of shapes being produced.
+END
+  }
+   summary: "Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features."
+}
+
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt
index 1a75e67c0c..e400c7402b 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt
@@ -70,5 +70,7 @@ The resulting update to ref would look like this:
 
 See `tf.scatter_nd` for more details about how to make updates to
 slices.
+
+See also `tf.scatter_update` and `tf.batch_scatter_update`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
index 4804908afc..4037dee432 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
@@ -59,5 +59,7 @@ Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/ScatterUpdate.png" alt>
 </div>
+
+See also `tf.batch_scatter_update` and `tf.scatter_nd_update`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
index 5e2912fcdd..35f55fe106 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
index a7d85b3f4e..70a07d9b4c 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the mean along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
index 74fc598218..b2e3eece38 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
index 4c4363e524..7bac02e23d 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the product along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \prod_j data_j\\) where the product is over `j` such
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
index 583ab3904f..a73306a892 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the sum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output_i = \sum_j data_j\\) where sum is over `j` such
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
index 866e04e97b..138a6366c8 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
@@ -21,8 +21,9 @@ END
   }
   summary: "Computes the mean along sparse segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
index af4bc75fa0..b8073d88ac 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -30,7 +30,8 @@ END
 Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
index 194bcea726..945bbdcf62 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -23,7 +23,8 @@ END
   description: <<END
 N is the size of the segment being reduced.
 
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
index 8b502928a5..ff328c8a61 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -32,7 +32,8 @@ N is the size of the segment being reduced.
 Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
index dfd50bf273..a68e14607f 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
@@ -21,8 +21,9 @@ END
   }
   summary: "Computes the sum along sparse segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
index 3bc16577ff..aa5c1fc8d0 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -30,8 +30,9 @@ END
 Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 For example:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt b/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt
new file mode 100644
index 0000000000..e382bcec81
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "StaticRegexReplace"
+  in_arg {
+    name: "input"
+    description: "The text to be processed."
+  }
+  out_arg {
+    name: "output"
+    description: "The text after applying pattern and rewrite."
+  }
+  attr {
+    name: "pattern"
+    description: "The regular expression to match the input."
+  }
+  attr {
+    name: "rewrite"
+    description: "The rewrite to be applied to the matched expresion."
+  }
+  attr {
+    name: "replace_global"
+    description: "If True, the replacement is global, otherwise the replacement\nis done only on the first match."
+  }
+  summary: "Replaces the match of pattern in input with rewrite."
+  description: "It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringLength.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringLength.pbtxt
new file mode 100644
index 0000000000..cc21ddc815
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringLength.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "StringLength"
+  in_arg {
+    name: "input"
+    description: <<END
+The string for which to compute the length.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Integer tensor that has the same shape as `input`. The output contains the
+element-wise string lengths of `input`.
+END
+  }
+  summary: "String lengths of `input`."
+  description: <<END
+Computes the length of each string given in the input tensor.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsafeDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsafeDiv.pbtxt
deleted file mode 100644
index 82c913d15e..0000000000
--- a/tensorflow/core/api_def/base_api/api_def_UnsafeDiv.pbtxt
+++ /dev/null
@@ -1,5 +0,0 @@
-op {
-  graph_op_name: "UnsafeDiv"
-  summary: "Returns 0 if the denominator is zero."
-  description: "" 
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 4ca6780c95..907c6d2022 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
 [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
index 55ea69b5dd..37dd973b23 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
 [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
index 577ff53d60..efbc023705 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the product along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
 [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
index 9aeabd030d..a8874950eb 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -16,8 +16,9 @@ END
   }
   summary: "Computes the sum along segments of a tensor."
   description: <<END
-Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-segments.
+Read
+[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+for an explanation of segments.
 
 Computes a tensor such that
 \\(output[i] = sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
diff --git a/tensorflow/core/api_def/python_api/api_def_DivNoNan.pbtxt b/tensorflow/core/api_def/python_api/api_def_DivNoNan.pbtxt
new file mode 100644
index 0000000000..1bf3fba3c6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DivNoNan.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DivNoNan"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt
new file mode 100644
index 0000000000..45826b6fdc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParseExampleDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterNdSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterNdSub.pbtxt
new file mode 100644
index 0000000000..c1edef8c9d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterNdSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterNdSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringLength.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringLength.pbtxt
new file mode 100644
index 0000000000..01c02e1f70
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringLength.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringLength"
+  endpoint {
+    name: "strings.length"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UnsafeDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnsafeDiv.pbtxt
deleted file mode 100644
index 56caabcf3c..0000000000
--- a/tensorflow/core/api_def/python_api/api_def_UnsafeDiv.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "UnsafeDiv"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 580e61e2ea..20e1dab1d5 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
-#define TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
 
 #include <array>
 #include <memory>
@@ -451,4 +451,4 @@ class BFCAllocator : public VisitableAllocator {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.h b/tensorflow/core/common_runtime/buf_rendezvous.h
index 9eb9f060f6..065bbd008b 100644
--- a/tensorflow/core/common_runtime/buf_rendezvous.h
+++ b/tensorflow/core/common_runtime/buf_rendezvous.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
-#define TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
 
 #include <functional>
 #include <string>
@@ -100,4 +100,4 @@ class BufRendezvous {
   void PurgeTable(const Status& s, HookTable* table);
 };
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.h b/tensorflow/core/common_runtime/collective_executor_mgr.h
index 9de6ab8968..d53aca85b9 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr.h
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
-#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
 
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -72,4 +72,4 @@ class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index 2e2aa801d9..9372fd6272 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
-#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
 
 #include <string>
 
@@ -237,4 +237,4 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/collective_rma_local.h b/tensorflow/core/common_runtime/collective_rma_local.h
index 44408438b9..2188087957 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.h
+++ b/tensorflow/core/common_runtime/collective_rma_local.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
-#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_H_
 #include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/framework/collective.h"
@@ -89,4 +89,4 @@ class CollectiveRemoteAccessLocal : public PerStepCollectiveRemoteAccess {
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index b5a51d2526..97b6971c5b 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -37,6 +37,8 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/denormal.h"
+#include "tensorflow/core/platform/setround.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -553,6 +555,11 @@ bool ReplaceTensorWithConstant(
 Status ConstantFold(const ConstantFoldingOptions& opts,
                     FunctionLibraryRuntime* function_library, Env* env,
                     Device* partition_device, Graph* graph, bool* was_mutated) {
+  // TensorFlow flushes denormals to zero and rounds to nearest, so we do
+  // the same here.
+  port::ScopedFlushDenormal flush;
+  port::ScopedSetRound round(FE_TONEAREST);
+
   DumpGraph("Before", graph);
   ConstantFoldNameGenerator generate_new_name = opts.generate_new_name;
   if (generate_new_name == nullptr) {
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index 84598880bb..a9a84f761b 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_CONSTANT_FOLDING_H_
-#define TENSORFLOW_COMMON_RUNTIME_CONSTANT_FOLDING_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_CONSTANT_FOLDING_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_CONSTANT_FOLDING_H_
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/function.h"
@@ -66,4 +66,4 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_CONSTANT_FOLDING_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_CONSTANT_FOLDING_H_
diff --git a/tensorflow/core/common_runtime/debugger_state_interface.h b/tensorflow/core/common_runtime/debugger_state_interface.h
index e0fa983373..797a0ade53 100644
--- a/tensorflow/core/common_runtime/debugger_state_interface.h
+++ b/tensorflow/core/common_runtime/debugger_state_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_
-#define TENSORFLOW_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_
 
 #include <memory>
 
@@ -117,4 +117,4 @@ class DebugGraphDecoratorRegistry {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index b537666492..81d68e3be4 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -26,8 +26,8 @@ limitations under the License.
 // * Task numbers are within the specified replica, so there are as
 //   many "task zeros" as replicas.
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_H_
-#define TENSORFLOW_COMMON_RUNTIME_DEVICE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_H_
 
 #include <memory>
 #include <string>
@@ -183,4 +183,4 @@ class Device : public DeviceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_DEVICE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/device_factory.h b/tensorflow/core/common_runtime/device_factory.h
index 10eb62afa8..db50226fe8 100644
--- a/tensorflow/core/common_runtime/device_factory.h
+++ b/tensorflow/core/common_runtime/device_factory.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_FACTORY_H_
-#define TENSORFLOW_COMMON_RUNTIME_DEVICE_FACTORY_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_FACTORY_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_FACTORY_H_
 
 #include <string>
 #include <vector>
@@ -126,4 +126,4 @@ class Registrar {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_DEVICE_FACTORY_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_FACTORY_H_
diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h
index cd93f76324..c1ff10d9b5 100644
--- a/tensorflow/core/common_runtime/device_mgr.h
+++ b/tensorflow/core/common_runtime/device_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_MGR_H_
-#define TENSORFLOW_COMMON_RUNTIME_DEVICE_MGR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_
 
 #include <string>
 #include <unordered_map>
@@ -77,4 +77,4 @@ class DeviceMgr {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_DEVICE_MGR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_
diff --git a/tensorflow/core/common_runtime/device_resolver_local.h b/tensorflow/core/common_runtime/device_resolver_local.h
index 098eccdf84..bb6ff2efa0 100644
--- a/tensorflow/core/common_runtime/device_resolver_local.h
+++ b/tensorflow/core/common_runtime/device_resolver_local.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
-#define TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
 
 #include <string>
 
@@ -45,4 +45,4 @@ class DeviceResolverLocal : public DeviceResolverInterface {
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/device_set.h b/tensorflow/core/common_runtime/device_set.h
index 4cd56e583c..c384d46e97 100644
--- a/tensorflow/core/common_runtime/device_set.h
+++ b/tensorflow/core/common_runtime/device_set.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_SET_H_
-#define TENSORFLOW_COMMON_RUNTIME_DEVICE_SET_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_SET_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_SET_H_
 
 #include <memory>
 #include <unordered_map>
@@ -86,4 +86,4 @@ class DeviceSet {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_DEVICE_SET_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_SET_H_
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 0695278c0d..bf1d78ec65 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -602,7 +602,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
 
   if (tracer) {
     TF_RETURN_IF_ERROR(tracer->Stop());
-    TF_RETURN_IF_ERROR(tracer->Collect(args.stats_collector));
+    TF_RETURN_IF_ERROR(tracer->Collect(run_state.collector.get()));
   }
 
   {
@@ -618,8 +618,8 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
         &session_state_));
   }
 
-  if (args.stats_collector) {
-    args.stats_collector->Finalize();
+  if (run_state.collector) {
+    run_state.collector->Finalize();
   }
 
   // Build and return the cost model as instructed.
@@ -634,7 +634,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
     }
 
     mutex_lock l(executor_lock_);
-    args.stats_collector->BuildCostModel(&cost_model_manager_, device_to_graph);
+    run_state.collector->BuildCostModel(&cost_model_manager_, device_to_graph);
 
     // annotate stats onto cost graph.
     CostGraphDef* cost_graph = run_metadata->mutable_cost_graph();
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 72a2be4816..55a6fbce6d 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_DIRECT_SESSION_H_
-#define TENSORFLOW_COMMON_RUNTIME_DIRECT_SESSION_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DIRECT_SESSION_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DIRECT_SESSION_H_
 
 #include <atomic>
 #include <memory>
@@ -399,4 +399,4 @@ class DirectSession : public Session {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_DIRECT_SESSION_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DIRECT_SESSION_H_
diff --git a/tensorflow/core/common_runtime/dma_helper.h b/tensorflow/core/common_runtime/dma_helper.h
index cdfce1f366..4a76cff1e3 100644
--- a/tensorflow/core/common_runtime/dma_helper.h
+++ b/tensorflow/core/common_runtime/dma_helper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
-#define TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DMA_HELPER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DMA_HELPER_H_
 
 #include "tensorflow/core/framework/tensor.h"
 
@@ -35,4 +35,4 @@ class DMAHelper {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DMA_HELPER_H_
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index 92307d78f2..cf1cd4134e 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -103,7 +103,6 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
     return *this;                                                            \
   }
 
-DEFINE_SET_ATTR(StringPiece, string_attrs_);
 DEFINE_SET_ATTR(float, float_attrs_);
 DEFINE_SET_ATTR(int, int_attrs_);
 DEFINE_SET_ATTR(bool, bool_attrs_);
@@ -119,9 +118,6 @@ AttrBuilder& AttrBuilder::NumInputs(int n) {
 
 void AttrBuilder::FillAttrValueMap(AttrValueMap* m,
                                    bool include_those_in_node_def) const {
-  for (const auto& p : string_attrs_) {
-    SetInAttrValueMap(m, p.first, p.second);
-  }
   for (const auto& p : int_attrs_) {
     SetInAttrValueMap(m, p.first, p.second);
   }
@@ -211,10 +207,6 @@ tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) const {
     // not been called.
     if (node_def_finalized_) return f;
   }
-  for (const auto& p : string_attrs_) {
-    CombineUnordered(
-        CacheKeyHelper(p.first, tensorflow::Fingerprint128(p.second)), &f);
-  }
   for (const auto& p : int_attrs_) {
     CombineUnordered(CacheKeyHelper(p.first, static_cast<uint64>(p.second)),
                      &f);
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index 929b1b8296..ccc95a35e5 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_C_EAGER_RUNTIME_H_
-#define TENSORFLOW_C_EAGER_RUNTIME_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_ATTR_BUILDER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_ATTR_BUILDER_H_
 
 // Support for eager execution of TensorFlow kernels.
 
@@ -131,7 +131,6 @@ class AttrBuilder {
     }
   }
 
-  AttrVec<StringPiece> string_attrs_;
   AttrVec<int> int_attrs_;
   AttrVec<float> float_attrs_;
   AttrVec<bool> bool_attrs_;
@@ -143,8 +142,6 @@ class AttrBuilder {
 };  // namespace tensorflow
 
 template <>
-AttrBuilder& AttrBuilder::Set(StringPiece attr_name, StringPiece&& value);
-template <>
 AttrBuilder& AttrBuilder::Set(StringPiece attr_name, int&& value);
 template <>
 AttrBuilder& AttrBuilder::Set(StringPiece attr_name, float&& value);
@@ -157,4 +154,4 @@ AttrBuilder& AttrBuilder::Set(StringPiece attr_name,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_C_EAGER_RUNTIME_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_ATTR_BUILDER_H_
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 5bdd547c7f..b859b06fa0 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/eager/context.h"
 
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
@@ -78,6 +79,12 @@ void EagerContext::InitDeviceMapAndAsync() {
       }
     }
   }
+
+  DeviceSet ds;
+  for (Device* d : devices_) {
+    ds.AddDevice(d);
+  }
+  prioritized_device_type_list_ = ds.PrioritizedDeviceTypeList();
 }
 
 bool EagerContext::Async() const {
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 21c5bdf8e9..3c95ac590d 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -93,6 +93,9 @@ class EagerContext {
 
   // TODO(apassos) make this return a constant reference
   std::vector<Device*>* devices() { return &devices_; }
+  const std::vector<DeviceType>& prioritized_device_type_list() {
+    return prioritized_device_type_list_;
+  }
 
   // Clears the kernel caches.
   void ClearCaches();
@@ -206,9 +209,11 @@ class EagerContext {
   // Only one of the below is set.
   std::unique_ptr<DeviceMgr> local_device_manager_;
   DeviceMgr* local_unowned_device_manager_;
+  std::unique_ptr<DeviceMgr> remote_device_manager_;
 
   // Devices owned by device_manager
   std::vector<Device*> devices_;
+  std::vector<DeviceType> prioritized_device_type_list_;
   // All devices are not owned.
   gtl::FlatMap<string, Device*, StringPieceHasher> devices_map_;
   Rendezvous* rendezvous_;
@@ -253,7 +258,6 @@ class EagerContext {
 
 #ifndef __ANDROID__
   void CloseRemoteContexts();
-  std::unique_ptr<DeviceMgr> remote_device_manager_;
 
   // The server_ is not const since we release it when the context is destroyed.
   // Therefore the server_ object is not marked as const (even though it should
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 46065f399c..5b3a64ba98 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -192,17 +192,14 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
 }
 
 Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
-  DeviceSet ds;
-  for (Device* d : *ctx->devices()) {
-    ds.AddDevice(d);
-  }
   DeviceTypeVector final_devices;
-  auto status = SupportedDeviceTypesForNode(ds.PrioritizedDeviceTypeList(),
-                                            ndef, &final_devices);
-  if (!status.ok()) return status;
+  TF_RETURN_IF_ERROR(SupportedDeviceTypesForNode(
+      ctx->prioritized_device_type_list(), ndef, &final_devices));
   if (final_devices.empty()) {
-    return errors::Internal("Could not find valid device for node ",
-                            ndef.DebugString());
+    return errors::Internal(
+        "Could not find valid device for node.\nNode: ", SummarizeNodeDef(ndef),
+        "\nAll kernels registered for op ", ndef.op(), " :\n",
+        KernelsRegisteredForOp(ndef.op()));
   }
   for (Device* d : *ctx->devices()) {
     if (d->device_type() == final_devices[0].type_string()) {
@@ -211,7 +208,7 @@ Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
     }
   }
   return errors::Unknown("Could not find a device for node ",
-                         ndef.DebugString());
+                         SummarizeNodeDef(ndef));
 }
 
 Status GetOutputDTypes(EagerOperation* op, DataTypeVector* output_dtypes) {
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index c2fac4c2c8..02193dae5a 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -72,141 +72,58 @@ bool IsInitializationOp(const Node* node) {
   return node->op_def().allows_uninitialized_input();
 }
 
-// Sets the timeline_label field of *node_stats, using data from *node.
-// Returns true iff the node is a transfer node.
-// TODO(tucker): merge with the DetailText function in session.cc
-// in a common location.
-bool SetTimelineLabel(const Node* node, NodeExecStatsWrapper* stats) {
-  bool is_transfer_node = false;
-  if (!stats) {
-    return is_transfer_node;
-  }
-  string memory;
-  for (auto& all : stats->stats()->memory()) {
-    int64 tot = all.total_bytes();
-    if (tot >= 0.1 * 1048576.0) {
-      int64 peak = all.peak_bytes();
-      if (peak > 0) {
-        memory =
-            strings::StrCat(memory, "[", all.allocator_name(),
-                            strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0,
-                                            peak / 1048576.0));
-      } else {
-        memory = strings::StrCat(memory, "[", all.allocator_name(),
-                                 strings::Printf(" %.1fMB] ", tot / 1048576.0));
-      }
-    }
-  }
-  const AttrSlice attrs = node->attrs();
-  string text;
-  if (IsSend(node)) {
-    string tensor_name;
-    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
-    string recv_device;
-    TF_CHECK_OK(GetNodeAttr(attrs, "recv_device", &recv_device));
-    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
-                           "(", tensor_name, " @", recv_device);
-    is_transfer_node = true;
-  } else if (IsRecv(node)) {
-    string tensor_name;
-    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
-    string send_device;
-    TF_CHECK_OK(GetNodeAttr(attrs, "send_device", &send_device));
-    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
-                           "(", tensor_name, " @", send_device);
-    is_transfer_node = true;
-  } else {
-    text =
-        strings::StrCat(memory, node->name(), " = ", node->type_string(), "(",
-                        str_util::Join(node->requested_inputs(), ", "), ")");
-  }
-  stats->stats()->set_timeline_label(text);
-  return is_transfer_node;
-}
-
 // Helper routines for collecting step stats.
 namespace nodestats {
-inline int64 NowInUsec() { return Env::Default()->NowMicros(); }
 inline int64 NowInNsec() { return Env::Default()->NowNanos(); }
 
-void SetScheduled(NodeExecStatsWrapper* stats, int64 nanos) {
+void SetScheduled(NodeExecStatsWrapper* stats, int64 micros) {
   if (!stats) return;
-  stats->stats()->set_scheduled_micros(nanos / EnvTime::kMicrosToNanos);
-  stats->stats()->set_scheduled_nanos(nanos);
+  stats->SetScheduled(micros * EnvTime::kMicrosToNanos);
 }
 
 void SetAllStart(NodeExecStatsWrapper* stats) {
   if (!stats) return;
-  int64 now_nanos = NowInNsec();
-  stats->stats()->set_all_start_micros(now_nanos / EnvTime::kMicrosToNanos);
-  stats->stats()->set_all_start_nanos(now_nanos);
+  stats->RecordExecutorStarted();
 }
 
 void SetOpStart(NodeExecStatsWrapper* stats) {
   if (!stats) return;
-  NodeExecStats* nt = stats->stats();
-  DCHECK_NE(nt->all_start_micros(), 0);
-  DCHECK_NE(nt->all_start_nanos(), 0);
-  int64 now_nanos = NowInNsec();
-  nt->set_op_start_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
-                              nt->all_start_micros());
-  nt->set_op_start_rel_nanos(now_nanos - nt->all_start_nanos());
+  stats->RecordComputeStarted();
 }
 
 void SetOpEnd(NodeExecStatsWrapper* stats) {
   if (!stats) return;
-  NodeExecStats* nt = stats->stats();
-  DCHECK_NE(nt->all_start_micros(), 0);
-  DCHECK_NE(nt->all_start_nanos(), 0);
-  int64 now_nanos = NowInNsec();
-  nt->set_op_end_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
-                            nt->all_start_micros());
-  nt->set_op_end_rel_nanos(now_nanos - nt->all_start_nanos());
+  stats->RecordComputeEnded();
 }
 
 void SetAllEnd(NodeExecStatsWrapper* stats) {
   if (!stats) return;
-  NodeExecStats* nt = stats->stats();
-  DCHECK_NE(nt->all_start_micros(), 0);
-  DCHECK_NE(nt->all_start_nanos(), 0);
-  int64 now_nanos = NowInNsec();
-  nt->set_all_end_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
-                             nt->all_start_micros());
-  nt->set_all_end_rel_nanos(now_nanos - nt->all_start_nanos());
+  stats->RecordExecutorEnded();
 }
 
 void SetOutput(NodeExecStatsWrapper* stats, int slot, const Tensor* v) {
   if (!stats) return;
-  DCHECK(v);
-  NodeOutput* no = stats->stats()->add_output();
-  no->set_slot(slot);
-  v->FillDescription(no->mutable_tensor_description());
+  stats->SetOutput(slot, v);
 }
 
 void SetMemory(NodeExecStatsWrapper* stats, OpKernelContext* ctx) {
   if (!stats) return;
-
-  for (const auto& allocator_pair : ctx->wrapped_allocators()) {
-    stats->AddAllocation(allocator_pair.first, allocator_pair.second);
-  }
-  auto* ms = stats->stats()->mutable_memory_stats();
-  ms->set_temp_memory_size(ctx->temp_memory_allocated());
-  for (const auto& alloc_id : ctx->persistent_alloc_ids()) {
-    ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
-  }
-  ms->set_persistent_memory_size(ctx->persistent_memory_allocated());
+  stats->SetMemory(ctx);
 }
 
 void SetReferencedTensors(NodeExecStatsWrapper* stats,
                           const TensorReferenceVector& tensors) {
   if (!stats) return;
-  // be careful not to increment the reference count on any tensor
-  // while recording the information
-  for (size_t i = 0; i < tensors.size(); ++i) {
-    AllocationDescription* description =
-        stats->stats()->add_referenced_tensor();
-    tensors.at(i).FillDescription(description);
+  stats->SetReferencedTensors(tensors);
+}
+
+// Sets the timeline_label field of *stats, using data from *node.
+// Returns true iff the node is a transfer node.
+bool SetTimelineLabel(const Node* node, NodeExecStatsWrapper* stats) {
+  if (!stats) {
+    return false;
   }
+  return stats->SetTimelineLabel(node);
 }
 
 }  // namespace nodestats
@@ -1319,7 +1236,7 @@ class ExecutorState {
   TensorStore* tensor_store_;
   // Step-local container.
   ScopedStepContainer* step_container_;
-  StepStatsCollector* stats_collector_;
+  StepStatsCollectorInterface* const stats_collector_;
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer?  (avoids having to delete).
   checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_;
@@ -1694,15 +1611,14 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
     if (stats_collector_ && !tagged_node.is_dead) {
       // track allocations if and only if we are collecting statistics
       params.track_allocations = true;
-      stats = new NodeExecStatsWrapper;
-      stats->stats()->set_node_name(node->name());
+      stats = new NodeExecStatsWrapper(node->name());
       nodestats::SetScheduled(stats, scheduled_nsec);
       nodestats::SetAllStart(stats);
     }
 
     if (vlog_) {
       VLOG(1) << "Process node: " << id << " step " << params.step_id << " "
-              << SummarizeNode(*node) << " is dead: " << tagged_node.is_dead
+              << SummarizeNode(*node) << (tagged_node.is_dead ? " is dead" : "")
               << " device: " << device->name();
     }
 
@@ -1764,7 +1680,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
             VLOG(2) << "Async kernel done: " << state->item->node->id()
                     << " step " << step_id_ << " "
                     << SummarizeNode(*state->item->node)
-                    << " is dead: " << state->tagged_node.is_dead
+                    << (state->tagged_node.is_dead ? " is dead" : "")
                     << " device: " << device->name();
           }
 
@@ -1818,7 +1734,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
       if (vlog_) {
         VLOG(2) << "Synchronous kernel done: " << id << " step "
                 << params.step_id << " " << SummarizeNode(*node)
-                << " is dead: " << tagged_node.is_dead
+                << (tagged_node.is_dead ? " is dead: " : "")
                 << " device: " << device->name();
       }
 
@@ -2165,7 +2081,8 @@ bool ExecutorState::NodeDone(const Status& s, const Node* node,
                              NodeExecStatsWrapper* stats,
                              TaggedNodeReadyQueue* inline_ready) {
   nodestats::SetAllEnd(stats);
-  if (stats_collector_ != nullptr && !SetTimelineLabel(node, stats)) {
+  if (stats_collector_ != nullptr &&
+      !nodestats::SetTimelineLabel(node, stats)) {
     // Only record non-transfer nodes.
     // Transfers 'stats' ownership to 'stats_collector_'.
     stats_collector_->Save(impl_->params_.device->name(), stats);
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index cd01b43aea..6cd4fd22ea 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_
-#define TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_H_
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -83,7 +83,7 @@ class Executor {
   struct Args {
     int64 step_id = 0;
     Rendezvous* rendezvous = nullptr;
-    StepStatsCollector* stats_collector = nullptr;
+    StepStatsCollectorInterface* stats_collector = nullptr;
     CallFrameInterface* call_frame = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     SessionState* session_state = nullptr;
@@ -235,4 +235,4 @@ void DeleteNonCachedKernel(OpKernel* kernel);
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_EXECUTOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_H_
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 54bbe84b57..fb89bcc0df 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -555,6 +555,12 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
       next_handle_++;
     }
   }
+
+  if (options.create_kernels_eagerly) {
+    Item* item;
+    TF_RETURN_IF_ERROR(GetOrCreateItem(*handle, &item));
+  }
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index a274f1ef51..eeca66f5d0 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_FUNCTION_H_
-#define TENSORFLOW_COMMON_RUNTIME_FUNCTION_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_H_
 
 #include <functional>
 #include <memory>
@@ -170,4 +170,4 @@ Status FunctionDefToBodyHelper(
     FunctionBody** fbody);
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_FUNCTION_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index a3e0d0734f..f1cc2eace1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
 
 #include <memory>
 #include <string>
@@ -89,4 +89,4 @@ class GPUMemAllocator : public SubAllocator {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_BFC_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 5043fac797..856fdc34b4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_CUDA_MALLOC_ALLOCATOR_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_CUDA_MALLOC_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_CUDAMALLOC_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_CUDAMALLOC_ALLOCATOR_H_
 
 #include <memory>
 
@@ -51,4 +51,4 @@ class GPUcudaMallocAllocator : public VisitableAllocator {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_CUDAMALLOC_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_CUDAMALLOC_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index c49ec2a566..0f9b72040c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
 
 #include <memory>
 #include <string>
@@ -88,4 +88,4 @@ class GPUNanResetAllocator : public VisitableAllocator {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_DEBUG_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index f0a109cc10..2d406b676e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
 
 #include <deque>
 #include <vector>
@@ -203,4 +203,4 @@ class EventMgr {
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_EVENT_MGR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_init.h b/tensorflow/core/common_runtime/gpu/gpu_init.h
index bfd7a77f83..4e1f06ac83 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_init.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_init.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_
 
 #include "tensorflow/core/lib/core/status.h"
 
@@ -36,4 +36,4 @@ stream_executor::Platform* GPUMachineManager();
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_INIT_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_INIT_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util.h b/tensorflow/core/common_runtime/gpu/gpu_stream_util.h
index 771c158267..c61ada96ef 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_stream_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
 
 #include <unordered_map>
 
@@ -42,4 +42,4 @@ Status AssignStreams(const Graph* graph, const AssignStreamsOpts& opts,
 }  // namespace gpu_stream_util
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index 57687a8364..8ac3febb01 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_UTIL_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_UTIL_H_
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
@@ -108,4 +108,4 @@ class GPUUtil {
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_UTIL_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
index ea1b04feeb..4bc88ffc8c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -36,4 +37,12 @@ void GPUDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   GPUUtil::CopyGPUTensorToCPU(device, this, device_tensor, cpu_tensor, done);
 }
 
+Status GPUDeviceContext::ThenExecute(Device* device, se::Stream* stream,
+                                     std::function<void()> func) {
+  const DeviceBase::GpuDeviceInfo* gpu_info =
+      device->tensorflow_gpu_device_info();
+  gpu_info->event_mgr->ThenExecute(stream, func);
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index d697d878dc..3603808152 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -60,6 +60,9 @@ class GPUDeviceContext : public DeviceContext {
   void MaintainLifetimeOnStream(const Tensor* t,
                                 se::Stream* stream) const override {}
 
+  Status ThenExecute(Device* device, se::Stream* stream,
+                     std::function<void()> func) override;
+
  private:
   int stream_id_;
   // The default primary stream to use for this context.
@@ -75,4 +78,4 @@ class GPUDeviceContext : public DeviceContext {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_DEVICE_CONTEXT_H_
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
index 995a15a299..555b43f655 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_
-#define TENSORFLOW_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_
 
 #include <string>
 #include <vector>
@@ -65,4 +65,4 @@ class Benchmark {
 }  // end namespace test
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_
diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h
index 84a4f66db4..226f121bf3 100644
--- a/tensorflow/core/common_runtime/local_device.h
+++ b/tensorflow/core/common_runtime/local_device.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_LOCAL_DEVICE_H_
-#define TENSORFLOW_COMMON_RUNTIME_LOCAL_DEVICE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -54,4 +54,4 @@ class LocalDevice : public Device {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_LOCAL_DEVICE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_LOCAL_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/optimization_registry.h b/tensorflow/core/common_runtime/optimization_registry.h
index f5d265aa24..6fcd2afd27 100644
--- a/tensorflow/core/common_runtime/optimization_registry.h
+++ b/tensorflow/core/common_runtime/optimization_registry.h
@@ -132,11 +132,12 @@ class OptimizationPassRegistration {
 #define REGISTER_OPTIMIZATION_UNIQ_HELPER(ctr, grouping, phase, optimization) \
   REGISTER_OPTIMIZATION_UNIQ(ctr, grouping, phase, optimization)
 
-#define REGISTER_OPTIMIZATION_UNIQ(ctr, grouping, phase, optimization) \
-  static optimization_registration::OptimizationPassRegistration       \
-      register_optimization_##ctr(                                     \
-          grouping, phase,                                             \
-          std::unique_ptr<GraphOptimizationPass>(new optimization()),  \
+#define REGISTER_OPTIMIZATION_UNIQ(ctr, grouping, phase, optimization)         \
+  static ::tensorflow::optimization_registration::OptimizationPassRegistration \
+      register_optimization_##ctr(                                             \
+          grouping, phase,                                                     \
+          ::std::unique_ptr<::tensorflow::GraphOptimizationPass>(              \
+              new optimization()),                                             \
           #optimization)
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index fce87269c5..cefcdd25db 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_PLACER_H_
-#define TENSORFLOW_COMMON_RUNTIME_PLACER_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_H_
 
 #include <string>
 #include <unordered_map>
@@ -100,4 +100,4 @@ class Placer {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_PLACER_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PLACER_H_
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.h b/tensorflow/core/common_runtime/rendezvous_mgr.h
index cb5848ede3..b4d8ab4eb2 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.h
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_RENDEZVOUS_MGR_H_
-#define TENSORFLOW_COMMON_RUNTIME_RENDEZVOUS_MGR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_MGR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_MGR_H_
 
 #include <string>
 #include <unordered_map>
@@ -87,4 +87,4 @@ class IntraProcessRendezvous : public Rendezvous {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_RENDEZVOUS_MGR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_MGR_H_
diff --git a/tensorflow/core/common_runtime/session_factory.h b/tensorflow/core/common_runtime/session_factory.h
index 81c172c6ae..8565088afc 100644
--- a/tensorflow/core/common_runtime/session_factory.h
+++ b/tensorflow/core/common_runtime/session_factory.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_SESSION_FACTORY_H_
-#define TENSORFLOW_COMMON_RUNTIME_SESSION_FACTORY_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_FACTORY_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_FACTORY_H_
 
 #include <string>
 
@@ -73,4 +73,4 @@ class SessionFactory {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_SESSION_FACTORY_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_FACTORY_H_
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index af6880c6b3..9c2510e6a9 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -16,12 +16,16 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/common_runtime/costmodel_manager.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
 #include "tensorflow/core/graph/costmodel.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
@@ -36,11 +40,89 @@ struct AllocStats {
 };
 }  // namespace
 
-NodeExecStatsWrapper::NodeExecStatsWrapper()
-    : NodeExecStatsWrapper(new NodeExecStats) {}
+NodeExecStatsWrapper::NodeExecStatsWrapper(const string& node_name)
+    : NodeExecStatsWrapper(new NodeExecStats) {
+  stats_->set_node_name(node_name);
+}
 NodeExecStatsWrapper::NodeExecStatsWrapper(NodeExecStats* stats)
     : stats_(stats) {}
 
+void NodeExecStatsWrapper::SetOutput(int slot, const Tensor* v) {
+  DCHECK(v);
+  NodeOutput* no = stats_->add_output();
+  no->set_slot(slot);
+  v->FillDescription(no->mutable_tensor_description());
+}
+
+void NodeExecStatsWrapper::SetMemory(OpKernelContext* ctx) {
+  for (const auto& allocator_pair : ctx->wrapped_allocators()) {
+    AddAllocation(allocator_pair.first, allocator_pair.second);
+  }
+  auto* ms = stats_->mutable_memory_stats();
+  ms->set_temp_memory_size(ctx->temp_memory_allocated());
+  for (const auto& alloc_id : ctx->persistent_alloc_ids()) {
+    ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
+  }
+  ms->set_persistent_memory_size(ctx->persistent_memory_allocated());
+}
+
+void NodeExecStatsWrapper::SetReferencedTensors(
+    const TensorReferenceVector& tensors) {
+  // be careful not to increment the reference count on any tensor
+  // while recording the information
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    AllocationDescription* description = stats_->add_referenced_tensor();
+    tensors.at(i).FillDescription(description);
+  }
+}
+
+// TODO(tucker): merge with the DetailText function in session.cc
+// in a common location.
+bool NodeExecStatsWrapper::SetTimelineLabel(const Node* node) {
+  bool is_transfer_node = false;
+  string memory;
+  for (auto& all : stats_->memory()) {
+    int64 tot = all.total_bytes();
+    if (tot >= 0.1 * 1048576.0) {
+      int64 peak = all.peak_bytes();
+      if (peak > 0) {
+        memory =
+            strings::StrCat(memory, "[", all.allocator_name(),
+                            strings::Printf(" %.1fMB %.1fMB] ", tot / 1048576.0,
+                                            peak / 1048576.0));
+      } else {
+        memory = strings::StrCat(memory, "[", all.allocator_name(),
+                                 strings::Printf(" %.1fMB] ", tot / 1048576.0));
+      }
+    }
+  }
+  const AttrSlice attrs = node->attrs();
+  string text;
+  if (IsSend(node)) {
+    string tensor_name;
+    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
+    string recv_device;
+    TF_CHECK_OK(GetNodeAttr(attrs, "recv_device", &recv_device));
+    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
+                           "(", tensor_name, " @", recv_device);
+    is_transfer_node = true;
+  } else if (IsRecv(node)) {
+    string tensor_name;
+    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
+    string send_device;
+    TF_CHECK_OK(GetNodeAttr(attrs, "send_device", &send_device));
+    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
+                           "(", tensor_name, " @", send_device);
+    is_transfer_node = true;
+  } else {
+    text =
+        strings::StrCat(memory, node->name(), " = ", node->type_string(), "(",
+                        str_util::Join(node->requested_inputs(), ", "), ")");
+  }
+  stats_->set_timeline_label(text);
+  return is_transfer_node;
+}
+
 void NodeExecStatsWrapper::AddAllocation(
     Allocator* allocator, TrackingAllocator* tracking_allocator) {
   AllocatorMemoryUsed* memory = stats_->add_memory();
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index 996dbb59bc..7206fbf427 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_STEP_STATS_COLLECTOR_H_
-#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_STEP_STATS_COLLECTOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_
 
 #include <memory>
 #include <unordered_map>
 #include <vector>
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor_reference.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
@@ -30,42 +32,127 @@ class Allocator;
 class AllocatorMemoryUsed;
 class CostModelManager;
 class Graph;
+class Node;
 class NodeExecStats;
+class OpKernelContext;
 class StepStats;
+class Tensor;
 class TrackingAllocator;
 
 // Wraps NodeExecStats and adds allocation to it.
 class NodeExecStatsWrapper {
  public:
-  NodeExecStatsWrapper();
+  NodeExecStatsWrapper(const string& node_name);
   // Owns 'stats'.
   NodeExecStatsWrapper(NodeExecStats* stats);
 
   // Destructor calls Finalize() to release the TrackingAllocators.
   ~NodeExecStatsWrapper() { Finalize(); }
 
-  NodeExecStats* stats() { return stats_.get(); }
-
-  // "Does not take ownership of the 'allocator'.
-  // Transfers ownership of the 'tracking_allocator' to *this."
-  void AddAllocation(Allocator* allocator,
-                     TrackingAllocator* tracking_allocator);
+  // Records the absolute time in nanoseconds at which this node became
+  // runnable (i.e. was scheduled for execution).
+  void SetScheduled(int64 nanos) {
+    stats_->set_scheduled_micros(nanos / EnvTime::kMicrosToNanos);
+    stats_->set_scheduled_nanos(nanos);
+  }
+
+  // Called immediately after this node starts being processed by the executor.
+  void RecordExecutorStarted() {
+    int64 now_nanos = Env::Default()->NowNanos();
+    stats_->set_all_start_micros(now_nanos / EnvTime::kMicrosToNanos);
+    stats_->set_all_start_nanos(now_nanos);
+  }
+
+  // Called immediately before this node's `Compute()` or `ComputeAsync()`
+  // method is called.
+  void RecordComputeStarted() {
+    int64 now_nanos = Env::Default()->NowNanos();
+    DCHECK_NE(stats_->all_start_micros(), 0);
+    DCHECK_NE(stats_->all_start_nanos(), 0);
+    stats_->set_op_start_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
+                                    stats_->all_start_micros());
+    stats_->set_op_start_rel_nanos(now_nanos - stats_->all_start_nanos());
+  }
+
+  // Called immediately after this node's `Compute()` method returned (or, for
+  // asynchronous operations, the callback passed to its `ComputeAsync()` method
+  // was called).
+  void RecordComputeEnded() {
+    int64 now_nanos = Env::Default()->NowNanos();
+    DCHECK_NE(stats_->all_start_micros(), 0);
+    DCHECK_NE(stats_->all_start_nanos(), 0);
+    stats_->set_op_end_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
+                                  stats_->all_start_micros());
+    stats_->set_op_end_rel_nanos(now_nanos - stats_->all_start_nanos());
+  }
+
+  // Called immediately after this executor finishes processing this node.
+  void RecordExecutorEnded() {
+    int64 now_nanos = Env::Default()->NowNanos();
+    DCHECK_NE(stats_->all_start_micros(), 0);
+    DCHECK_NE(stats_->all_start_nanos(), 0);
+    stats_->set_all_end_rel_micros(now_nanos / EnvTime::kMicrosToNanos -
+                                   stats_->all_start_micros());
+    stats_->set_all_end_rel_nanos(now_nanos - stats_->all_start_nanos());
+  }
+
+  // Records information about the tensor produced by this node at the given
+  // output slot.
+  void SetOutput(int slot, const Tensor* v);
+
+  // Records information about the memory allocated during the execution of this
+  // node.
+  void SetMemory(OpKernelContext* ctx);
+
+  // Records information about the tensors that were accessed during the
+  // execution of this node.
+  void SetReferencedTensors(const TensorReferenceVector& tensors);
+
+  // Sets the timeline_label field of the wrapped NodeExecStats, using data
+  // from *node. Returns true iff the node is a transfer node.
+  bool SetTimelineLabel(const Node* node);
 
  private:
   friend class StepStatsCollector;
 
+  NodeExecStats* stats() { return stats_.get(); }
+
   // Populates stats_ and releases TrackingAllocator.
   void Finalize();
 
+  // Does not take ownership of the `allocator`.
+  // Takes ownership of `tracking_allocator`.
+  void AddAllocation(Allocator* allocator,
+                     TrackingAllocator* tracking_allocator);
+
   gtl::InlinedVector<std::pair<AllocatorMemoryUsed*, TrackingAllocator*>, 2>
       allocations_;
   std::unique_ptr<NodeExecStats> stats_;
 };
 
+// Statistics collection interface for individual node execution.
+//
+// See `StepStatsCollector` for a concrete implementation of this interface
+// that interfaces with the `Session` layer.
+class StepStatsCollectorInterface {
+ public:
+  virtual ~StepStatsCollectorInterface() {}
+
+  // Saves `stats` to the collector.
+  virtual void Save(const string& device, NodeExecStatsWrapper* stats) = 0;
+
+  // Generates a string reporting the currently used memory based
+  // on ResourceExhausted OOM `err` message.
+  // `err` message needs to contain device name and allocator name, e.g.:
+  // "ResourceExhaustedError: OOM when allocating tensor ...
+  // on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc"
+  virtual string ReportAllocsOnResourceExhausted(const string& err) = 0;
+};
+
 // StepStatsCollector manages the collection of a StepStats object.
 // The StepStats object holds multiple DeviceStats.
 // Each DeviceStats object holds multiple NodeExecStats.
-class StepStatsCollector {
+class StepStatsCollector : public StepStatsCollectorInterface {
  public:
   // Does not take ownership of `ss`.
   explicit StepStatsCollector(StepStats* ss);
@@ -80,14 +167,9 @@ class StepStatsCollector {
   // Save saves nt to the DeviceStats object associated with device.
   // Should be called before Finalize.
   void Save(const string& device, NodeExecStats* nt);
-  void Save(const string& device, NodeExecStatsWrapper* stats);
+  void Save(const string& device, NodeExecStatsWrapper* stats) override;
 
-  // Generates a string reporting the currently used memory based
-  // on ResourceExhausted OOM `err` message.
-  // `err` message needs to contain device name and allocator name, E.g.:
-  // "ResourceExhaustedError: OOM when allocating tensor ...
-  // on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc"
-  string ReportAllocsOnResourceExhausted(const string& err);
+  string ReportAllocsOnResourceExhausted(const string& err) override;
 
   // The following 2 Finalize methods populate the StepStats passed
   // from the constructor. Calling it more than once won't have any effect.
@@ -112,4 +194,4 @@ class StepStatsCollector {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_STEP_STATS_COLLECTOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_STEP_STATS_COLLECTOR_H_
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index 550f193332..cc5909de17 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -17,8 +17,8 @@ limitations under the License.
 #error This file must only be included when building TensorFlow with SYCL support
 #endif
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
-#define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/allocator.h"
@@ -72,4 +72,4 @@ class SYCLAllocator : public Allocator {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
index 8edf922d11..ae0563a96a 100644
--- a/tensorflow/core/common_runtime/visitable_allocator.h
+++ b/tensorflow/core/common_runtime/visitable_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
-#define TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
 
 #include <functional>
 #include "tensorflow/core/framework/allocator.h"
@@ -76,4 +76,4 @@ class TrackingVisitableAllocator : public TrackingAllocator,
   VisitableAllocator* allocator_;
 };
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
diff --git a/tensorflow/core/debug/debug_callback_registry.h b/tensorflow/core/debug/debug_callback_registry.h
index 8f08c656c2..bcd4ddc50c 100644
--- a/tensorflow/core/debug/debug_callback_registry.h
+++ b/tensorflow/core/debug/debug_callback_registry.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_DEBUG_CALLBACK_REGISTRY_H_
-#define TENSORFLOW_DEBUG_CALLBACK_REGISTRY_H_
+#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_CALLBACK_REGISTRY_H_
+#define TENSORFLOW_CORE_DEBUG_DEBUG_CALLBACK_REGISTRY_H_
 
 #include <functional>
 #include <map>
@@ -68,4 +68,4 @@ class DebugCallbackRegistry {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_DEBUG_CALLBACK_REGISTRY_H_
+#endif  // TENSORFLOW_CORE_DEBUG_DEBUG_CALLBACK_REGISTRY_H_
diff --git a/tensorflow/core/debug/debug_graph_utils.h b/tensorflow/core/debug/debug_graph_utils.h
index 64deff1f00..86dc90a134 100644
--- a/tensorflow/core/debug/debug_graph_utils.h
+++ b/tensorflow/core/debug/debug_graph_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_DEBUG_NODE_INSERTER_H_
-#define TENSORFLOW_DEBUG_NODE_INSERTER_H_
+#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_GRAPH_UTILS_H_
+#define TENSORFLOW_CORE_DEBUG_DEBUG_GRAPH_UTILS_H_
 
 #include <unordered_map>
 #include <vector>
@@ -123,4 +123,4 @@ class DebugNodeInserter {
 };
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_DEBUG_NODE_INSERTER_H_
+#endif  // TENSORFLOW_CORE_DEBUG_DEBUG_GRAPH_UTILS_H_
diff --git a/tensorflow/core/debug/debug_grpc_testlib.h b/tensorflow/core/debug/debug_grpc_testlib.h
index 8d3c9ff575..93376613b6 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.h
+++ b/tensorflow/core/debug/debug_grpc_testlib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_DEBUG_GRPC_TESTLIB_H_
-#define TENSORFLOW_DEBUG_GRPC_TESTLIB_H_
+#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_GRPC_TESTLIB_H_
+#define TENSORFLOW_CORE_DEBUG_DEBUG_GRPC_TESTLIB_H_
 
 #include <atomic>
 #include <unordered_set>
@@ -84,4 +84,4 @@ bool PollTillFirstRequestSucceeds(const string& server_url,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_DEBUG_GRPC_TESTLIB_H_
+#endif  // TENSORFLOW_CORE_DEBUG_DEBUG_GRPC_TESTLIB_H_
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index c974a47051..cedb7386b7 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_DEBUG_IO_UTILS_H_
-#define TENSORFLOW_DEBUG_IO_UTILS_H_
+#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_IO_UTILS_H_
+#define TENSORFLOW_CORE_DEBUG_DEBUG_IO_UTILS_H_
 
 #include <cstddef>
 #include <functional>
@@ -398,4 +398,4 @@ class DebugGrpcIO {
 }  // namespace tensorflow
 #endif  // #ifndef(PLATFORM_WINDOWS)
 
-#endif  // TENSORFLOW_DEBUG_IO_UTILS_H_
+#endif  // TENSORFLOW_CORE_DEBUG_DEBUG_IO_UTILS_H_
diff --git a/tensorflow/core/debug/debug_node_key.h b/tensorflow/core/debug/debug_node_key.h
index b46054c013..eaeb369790 100644
--- a/tensorflow/core/debug/debug_node_key.h
+++ b/tensorflow/core/debug/debug_node_key.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_DEBUG_NODE_KEY_H_
-#define TENSORFLOW_DEBUG_NODE_KEY_H_
+#ifndef TENSORFLOW_CORE_DEBUG_DEBUG_NODE_KEY_H_
+#define TENSORFLOW_CORE_DEBUG_DEBUG_NODE_KEY_H_
 
 #include <string>
 
@@ -48,4 +48,4 @@ struct DebugNodeKey {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_DEBUG_NODE_KEY_H_
+#endif  // TENSORFLOW_CORE_DEBUG_DEBUG_NODE_KEY_H_
diff --git a/tensorflow/core/debug/debugger_state_impl.h b/tensorflow/core/debug/debugger_state_impl.h
index 52e2663d08..8f6e53fafe 100644
--- a/tensorflow/core/debug/debugger_state_impl.h
+++ b/tensorflow/core/debug/debugger_state_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_DEBUGGER_STATE_IMPL_H_
-#define TENSORFLOW_DEBUGGER_STATE_IMPL_H_
+#ifndef TENSORFLOW_CORE_DEBUG_DEBUGGER_STATE_IMPL_H_
+#define TENSORFLOW_CORE_DEBUG_DEBUGGER_STATE_IMPL_H_
 
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
 
@@ -58,4 +58,4 @@ class DebugGraphDecorator : public DebugGraphDecoratorInterface {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_DEBUGGER_STATE_IMPL_H_
+#endif  // TENSORFLOW_CORE_DEBUG_DEBUGGER_STATE_IMPL_H_
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index b2192c5a80..37029f3f1a 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -562,6 +562,7 @@ cc_library(
     deps = [
         ":worker_cache",
         ":worker_interface",
+        "//tensorflow/core:framework",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index a48f734d3e..269f620e42 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -53,6 +53,7 @@ limitations under the License.
 #include "tensorflow/core/protobuf/master.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 #include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 
@@ -167,13 +168,55 @@ class DeviceFinder {
     }
     // Enumerates all known workers' target. A target name is a
     // prefix of a device name. E.g., /job:mnist/replica:0/task:10.
-    CHECK_GT(env_->local_devices.size(), 0) << "No local devices provided.";
-    const string& local_device_name = env_->local_devices[0]->name();
-    std::vector<string> workers;
-    worker_cache->ListWorkers(&workers);
     if (filters_.empty()) {
+      // If no filters were specified, we list all known workers in
+      // `worker_cache`.
+      std::vector<string> workers;
+      worker_cache->ListWorkers(&workers);
       std::swap(workers, targets_);
     } else {
+      // When applying filters, we must include the local worker, even if it
+      // does not match any of the filters.
+      CHECK_GT(env_->local_devices.size(), 0) << "No local devices provided.";
+      const string& local_device_name = env_->local_devices[0]->name();
+      DeviceNameUtils::ParsedName local_parsed_name;
+      CHECK(DeviceNameUtils::ParseFullName(local_device_name,
+                                           &local_parsed_name));
+      bool all_filters_have_job = true;
+      std::unordered_set<string> filter_job_names({local_parsed_name.job});
+      for (const DeviceNameUtils::ParsedName& filter : filters_) {
+        all_filters_have_job = all_filters_have_job && filter.has_job;
+        if (filter.has_job) {
+          filter_job_names.insert(filter.job);
+        }
+      }
+
+      std::vector<string> workers;
+      if (all_filters_have_job) {
+        // If all of the device filters have a job specified, then we only need
+        // to list the workers in the jobs named in the filter, because a worker
+        // in any other job would not match any filter.
+        for (const string& job_name : filter_job_names) {
+          VLOG(2) << "Selectively listing workers in job: " << job_name;
+          std::vector<string> workers_in_job;
+          worker_cache->ListWorkersInJob(job_name, &workers_in_job);
+          workers.insert(workers.end(), workers_in_job.begin(),
+                         workers_in_job.end());
+        }
+      } else {
+        // If any of the device filters does not have a job specified, then we
+        // must list the workers from all jobs.
+        VLOG(2) << "Listing workers in all jobs because some device "
+                << "filter has no job specified. Filters were:";
+        if (device_filters.empty()) {
+          VLOG(2) << "- <NO FILTERS>";
+        } else {
+          for (const string& filter : device_filters) {
+            VLOG(2) << "- " << filter;
+          }
+        }
+        worker_cache->ListWorkers(&workers);
+      }
       for (const string& name : workers) {
         if (MatchFilters(name) ||
             DeviceNameUtils::IsSameAddressSpace(name, local_device_name)) {
diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h
index da26c42aca..837ccd1dd4 100644
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@@ -99,4 +99,4 @@ struct MasterEnv {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MASTER_ENV_H_
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 72a0c7edd8..474ac0e186 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -721,4 +721,4 @@ class NonOwnedProtoRunStepResponse : public MutableRunStepResponseWrapper {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index b7eb3c9015..456c30ecf4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -163,6 +163,13 @@ class MultiGrpcChannelCache : public CachingGrpcChannelCache {
     }
   }
 
+  void ListWorkersInJob(const string& job_name,
+                        std::vector<string>* workers) override {
+    for (GrpcChannelCache* cache : caches_) {
+      cache->ListWorkersInJob(job_name, workers);
+    }
+  }
+
   string TranslateTask(const string& target) override {
     mutex_lock l(mu_);  // could use reader lock
     GrpcChannelCache* cache = gtl::FindPtrOrNull(target_caches_, target);
@@ -223,6 +230,13 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
     }
   }
 
+  void ListWorkersInJob(const string& job_name,
+                        std::vector<string>* workers) override {
+    if (job_name == job_id_) {
+      ListWorkers(workers);
+    }
+  }
+
   string TranslateTask(const string& target) override {
     DeviceNameUtils::ParsedName parsed;
     if (!DeviceNameUtils::ParseFullName(target, &parsed)) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
index 4861cdb691..6fa99d7b14 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
@@ -66,6 +66,8 @@ class GrpcChannelCache {
   //  /job:<job identifier>/task:<task id>
   // e.g. /job:mnist/task:2
   virtual void ListWorkers(std::vector<string>* workers) = 0;
+  virtual void ListWorkersInJob(const string& job_name,
+                                std::vector<string>* workers) = 0;
 
   // If found, returns a gRPC channel that is connected to the remote
   // worker named by 'target'. 'target' is of the following
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
index f07a5a0974..a814ef85e2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
@@ -89,13 +89,33 @@ TEST(GrpcChannelTest, HostPorts) {
     EXPECT_NE(d_4_1.get(), e_5_2.get());
   }
 
-  std::vector<string> workers;
-  cc->ListWorkers(&workers);
-  EXPECT_EQ(std::vector<string>(
-                {"/job:mnist/replica:0/task:0", "/job:mnist/replica:0/task:1",
-                 "/job:mnist/replica:0/task:2", "/job:mnist/replica:0/task:3",
-                 "/job:mnist/replica:0/task:4", "/job:mnist/replica:0/task:5"}),
-            workers);
+  {
+    std::vector<string> workers;
+    cc->ListWorkers(&workers);
+    EXPECT_EQ(
+        std::vector<string>(
+            {"/job:mnist/replica:0/task:0", "/job:mnist/replica:0/task:1",
+             "/job:mnist/replica:0/task:2", "/job:mnist/replica:0/task:3",
+             "/job:mnist/replica:0/task:4", "/job:mnist/replica:0/task:5"}),
+        workers);
+  }
+
+  {
+    std::vector<string> workers;
+    cc->ListWorkersInJob("mnist", &workers);
+    EXPECT_EQ(
+        std::vector<string>(
+            {"/job:mnist/replica:0/task:0", "/job:mnist/replica:0/task:1",
+             "/job:mnist/replica:0/task:2", "/job:mnist/replica:0/task:3",
+             "/job:mnist/replica:0/task:4", "/job:mnist/replica:0/task:5"}),
+        workers);
+  }
+
+  {
+    std::vector<string> workers;
+    cc->ListWorkersInJob("other", &workers);
+    EXPECT_TRUE(workers.empty());
+  }
 }
 
 TEST(GrpcChannelTest, SparseHostPorts) {
@@ -135,13 +155,30 @@ TEST(GrpcChannelTest, SparseHostPorts) {
     EXPECT_NE(d_4_1.get(), e_5_2.get());
   }
 
-  std::vector<string> workers;
-  cc->ListWorkers(&workers);
-  std::sort(workers.begin(), workers.end());
-  EXPECT_EQ(std::vector<string>({"/job:mnist/replica:0/task:0",
-                                 "/job:mnist/replica:0/task:3",
-                                 "/job:mnist/replica:0/task:4"}),
-            workers);
+  {
+    std::vector<string> workers;
+    cc->ListWorkers(&workers);
+    std::sort(workers.begin(), workers.end());
+    EXPECT_EQ(std::vector<string>({"/job:mnist/replica:0/task:0",
+                                   "/job:mnist/replica:0/task:3",
+                                   "/job:mnist/replica:0/task:4"}),
+              workers);
+  }
+
+  {
+    std::vector<string> workers;
+    cc->ListWorkersInJob("mnist", &workers);
+    EXPECT_EQ(std::vector<string>({"/job:mnist/replica:0/task:0",
+                                   "/job:mnist/replica:0/task:3",
+                                   "/job:mnist/replica:0/task:4"}),
+              workers);
+  }
+
+  {
+    std::vector<string> workers;
+    cc->ListWorkersInJob("other", &workers);
+    EXPECT_TRUE(workers.empty());
+  }
 }
 
 TEST(GrpcChannelTest, NewHostPortGrpcChannelValidation) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
index 709c3833e7..b85c1dc5b4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
-#define TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
 
 #include <memory>
 
@@ -35,4 +35,4 @@ WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index bcd46a4c06..c4f2247145 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -190,6 +190,8 @@ Status GrpcServer::Init(
   builder.SetMaxMessageSize(std::numeric_limits<int32>::max());
   builder.SetOption(
       std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
+  // Allow subclasses to specify more args to pass to the gRPC server.
+  MaybeMutateBuilder(&builder);
   master_impl_ = CreateMaster(&master_env_);
   master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
   worker_impl_ =
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 3366246afb..7979e96d3e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -59,6 +59,9 @@ typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*)>
 class GrpcServer : public ServerInterface {
  protected:
   GrpcServer(const ServerDef& server_def, Env* env);
+  // Allow children classes to override this and provide custom args to the
+  // server before it is constructed. Default behavior is to do nothing.
+  virtual void MaybeMutateBuilder(::grpc::ServerBuilder* builder) {}
 
  public:
   static Status Create(const ServerDef& server_def, Env* env,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
index d5baaae353..98164e750b 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
-#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TESTLIB_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TESTLIB_H_
 
 #include <memory>
 #include <string>
@@ -71,4 +71,4 @@ class TestCluster {
 }  // end namespace test
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_TESTLIB_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index b9f21ea211..e1541db69b 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -54,6 +54,11 @@ class GrpcWorkerCache : public WorkerCachePartial {
     channel_cache_->ListWorkers(workers);
   }
 
+  void ListWorkersInJob(const string& job_name,
+                        std::vector<string>* workers) const override {
+    channel_cache_->ListWorkersInJob(job_name, workers);
+  }
+
   WorkerInterface* CreateWorker(const string& target) override {
     if (target == local_target_) {
       return local_worker_;
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
index 25ff6512a0..b070dd13dd 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
@@ -50,6 +50,8 @@ namespace {
 // Fake cache implementation for WorkerEnv.
 class DummyWorkerCache : public WorkerCacheInterface {
   void ListWorkers(std::vector<string>* workers) const override {}
+  void ListWorkersInJob(const string& job_name,
+                        std::vector<string>* workers) const override {}
   WorkerInterface* CreateWorker(const string& target) override {
     return nullptr;
   }
diff --git a/tensorflow/core/distributed_runtime/test_utils.h b/tensorflow/core/distributed_runtime/test_utils.h
index 48d83845dd..88a97da34d 100644
--- a/tensorflow/core/distributed_runtime/test_utils.h
+++ b/tensorflow/core/distributed_runtime/test_utils.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <unordered_map>
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 
@@ -138,6 +139,19 @@ class TestWorkerCache : public WorkerCacheInterface {
     }
   }
 
+  void ListWorkersInJob(const string& job_name,
+                        std::vector<string>* workers) const override {
+    workers->clear();
+    for (auto it : workers_) {
+      DeviceNameUtils::ParsedName device_name;
+      CHECK(DeviceNameUtils::ParseFullName(it.first, &device_name));
+      CHECK(device_name.has_job);
+      if (job_name == device_name.job) {
+        workers->push_back(it.first);
+      }
+    }
+  }
+
   WorkerInterface* CreateWorker(const string& target) override {
     auto it = workers_.find(target);
     if (it != workers_.end()) {
diff --git a/tensorflow/core/distributed_runtime/worker_cache.h b/tensorflow/core/distributed_runtime/worker_cache.h
index 8521f8956b..0c8575b4d5 100644
--- a/tensorflow/core/distributed_runtime/worker_cache.h
+++ b/tensorflow/core/distributed_runtime/worker_cache.h
@@ -36,6 +36,8 @@ class WorkerCacheInterface {
   // Updates *workers with strings naming the remote worker tasks to
   // which open channels have been established.
   virtual void ListWorkers(std::vector<string>* workers) const = 0;
+  virtual void ListWorkersInJob(const string& job_name,
+                                std::vector<string>* workers) const = 0;
 
   // If "target" names a remote task for which an RPC channel exists
   // or can be constructed, returns a pointer to a WorkerInterface object
diff --git a/tensorflow/core/distributed_runtime/worker_cache_wrapper.h b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
index 43c3b6285b..1f309b4361 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
+++ b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
@@ -32,6 +32,10 @@ class WorkerCacheWrapper : public WorkerCacheInterface {
   virtual void ListWorkers(std::vector<string>* workers) const {
     return wrapped_->ListWorkers(workers);
   }
+  virtual void ListWorkersInJob(const string& job_name,
+                                std::vector<string>* workers) const {
+    return wrapped_->ListWorkersInJob(job_name, workers);
+  }
 
   // If "target" names a remote task for which an RPC channel exists
   // or can be constructed, returns a pointer to a WorkerInterface object
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index ca6dc1b1de..c7d0c6b7f3 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -35,6 +35,11 @@ class WorkerFreeListCache : public WorkerCacheInterface {
     wrapped_->ListWorkers(workers);
   }
 
+  void ListWorkersInJob(const string& job_name,
+                        std::vector<string>* workers) const override {
+    wrapped_->ListWorkersInJob(job_name, workers);
+  }
+
   WorkerInterface* CreateWorker(const string& target) override {
     mutex_lock l(mu_);
     auto p = workers_.find(target);
diff --git a/tensorflow/core/example/example_parser_configuration.h b/tensorflow/core/example/example_parser_configuration.h
index 3d06bd55e2..8bbed28471 100644
--- a/tensorflow/core/example/example_parser_configuration.h
+++ b/tensorflow/core/example/example_parser_configuration.h
@@ -53,4 +53,4 @@ Status ExampleParserConfigurationProtoToFeatureVectors(
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSE_CONFIGURATION_H_
+#endif  // TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h
index 2265498b5e..ec93b9aad9 100644
--- a/tensorflow/core/example/feature_util.h
+++ b/tensorflow/core/example/feature_util.h
@@ -97,8 +97,8 @@ limitations under the License.
 //   GetFeatureValues<FeatureType>(feature) -> RepeatedField<FeatureType>
 //     Returns values of the feature for the FeatureType.
 
-#ifndef TENSORFLOW_EXAMPLE_FEATURE_H_
-#define TENSORFLOW_EXAMPLE_FEATURE_H_
+#ifndef TENSORFLOW_CORE_EXAMPLE_FEATURE_UTIL_H_
+#define TENSORFLOW_CORE_EXAMPLE_FEATURE_UTIL_H_
 
 #include <iterator>
 #include <type_traits>
@@ -322,4 +322,4 @@ bool ExampleHasFeature(const string& key, const Example& example) {
 }
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_EXAMPLE_FEATURE_H_
+#endif  // TENSORFLOW_CORE_EXAMPLE_FEATURE_UTIL_H_
diff --git a/tensorflow/core/framework/attr_value_util.h b/tensorflow/core/framework/attr_value_util.h
index 0da9b1081b..9fce488793 100644
--- a/tensorflow/core/framework/attr_value_util.h
+++ b/tensorflow/core/framework/attr_value_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_
-#define TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_ATTR_VALUE_UTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_ATTR_VALUE_UTIL_H_
 
 #include <functional>
 #include <string>
@@ -126,4 +126,4 @@ bool SubstitutePlaceholders(const SubstituteFunc& substitute, AttrValue* value);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_ATTR_VALUE_UTIL_H_
diff --git a/tensorflow/core/framework/bfloat16.h b/tensorflow/core/framework/bfloat16.h
index 2f79d0fa70..e9e94024f5 100644
--- a/tensorflow/core/framework/bfloat16.h
+++ b/tensorflow/core/framework/bfloat16.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_BFLOAT16_H_
-#define TENSORFLOW_FRAMEWORK_BFLOAT16_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_BFLOAT16_H_
+#define TENSORFLOW_CORE_FRAMEWORK_BFLOAT16_H_
 
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/byte_order.h"
@@ -60,4 +60,4 @@ void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_BFLOAT16_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_BFLOAT16_H_
diff --git a/tensorflow/core/framework/cancellation.h b/tensorflow/core/framework/cancellation.h
index 90074c87b2..acdaaf6a90 100644
--- a/tensorflow/core/framework/cancellation.h
+++ b/tensorflow/core/framework/cancellation.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_CANCELLATION_H_
-#define TENSORFLOW_FRAMEWORK_CANCELLATION_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_CANCELLATION_H_
+#define TENSORFLOW_CORE_FRAMEWORK_CANCELLATION_H_
 
 #include <atomic>
 #include <functional>
@@ -134,4 +134,4 @@ class CancellationManager {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_CANCELLATION_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_CANCELLATION_H_
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index c3e6388e28..0b37b3a88c 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
-#define TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_COLLECTIVE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_COLLECTIVE_H_
 
 #include <string>
 #include <vector>
@@ -308,4 +308,4 @@ class PerStepCollectiveRemoteAccess : public CollectiveRemoteAccess {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_COLLECTIVE_H_
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 2bedce1d6a..e6f9f935f9 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
-#define TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_COMMON_SHAPE_FNS_H_
+#define TENSORFLOW_CORE_FRAMEWORK_COMMON_SHAPE_FNS_H_
 
 #include <array>
 
@@ -311,4 +311,4 @@ Status ExplicitShapes(InferenceContext* c);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_COMMON_SHAPE_FNS_H_
diff --git a/tensorflow/core/framework/control_flow.h b/tensorflow/core/framework/control_flow.h
index 4dad0b4fef..4839e02e22 100644
--- a/tensorflow/core/framework/control_flow.h
+++ b/tensorflow/core/framework/control_flow.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_CONTROL_FLOW_H_
-#define TENSORFLOW_FRAMEWORK_CONTROL_FLOW_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_CONTROL_FLOW_H_
+#define TENSORFLOW_CORE_FRAMEWORK_CONTROL_FLOW_H_
 
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -55,4 +55,4 @@ struct FrameAndIterHash {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_CONTROL_FLOW_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_CONTROL_FLOW_H_
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index b184fd91e1..794250a2c1 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -89,6 +89,15 @@ class DeviceContext : public core::RefCounted {
                                      Tensor* cpu_tensor, StatusCallback done) {
     done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
   }
+
+  // If possible, wait for all events on *stream to complete then execute func.
+  // A non-OK Status is returned otherwise.  The stream argument should be the
+  // one provided by GpuDeviceInfo.  This function is not applicable to devices
+  // that don't provide such a value.
+  virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
+                             std::function<void()> func) {
+    return errors::Internal("ThenExecute not supported by device");
+  }
 };
 
 // map[i] is the DeviceContext* for the node with id i, if i < map.size().
diff --git a/tensorflow/core/framework/fake_input.h b/tensorflow/core/framework/fake_input.h
index 103db47a99..c3062762ff 100644
--- a/tensorflow/core/framework/fake_input.h
+++ b/tensorflow/core/framework/fake_input.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_FAKE_INPUT_H_
-#define TENSORFLOW_FRAMEWORK_FAKE_INPUT_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_
+#define TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_
 
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/types.h"
@@ -37,4 +37,4 @@ inline FakeInputFunctor FakeInput(std::initializer_list<DataType> dts) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_FAKE_INPUT_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index c81f4a4450..56e2017a61 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_FUNCTION_H_
-#define TENSORFLOW_FRAMEWORK_FUNCTION_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_FUNCTION_H_
+#define TENSORFLOW_CORE_FRAMEWORK_FUNCTION_H_
 
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -41,7 +41,7 @@ class ProcessFunctionLibraryRuntime;
 class ResourceMgr;
 class Rendezvous;
 class ScopedStepContainer;
-class StepStatsCollector;
+class StepStatsCollectorInterface;
 class Node;
 
 // FunctionDefHelper::Create is a convenient helper to construct a
@@ -490,6 +490,11 @@ class FunctionLibraryRuntime {
     // Instantiates the function using an executor of the given type. If empty,
     // the default TensorFlow executor will be used.
     string executor_type;
+
+    // If true, the runtime will attempt to create kernels for the function at
+    // instantiation time, rather than on the first run. This can be used to
+    // surface errors earlier.
+    bool create_kernels_eagerly = false;
   };
   typedef uint64 Handle;
   virtual Status Instantiate(const string& function_name, AttrSlice attrs,
@@ -527,7 +532,7 @@ class FunctionLibraryRuntime {
     CancellationManager* cancellation_manager = nullptr;
     CollectiveExecutor* collective_executor = nullptr;
     ScopedStepContainer* step_container = nullptr;
-    StepStatsCollector* stats_collector = nullptr;
+    StepStatsCollectorInterface* stats_collector = nullptr;
 
     std::function<void(std::function<void()>)>* runner = nullptr;
 
@@ -731,4 +736,4 @@ GET_ATTR(bool)
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_FUNCTION_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_FUNCTION_H_
diff --git a/tensorflow/core/framework/graph_def_util.h b/tensorflow/core/framework/graph_def_util.h
index 525e84a989..2f8d5e8f51 100644
--- a/tensorflow/core/framework/graph_def_util.h
+++ b/tensorflow/core/framework/graph_def_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_GRAPH_DEF_UTIL_H_
-#define TENSORFLOW_FRAMEWORK_GRAPH_DEF_UTIL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_GRAPH_DEF_UTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_GRAPH_DEF_UTIL_H_
 
 #include <set>
 #include "tensorflow/core/framework/op.h"
@@ -118,4 +118,4 @@ Status StrippedOpListForGraph(const GraphDef& graph_def,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_GRAPH_DEF_UTIL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_GRAPH_DEF_UTIL_H_
diff --git a/tensorflow/core/framework/kernel_def_builder.h b/tensorflow/core/framework/kernel_def_builder.h
index 2966aa58de..32dd21f94e 100644
--- a/tensorflow/core/framework/kernel_def_builder.h
+++ b/tensorflow/core/framework/kernel_def_builder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_KERNEL_DEF_BUILDER_H_
-#define TENSORFLOW_FRAMEWORK_KERNEL_DEF_BUILDER_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_BUILDER_H_
+#define TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_BUILDER_H_
 
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -84,4 +84,4 @@ KernelDefBuilder& KernelDefBuilder::TypeConstraint(const char* attr_name) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_KERNEL_DEF_BUILDER_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_KERNEL_DEF_BUILDER_H_
diff --git a/tensorflow/core/framework/log_memory.h b/tensorflow/core/framework/log_memory.h
index faef7b8e98..1b926ddaa3 100644
--- a/tensorflow/core/framework/log_memory.h
+++ b/tensorflow/core/framework/log_memory.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_LOG_MEMORY_H_
-#define TENSORFLOW_FRAMEWORK_LOG_MEMORY_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_LOG_MEMORY_H_
+#define TENSORFLOW_CORE_FRAMEWORK_LOG_MEMORY_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -108,4 +108,4 @@ class LogMemory {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_LOG_MEMORY_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_LOG_MEMORY_H_
diff --git a/tensorflow/core/framework/lookup_interface.h b/tensorflow/core/framework/lookup_interface.h
index 1381dd66a5..0622dd06cb 100644
--- a/tensorflow/core/framework/lookup_interface.h
+++ b/tensorflow/core/framework/lookup_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_LOOKUP_INTERFACE_H_
-#define TENSORFLOW_FRAMEWORK_LOOKUP_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_LOOKUP_INTERFACE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_LOOKUP_INTERFACE_H_
 
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -142,4 +142,4 @@ class LookupInterface : public ResourceBase {
 }  // namespace lookup
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_LOOKUP_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_LOOKUP_INTERFACE_H_
diff --git a/tensorflow/core/framework/memory_types.h b/tensorflow/core/framework/memory_types.h
index d3918513d3..f719131bcb 100644
--- a/tensorflow/core/framework/memory_types.h
+++ b/tensorflow/core/framework/memory_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_MEMORY_TYPES_H_
-#define TENSORFLOW_FRAMEWORK_MEMORY_TYPES_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_MEMORY_TYPES_H_
+#define TENSORFLOW_CORE_FRAMEWORK_MEMORY_TYPES_H_
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
@@ -35,4 +35,4 @@ Status MemoryTypesForNode(const OpRegistryInterface* op_registry,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_MEMORY_TYPES_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_MEMORY_TYPES_H_
diff --git a/tensorflow/core/framework/node_def_builder.h b/tensorflow/core/framework/node_def_builder.h
index c138332beb..ad07ec5480 100644
--- a/tensorflow/core/framework/node_def_builder.h
+++ b/tensorflow/core/framework/node_def_builder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_NODE_DEF_BUILDER_H_
-#define TENSORFLOW_FRAMEWORK_NODE_DEF_BUILDER_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_BUILDER_H_
+#define TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_BUILDER_H_
 
 #include <functional>
 #include <vector>
@@ -175,4 +175,4 @@ class NodeDefBuilder {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_NODE_DEF_BUILDER_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_BUILDER_H_
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index c012b7c3d3..499034cab2 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_NODE_DEF_UTIL_H_
-#define TENSORFLOW_FRAMEWORK_NODE_DEF_UTIL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_UTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_UTIL_H_
 
 #include <string>
 #include <unordered_map>
@@ -312,4 +312,4 @@ Status AddPrefixAndSuffixToNode(StringPiece prefix, StringPiece suffix,
                                 NodeDef* node_def);
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_NODE_DEF_UTIL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_NODE_DEF_UTIL_H_
diff --git a/tensorflow/core/framework/numeric_op.h b/tensorflow/core/framework/numeric_op.h
index 4538ff053c..0167e21f11 100644
--- a/tensorflow/core/framework/numeric_op.h
+++ b/tensorflow/core/framework/numeric_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_NUMERIC_OP_H_
-#define TENSORFLOW_FRAMEWORK_NUMERIC_OP_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_NUMERIC_OP_H_
+#define TENSORFLOW_CORE_FRAMEWORK_NUMERIC_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -110,4 +110,4 @@ class BinaryElementWiseOp : public BinaryOp<T> {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_NUMERIC_OP_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_NUMERIC_OP_H_
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index b1d0127809..3236d1897c 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
-#define TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_NUMERIC_TYPES_H_
+#define TENSORFLOW_CORE_FRAMEWORK_NUMERIC_TYPES_H_
 
 #include <complex>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -122,4 +122,4 @@ struct hash<Eigen::half> {
 }  // namespace std
 #endif  // _MSC_VER
 
-#endif  // TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_NUMERIC_TYPES_H_
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index 3ccca4090d..25f8de8dcc 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_OP_H_
-#define TENSORFLOW_FRAMEWORK_OP_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_H_
+#define TENSORFLOW_CORE_FRAMEWORK_OP_H_
 
 #include <functional>
 #include <unordered_map>
@@ -309,4 +309,4 @@ struct OpDefBuilderReceiver {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_OP_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_OP_H_
diff --git a/tensorflow/core/framework/op_def_builder.h b/tensorflow/core/framework/op_def_builder.h
index fbfb4018aa..0b39d6e848 100644
--- a/tensorflow/core/framework/op_def_builder.h
+++ b/tensorflow/core/framework/op_def_builder.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Class and associated machinery for specifying an Op's OpDef and shape
 // inference function for Op registration.
 
-#ifndef TENSORFLOW_FRAMEWORK_OP_DEF_BUILDER_H_
-#define TENSORFLOW_FRAMEWORK_OP_DEF_BUILDER_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_DEF_BUILDER_H_
+#define TENSORFLOW_CORE_FRAMEWORK_OP_DEF_BUILDER_H_
 
 #include <string>
 #include <vector>
@@ -162,4 +162,4 @@ class OpDefBuilder {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_OP_DEF_BUILDER_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_OP_DEF_BUILDER_H_
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index 9be0dc69d2..3597f43d51 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -172,6 +172,15 @@ const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def) {
   return nullptr;
 }
 
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
+  }
+  return nullptr;
+}
+
 #define VALIDATE(EXPR, ...)                                            \
   do {                                                                 \
     if (!(EXPR)) {                                                     \
diff --git a/tensorflow/core/framework/op_def_util.h b/tensorflow/core/framework/op_def_util.h
index 0ba1325a03..85afe2bdea 100644
--- a/tensorflow/core/framework/op_def_util.h
+++ b/tensorflow/core/framework/op_def_util.h
@@ -16,10 +16,11 @@ limitations under the License.
 // TODO(josh11b): Probably not needed for OpKernel authors, so doesn't
 // need to be as publicly accessible as other files in framework/.
 
-#ifndef TENSORFLOW_FRAMEWORK_OP_DEF_UTIL_H_
-#define TENSORFLOW_FRAMEWORK_OP_DEF_UTIL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_DEF_UTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_OP_DEF_UTIL_H_
 
 #include <string>
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -47,6 +48,10 @@ OpDef::AttrDef* FindAttrMutable(StringPiece name, OpDef* op_def);
 // Returns nullptr if no such attr is found.
 const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def);
 
+// Searches api_def for input argument with the indicated name.
+// Returns nullptr if no such attr is found.
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def);
+
 // Produce a human-readable version of an op_def that is more concise
 // than a text-format proto.  Excludes descriptions.
 string SummarizeOpDef(const OpDef& op_def);
@@ -98,4 +103,4 @@ uint64 OpDefHash(const OpDef& o);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_OP_DEF_UTIL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_OP_DEF_UTIL_H_
diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h
index 533dd64805..c269e2df04 100644
--- a/tensorflow/core/framework/op_gen_lib.h
+++ b/tensorflow/core/framework/op_gen_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_OP_GEN_LIB_H_
-#define TENSORFLOW_FRAMEWORK_OP_GEN_LIB_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_GEN_LIB_H_
+#define TENSORFLOW_CORE_FRAMEWORK_OP_GEN_LIB_H_
 
 #include <string>
 #include <unordered_map>
@@ -97,4 +97,4 @@ class ApiDefMap {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_OP_GEN_LIB_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_OP_GEN_LIB_H_
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index aab95b785b..e752599de1 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -70,7 +70,7 @@ class OpRegistryInterface;
 class ResourceMgr;
 class ScopedStepContainer;
 class CollectiveExecutor;
-class StepStatsCollector;
+class StepStatsCollectorInterface;
 
 class OpKernel {
  public:
@@ -569,7 +569,7 @@ class OpKernelContext {
     CallFrameInterface* call_frame = nullptr;
     FunctionLibraryRuntime* function_library = nullptr;
     std::function<void(std::function<void()>)>* runner = nullptr;
-    StepStatsCollector* stats_collector = nullptr;
+    StepStatsCollectorInterface* stats_collector = nullptr;
 
     // TensorSliceReaderCache support.
     checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr;
@@ -984,7 +984,7 @@ class OpKernelContext {
   std::function<void(std::function<void()>)>* runner() const {
     return params_->runner;
   }
-  StepStatsCollector* stats_collector() const {
+  StepStatsCollectorInterface* stats_collector() const {
     return params_->stats_collector;
   }
 
diff --git a/tensorflow/core/framework/queue_interface.h b/tensorflow/core/framework/queue_interface.h
index 4aeaab3d9b..4ca4416c5a 100644
--- a/tensorflow/core/framework/queue_interface.h
+++ b/tensorflow/core/framework/queue_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_QUEUE_INTERFACE_H_
-#define TENSORFLOW_FRAMEWORK_QUEUE_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_QUEUE_INTERFACE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_QUEUE_INTERFACE_H_
 
 #include <string>
 #include <vector>
@@ -99,4 +99,4 @@ class QueueInterface : public ResourceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_QUEUE_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_QUEUE_INTERFACE_H_
diff --git a/tensorflow/core/framework/reader_base.h b/tensorflow/core/framework/reader_base.h
index cb44be4dee..5b82e9181f 100644
--- a/tensorflow/core/framework/reader_base.h
+++ b/tensorflow/core/framework/reader_base.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_READER_BASE_H_
-#define TENSORFLOW_FRAMEWORK_READER_BASE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_READER_BASE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_READER_BASE_H_
 
 #include <memory>
 #include <string>
@@ -135,4 +135,4 @@ class ReaderBase : public ReaderInterface {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_READER_BASE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_READER_BASE_H_
diff --git a/tensorflow/core/framework/reader_interface.h b/tensorflow/core/framework/reader_interface.h
index dac6056b5a..f894acbe1d 100644
--- a/tensorflow/core/framework/reader_interface.h
+++ b/tensorflow/core/framework/reader_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_READER_INTERFACE_H_
-#define TENSORFLOW_FRAMEWORK_READER_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_READER_INTERFACE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_READER_INTERFACE_H_
 
 #include <memory>
 #include <string>
@@ -84,4 +84,4 @@ class ReaderInterface : public ResourceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_READER_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_READER_INTERFACE_H_
diff --git a/tensorflow/core/framework/reader_op_kernel.h b/tensorflow/core/framework/reader_op_kernel.h
index ffd6a1a184..e65a8695be 100644
--- a/tensorflow/core/framework/reader_op_kernel.h
+++ b/tensorflow/core/framework/reader_op_kernel.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_READER_OP_KERNEL_H_
-#define TENSORFLOW_FRAMEWORK_READER_OP_KERNEL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_READER_OP_KERNEL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_READER_OP_KERNEL_H_
 
 #include <functional>
 #include <string>
@@ -85,4 +85,4 @@ class ReaderOpKernel : public ResourceOpKernel<ReaderInterface> {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_READER_OP_KERNEL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_READER_OP_KERNEL_H_
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index f1cd37ecda..ddb5b10c18 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_
-#define TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_H_
+#define TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_H_
 // This file is used by cuda code and must remain compilable by nvcc.
 
 #include "tensorflow/core/framework/numeric_types.h"
@@ -161,9 +161,12 @@ limitations under the License.
   TF_CALL_int64(m) TF_CALL_int32(m) TF_CALL_uint16(m) TF_CALL_int16(m) \
       TF_CALL_uint8(m) TF_CALL_int8(m)
 
+#define TF_CALL_FLOAT_TYPES(m) \
+  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)
+
 #define TF_CALL_REAL_NUMBER_TYPES(m) \
   TF_CALL_INTEGRAL_TYPES(m)          \
-  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)
+  TF_CALL_FLOAT_TYPES(m)
 
 #define TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \
   TF_CALL_INTEGRAL_TYPES(m) TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m)
@@ -225,4 +228,4 @@ limitations under the License.
 #define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) TF_CALL_SYCL_double(m)
 #endif  // __ANDROID_TYPES_SLIM__
 
-#endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_H_
diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h
index ab35c2f095..d475a1972d 100644
--- a/tensorflow/core/framework/register_types_traits.h
+++ b/tensorflow/core/framework/register_types_traits.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
-#define TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
+#define TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
 // This file is used by cuda code and must remain compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -102,4 +102,4 @@ struct proxy_type {
 #endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 33d4cb77ff..f8a587c9b5 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_RESOURCE_MGR_H_
-#define TENSORFLOW_FRAMEWORK_RESOURCE_MGR_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_MGR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_MGR_H_
 
 #include <string>
 #include <typeindex>
@@ -61,8 +61,8 @@ namespace tensorflow {
 //
 //   // Create a var.
 //   MyVar* my_var = new MyVar;
-//   my_var.val = Tensor(DT_FLOAT, my_shape);
-//   my_var.val.flat<float>().setZeros();   // 0 initialized.
+//   my_var->val = Tensor(DT_FLOAT, my_shape);
+//   my_var->val.flat<float>().setZeros();   // 0 initialized.
 //   ctx->SetStatus(rm.Create("my_container", "my_name", my_var));
 //
 //   // += a variable.
@@ -555,4 +555,4 @@ void ResourceHandleOp<T>::Compute(OpKernelContext* ctx) {
 
 }  //  end namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_RESOURCE_MGR_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_MGR_H_
diff --git a/tensorflow/core/framework/resource_op_kernel.h b/tensorflow/core/framework/resource_op_kernel.h
index 0a8da8b3bf..fbcd439dea 100644
--- a/tensorflow/core/framework/resource_op_kernel.h
+++ b/tensorflow/core/framework/resource_op_kernel.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_RESOURCE_OP_KERNEL_H_
-#define TENSORFLOW_FRAMEWORK_RESOURCE_OP_KERNEL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_RESOURCE_OP_KERNEL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_RESOURCE_OP_KERNEL_H_
 
 #include <string>
 
@@ -136,4 +136,4 @@ class ResourceOpKernel : public OpKernel {
 };
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_RESOURCE_OP_KERNEL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_RESOURCE_OP_KERNEL_H_
diff --git a/tensorflow/core/framework/selective_registration.h b/tensorflow/core/framework/selective_registration.h
index 503947969d..4b281a04bf 100644
--- a/tensorflow/core/framework/selective_registration.h
+++ b/tensorflow/core/framework/selective_registration.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_SELECTIVE_REGISTRATION_H_
-#define TENSORFLOW_FRAMEWORK_SELECTIVE_REGISTRATION_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_SELECTIVE_REGISTRATION_H_
+#define TENSORFLOW_CORE_FRAMEWORK_SELECTIVE_REGISTRATION_H_
 
 #include <string.h>
 
@@ -55,4 +55,4 @@ static_assert(false, "ops_to_register.h must define SHOULD_REGISTER macros");
 #define SHOULD_REGISTER_OP_KERNEL(clz) true
 #endif
 
-#endif  // TENSORFLOW_FRAMEWORK_SELECTIVE_REGISTRATION_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_SELECTIVE_REGISTRATION_H_
diff --git a/tensorflow/core/framework/session_state.h b/tensorflow/core/framework/session_state.h
index 653a661dd2..63568685f2 100644
--- a/tensorflow/core/framework/session_state.h
+++ b/tensorflow/core/framework/session_state.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_SESSION_STATE_H_
-#define TENSORFLOW_FRAMEWORK_SESSION_STATE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_SESSION_STATE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_SESSION_STATE_H_
 
 #include <string>
 #include <unordered_map>
@@ -90,4 +90,4 @@ class TensorStore {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_SESSION_STATE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_SESSION_STATE_H_
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index a82beb7e8f..516afa517d 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -617,13 +617,13 @@ bool Tensor::IsInitialized() const {
 }
 
 void Tensor::CheckType(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype)
+  CHECK_EQ(dtype(), expected_dtype) << " "
       << DataTypeString(expected_dtype) << " expected, got "
       << DataTypeString(dtype());
 }
 
 void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype)
+  CHECK_EQ(dtype(), expected_dtype) << " "
       << DataTypeString(expected_dtype) << " expected, got "
       << DataTypeString(dtype());
   CHECK(IsAligned()) << "ptr = " << base<void>();
diff --git a/tensorflow/core/framework/tensor_slice.h b/tensorflow/core/framework/tensor_slice.h
index 6019737342..82f21fb17e 100644
--- a/tensorflow/core/framework/tensor_slice.h
+++ b/tensorflow/core/framework/tensor_slice.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_TENSOR_SLICE_H_
-#define TENSORFLOW_FRAMEWORK_TENSOR_SLICE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_SLICE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_SLICE_H_
 
 #include <string>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -221,4 +221,4 @@ void TensorSlice::FillIndicesAndSizes(
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_TENSOR_SLICE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_TENSOR_SLICE_H_
diff --git a/tensorflow/core/framework/tensor_types.h b/tensorflow/core/framework/tensor_types.h
index a5c1a56bfc..6f981db189 100644
--- a/tensorflow/core/framework/tensor_types.h
+++ b/tensorflow/core/framework/tensor_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_TENSOR_TYPES_H_
-#define TENSORFLOW_FRAMEWORK_TENSOR_TYPES_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_TYPES_H_
+#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_TYPES_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -123,4 +123,4 @@ To32Bit(TensorType in) {
 }
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_FRAMEWORK_TENSOR_TYPES_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_TENSOR_TYPES_H_
diff --git a/tensorflow/core/framework/tensor_util.h b/tensorflow/core/framework/tensor_util.h
index 43d2d95311..4bda8f9eb8 100644
--- a/tensorflow/core/framework/tensor_util.h
+++ b/tensorflow/core/framework/tensor_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_TENSOR_UTIL_H_
-#define TENSORFLOW_FRAMEWORK_TENSOR_UTIL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -160,4 +160,4 @@ CreateTensorProto(const std::vector<Type>& values,
 }  // namespace tensor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_TENSOR_UTIL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index 661c28969e..5eafce662e 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_TRACKING_ALLOCATOR_H_
-#define TENSORFLOW_FRAMEWORK_TRACKING_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_TRACKING_ALLOCATOR_H_
+#define TENSORFLOW_CORE_FRAMEWORK_TRACKING_ALLOCATOR_H_
 
 #include <unordered_map>
 #include "tensorflow/core/framework/allocator.h"
@@ -130,4 +130,4 @@ class TrackingAllocator : public Allocator {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_TRACKING_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_TRACKING_ALLOCATOR_H_
diff --git a/tensorflow/core/framework/type_index.h b/tensorflow/core/framework/type_index.h
index b978d90fa8..989fc42e26 100644
--- a/tensorflow/core/framework/type_index.h
+++ b/tensorflow/core/framework/type_index.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_TYPE_INDEX_H_
-#define TENSORFLOW_FRAMEWORK_TYPE_INDEX_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPE_INDEX_H_
+#define TENSORFLOW_CORE_FRAMEWORK_TYPE_INDEX_H_
 
 #include <string>
 #if defined(__GXX_RTTI) || defined(_CPPRTTI)
@@ -84,4 +84,4 @@ inline TypeIndex MakeTypeIndex() {
 #endif  // __GXX_RTTI
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_TYPE_INDEX_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_TYPE_INDEX_H_
diff --git a/tensorflow/core/framework/type_traits.h b/tensorflow/core/framework/type_traits.h
index e8351e494f..96fbf92938 100644
--- a/tensorflow/core/framework/type_traits.h
+++ b/tensorflow/core/framework/type_traits.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_TYPE_TRAITS_H_
-#define TENSORFLOW_FRAMEWORK_TYPE_TRAITS_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPE_TRAITS_H_
+#define TENSORFLOW_CORE_FRAMEWORK_TYPE_TRAITS_H_
 
 #include <limits>
 #include <utility>
@@ -106,4 +106,4 @@ struct is_signed<tensorflow::qint32> : public is_signed<tensorflow::int32> {};
 
 }  // namespace std
 
-#endif  // TENSORFLOW_FRAMEWORK_TYPE_TRAITS_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_TYPE_TRAITS_H_
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index ff7c9855d6..15b1add2c1 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_TYPES_H_
-#define TENSORFLOW_FRAMEWORK_TYPES_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_TYPES_H_
+#define TENSORFLOW_CORE_FRAMEWORK_TYPES_H_
 
 #include <map>
 #include <set>
@@ -481,4 +481,4 @@ bool DataTypeAlwaysOnHost(DataType dt);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_TYPES_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_TYPES_H_
diff --git a/tensorflow/core/framework/variant.h b/tensorflow/core/framework/variant.h
index c02391dae3..52732801a0 100644
--- a/tensorflow/core/framework/variant.h
+++ b/tensorflow/core/framework/variant.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_VARIANT_H_
-#define TENSORFLOW_FRAMEWORK_VARIANT_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_VARIANT_H_
+#define TENSORFLOW_CORE_FRAMEWORK_VARIANT_H_
 
 #include <functional>
 #include <iostream>
@@ -351,4 +351,4 @@ const void* Variant::get() const;
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_VARIANT_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_VARIANT_H_
diff --git a/tensorflow/core/framework/variant_encode_decode.h b/tensorflow/core/framework/variant_encode_decode.h
index ded04b2a30..f155aa4892 100644
--- a/tensorflow/core/framework/variant_encode_decode.h
+++ b/tensorflow/core/framework/variant_encode_decode.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
-#define TENSORFLOW_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
 
 #include <iostream>
 #include <type_traits>
@@ -271,4 +271,4 @@ bool DecodeVariantList(std::unique_ptr<port::StringListDecoder> d,
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_VARIANT_ENCODE_DECODE_H_
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index c9e8dd2217..e6a2665a56 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_VARIANT_OP_REGISTRY_H_
-#define TENSORFLOW_FRAMEWORK_VARIANT_OP_REGISTRY_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_VARIANT_OP_REGISTRY_H_
+#define TENSORFLOW_CORE_FRAMEWORK_VARIANT_OP_REGISTRY_H_
 
 #include <string>
 #include <unordered_set>
@@ -580,4 +580,4 @@ class UnaryVariantBinaryOpRegistration {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_VARIANT_OP_REGISTRY_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_VARIANT_OP_REGISTRY_H_
diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h
index 1d87bc341a..7500e77d43 100644
--- a/tensorflow/core/framework/variant_tensor_data.h
+++ b/tensorflow/core/framework/variant_tensor_data.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_VARIANT_TENSOR_DATA_H
-#define TENSORFLOW_FRAMEWORK_VARIANT_TENSOR_DATA_H
+#ifndef TENSORFLOW_CORE_FRAMEWORK_VARIANT_TENSOR_DATA_H_
+#define TENSORFLOW_CORE_FRAMEWORK_VARIANT_TENSOR_DATA_H_
 
 #include <algorithm>
 #include <vector>
@@ -112,4 +112,4 @@ string ProtoDebugString(const VariantTensorData& object);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_VARIANT_TENSOR_DATA_H
+#endif  // TENSORFLOW_CORE_FRAMEWORK_VARIANT_TENSOR_DATA_H_
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 5bbbc6f6dc..45f8a29a92 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_ALGORITHM_H_
-#define TENSORFLOW_GRAPH_ALGORITHM_H_
+#ifndef TENSORFLOW_CORE_GRAPH_ALGORITHM_H_
+#define TENSORFLOW_CORE_GRAPH_ALGORITHM_H_
 
 #include <functional>
 #include <unordered_set>
@@ -117,4 +117,4 @@ bool FixupSourceAndSinkEdges(Graph* g);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_ALGORITHM_H_
+#endif  // TENSORFLOW_CORE_GRAPH_ALGORITHM_H_
diff --git a/tensorflow/core/graph/colors.h b/tensorflow/core/graph/colors.h
index c1e1940cac..43d2225571 100644
--- a/tensorflow/core/graph/colors.h
+++ b/tensorflow/core/graph/colors.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_COLORS_H_
-#define TENSORFLOW_GRAPH_COLORS_H_
+#ifndef TENSORFLOW_CORE_GRAPH_COLORS_H_
+#define TENSORFLOW_CORE_GRAPH_COLORS_H_
 
 namespace tensorflow {
 
@@ -26,4 +26,4 @@ const char* ColorFor(int dindex);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_COLORS_H_
+#endif  // TENSORFLOW_CORE_GRAPH_COLORS_H_
diff --git a/tensorflow/core/graph/control_flow.h b/tensorflow/core/graph/control_flow.h
index 548820720b..5abe77f5a1 100644
--- a/tensorflow/core/graph/control_flow.h
+++ b/tensorflow/core/graph/control_flow.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_CONTROL_FLOW_H_
-#define TENSORFLOW_GRAPH_CONTROL_FLOW_H_
+#ifndef TENSORFLOW_CORE_GRAPH_CONTROL_FLOW_H_
+#define TENSORFLOW_CORE_GRAPH_CONTROL_FLOW_H_
 
 #include <vector>
 
@@ -48,4 +48,4 @@ Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_CONTROL_FLOW_H_
+#endif  // TENSORFLOW_CORE_GRAPH_CONTROL_FLOW_H_
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index 9b703e4693..2d94dd5cdc 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_COSTMODEL_H_
-#define TENSORFLOW_GRAPH_COSTMODEL_H_
+#ifndef TENSORFLOW_CORE_GRAPH_COSTMODEL_H_
+#define TENSORFLOW_CORE_GRAPH_COSTMODEL_H_
 
 #include <unordered_map>
 #include <vector>
@@ -229,4 +229,4 @@ class CostModel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_COSTMODEL_H_
+#endif  // TENSORFLOW_CORE_GRAPH_COSTMODEL_H_
diff --git a/tensorflow/core/graph/default_device.h b/tensorflow/core/graph/default_device.h
index 68d7c8e553..f0f53c91f4 100644
--- a/tensorflow/core/graph/default_device.h
+++ b/tensorflow/core/graph/default_device.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_DEFAULT_DEVICE_H_
-#define TENSORFLOW_GRAPH_DEFAULT_DEVICE_H_
+#ifndef TENSORFLOW_CORE_GRAPH_DEFAULT_DEVICE_H_
+#define TENSORFLOW_CORE_GRAPH_DEFAULT_DEVICE_H_
 
 #include <string>
 
@@ -38,4 +38,4 @@ inline void SetDefaultDevice(const string& device, GraphDef* graph_def) {
 }  // namespace graph
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_DEFAULT_DEVICE_H_
+#endif  // TENSORFLOW_CORE_GRAPH_DEFAULT_DEVICE_H_
diff --git a/tensorflow/core/graph/gradients.cc b/tensorflow/core/graph/gradients.cc
index c1a8a63784..bec41712b1 100644
--- a/tensorflow/core/graph/gradients.cc
+++ b/tensorflow/core/graph/gradients.cc
@@ -65,16 +65,37 @@ struct NodeOutEq {
 static Node* AddZerosLike(Graph* g, NodeOut input) {
   DCHECK_LT(0, input.dtype());
   DCHECK_LT(input.dtype(), DT_FLOAT_REF);
-  NodeDef ndef;
-  ndef.set_name(g->NewName(kNodeLabel));
-  ndef.set_op("ZerosLike");
-  ndef.add_input(input.name());
-  AddNodeAttr("T", input.dtype(), &ndef);
-  Status s;
-  Node* ret = g->AddNode(ndef, &s);
-  TF_CHECK_OK(s);
-  g->AddEdge(input.node, input.index, ret, 0);
-  return ret;
+  if (input.dtype() == DT_RESOURCE) {
+    NodeDef read_def;
+    read_def.set_name(g->NewName("Read"));
+    read_def.set_op("ReadVariableOp");
+    read_def.add_input(input.name());
+    AddNodeAttr("dtype", DT_FLOAT, &read_def);
+    Status s;
+    Node* read = g->AddNode(read_def, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(input.node, input.index, read, 0);
+    NodeDef ndef;
+    ndef.set_name(g->NewName(kNodeLabel));
+    ndef.set_op("ZerosLike");
+    ndef.add_input(read_def.name());
+    AddNodeAttr("T", DT_FLOAT, &ndef);
+    Node* ret = g->AddNode(ndef, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(read, 0, ret, 0);
+    return ret;
+  } else {
+    NodeDef ndef;
+    ndef.set_name(g->NewName(kNodeLabel));
+    ndef.set_op("ZerosLike");
+    ndef.add_input(input.name());
+    AddNodeAttr("T", input.dtype(), &ndef);
+    Status s;
+    Node* ret = g->AddNode(ndef, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(input.node, input.index, ret, 0);
+    return ret;
+  }
 }
 
 static Node* AddSymGrad(Graph* g, Node* n, gtl::ArraySlice<NodeOut> grads) {
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index 889359a68a..f6e41faf9c 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_GRAPH_CONSTRUCTOR_H_
-#define TENSORFLOW_GRAPH_GRAPH_CONSTRUCTOR_H_
+#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_CONSTRUCTOR_H_
+#define TENSORFLOW_CORE_GRAPH_GRAPH_CONSTRUCTOR_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
@@ -186,4 +186,4 @@ extern void CopyGraph(const Graph& src, Graph* dest);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_GRAPH_CONSTRUCTOR_H_
+#endif  // TENSORFLOW_CORE_GRAPH_GRAPH_CONSTRUCTOR_H_
diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h
index 0d6aae4355..ec131580ae 100644
--- a/tensorflow/core/graph/graph_def_builder.h
+++ b/tensorflow/core/graph/graph_def_builder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_GRAPH_DEF_BUILDER_H_
-#define TENSORFLOW_GRAPH_GRAPH_DEF_BUILDER_H_
+#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_H_
+#define TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_H_
 
 #include <vector>
 #include "tensorflow/core/framework/graph.pb.h"
@@ -203,4 +203,4 @@ Node* BinaryOp(const string& op_name, NodeOut a, NodeOut b,
 }  // namespace ops
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_GRAPH_DEF_BUILDER_H_
+#endif  // TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_H_
diff --git a/tensorflow/core/graph/graph_partition.h b/tensorflow/core/graph/graph_partition.h
index 67fafddd51..8020c2d247 100644
--- a/tensorflow/core/graph/graph_partition.h
+++ b/tensorflow/core/graph/graph_partition.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_GRAPH_PARTITION_H_
-#define TENSORFLOW_GRAPH_GRAPH_PARTITION_H_
+#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_PARTITION_H_
+#define TENSORFLOW_CORE_GRAPH_GRAPH_PARTITION_H_
 
 #include <functional>
 #include <string>
@@ -95,4 +95,4 @@ Status AddControlEdges(const PartitionOptions& opts,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_GRAPH_PARTITION_H_
+#endif  // TENSORFLOW_CORE_GRAPH_GRAPH_PARTITION_H_
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 5683944e46..833592caab 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -2418,6 +2418,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
     csinfo_.conv2d_grad_filter_with_bias =
         "__MklDummyConv2DBackpropFilterWithBias";
+    csinfo_.conv3d = "Conv3D";
+    csinfo_.conv3d_grad_input = "Conv3DBackpropInputV2";
+    csinfo_.conv3d_grad_filter = "Conv3DBackpropFilterV2";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
     csinfo_.identity = "Identity";
@@ -2468,18 +2471,27 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsConcatV2, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d),
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias,
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias,
-                      csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv2D,
+                      csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv,
                       AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_input,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      CopyAttrsConv, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv3d,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv3d),
+                      CopyAttrsConv, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv3d_grad_filter,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv3d_grad_filter),
+                      CopyAttrsConv, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv3d_grad_input,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv3d_grad_input),
+                      CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite});
@@ -2614,6 +2626,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string conv2d_grad_input;
     string conv2d_grad_filter;
     string conv2d_grad_filter_with_bias;
+    string conv3d;
+    string conv3d_grad_input;
+    string conv3d_grad_filter;
     string fused_batch_norm;
     string fused_batch_norm_grad;
     string identity;
@@ -3086,7 +3101,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
   static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
@@ -3571,14 +3586,13 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
 // Op-specific functions to copy attributes from old node to new node
 //////////////////////////////////////////////////////////////////////////
 
-void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
-                                           NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
+                                         NodeBuilder* nb) {
   DataType T;
   string data_format;
   string padding;
   std::vector<int32> strides;
   std::vector<int32> dilations;
-  bool use_cudnn_on_gpu;
 
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
@@ -3586,8 +3600,6 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-  TF_CHECK_OK(
-      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -3595,7 +3607,6 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
   nb->Attr("dilations", dilations);
   nb->Attr("padding", padding);
   nb->Attr("data_format", data_format);
-  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
 }
 
 void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
@@ -3896,7 +3907,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
 
   // Copy attributes from Conv2D to Conv2DWithBias.
-  CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
+  CopyAttrsConv(const_cast<const Node*>(pred), &nb);
 
   // Copy the device assigned to old node to new node.
   nb.Device(succ->def().device());
@@ -4007,7 +4018,7 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
   }
 
   // Copy attributes from Conv2DBackpropFilter.
-  CopyAttrsConv2D(const_cast<const Node*>(fltr), &nb);
+  CopyAttrsConv(const_cast<const Node*>(fltr), &nb);
 
   // Copy the device assigned to old node to new node.
   nb.Device(fltr->def().device());
diff --git a/tensorflow/core/graph/mkl_layout_pass.h b/tensorflow/core/graph/mkl_layout_pass.h
index ffe5c1ecfc..e7175149df 100644
--- a/tensorflow/core/graph/mkl_layout_pass.h
+++ b/tensorflow/core/graph/mkl_layout_pass.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // A graph pass that rewrites graph for propagating MKL layout as a tensor
 
-#ifndef TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
-#define TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
+#ifndef TENSORFLOW_CORE_GRAPH_MKL_LAYOUT_PASS_H_
+#define TENSORFLOW_CORE_GRAPH_MKL_LAYOUT_PASS_H_
 
 #ifdef INTEL_MKL
 
@@ -33,4 +33,4 @@ extern bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g);
 
 #endif
 
-#endif  // TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_
+#endif  // TENSORFLOW_CORE_GRAPH_MKL_LAYOUT_PASS_H_
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index f6b7b5674b..4727ee7b56 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_NODE_BUILDER_H_
-#define TENSORFLOW_GRAPH_NODE_BUILDER_H_
+#ifndef TENSORFLOW_CORE_GRAPH_NODE_BUILDER_H_
+#define TENSORFLOW_CORE_GRAPH_NODE_BUILDER_H_
 
 #include <vector>
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -160,4 +160,4 @@ NodeBuilder& NodeBuilder::Attr(StringPiece attr_name,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_NODE_BUILDER_H_
+#endif  // TENSORFLOW_CORE_GRAPH_NODE_BUILDER_H_
diff --git a/tensorflow/core/graph/optimizer_cse.h b/tensorflow/core/graph/optimizer_cse.h
index b8f3230c70..ef466fb788 100644
--- a/tensorflow/core/graph/optimizer_cse.h
+++ b/tensorflow/core/graph/optimizer_cse.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // An optimization pass that performs common subexpression elimination.
 
-#ifndef TENSORFLOW_GRAPH_OPTIMIZER_CSE_H_
-#define TENSORFLOW_GRAPH_OPTIMIZER_CSE_H_
+#ifndef TENSORFLOW_CORE_GRAPH_OPTIMIZER_CSE_H_
+#define TENSORFLOW_CORE_GRAPH_OPTIMIZER_CSE_H_
 
 #include <sys/types.h>
 #include "tensorflow/core/graph/graph.h"
@@ -34,4 +34,4 @@ extern bool OptimizeCSE(Graph* g,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_OPTIMIZER_CSE_H_
+#endif  // TENSORFLOW_CORE_GRAPH_OPTIMIZER_CSE_H_
diff --git a/tensorflow/core/graph/quantize_training.h b/tensorflow/core/graph/quantize_training.h
index 2bb4ee1cf0..dc3d7e3b1f 100644
--- a/tensorflow/core/graph/quantize_training.h
+++ b/tensorflow/core/graph/quantize_training.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
-#define TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
+#ifndef TENSORFLOW_CORE_GRAPH_QUANTIZE_TRAINING_H_
+#define TENSORFLOW_CORE_GRAPH_QUANTIZE_TRAINING_H_
 
 #include "tensorflow/core/graph/graph.h"
 
@@ -53,4 +53,4 @@ Status DoQuantizeTrainingOnGraphDef(const GraphDef& input_graphdef,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
+#endif  // TENSORFLOW_CORE_GRAPH_QUANTIZE_TRAINING_H_
diff --git a/tensorflow/core/graph/subgraph.h b/tensorflow/core/graph/subgraph.h
index ba35846d93..3e99ff0c8c 100644
--- a/tensorflow/core/graph/subgraph.h
+++ b/tensorflow/core/graph/subgraph.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_SUBGRAPH_H_
-#define TENSORFLOW_GRAPH_SUBGRAPH_H_
+#ifndef TENSORFLOW_CORE_GRAPH_SUBGRAPH_H_
+#define TENSORFLOW_CORE_GRAPH_SUBGRAPH_H_
 
 #include <string>
 
@@ -162,4 +162,4 @@ class SendFetchRewrite : public PruneRewrite {
 }  // namespace subgraph
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_SUBGRAPH_H_
+#endif  // TENSORFLOW_CORE_GRAPH_SUBGRAPH_H_
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 67b252cb6c..ea7788f654 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -21,39 +21,14 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
-#include "tensorflow/core/kernels/constant_op.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
-
-// HostConst: forced to generate output on the host.
-// Only used by testlib; no op is registered for this kernel
-// externally (i.e., in array_ops.cc)
-REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), HostConstantOp);
-REGISTER_KERNEL_BUILDER(
-    Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), HostConstantOp);
-#ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(
-    Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp);
-#endif  // TENSORFLOW_USE_SYCL
-
-// Register the HostConst Op
-// Returns a constant tensor on the host.  Useful for writing C++ tests
-// and benchmarks which run on GPU but require arguments pinned to the host.
-// Used by test::graph::HostConstant.
-// value: Attr `value` is the tensor to return.
-REGISTER_OP("HostConst")
-    .Output("output: dtype")
-    .Attr("value: tensor")
-    .Attr("dtype: type")
-    .SetShapeFn(shape_inference::UnknownShape);
-
 namespace test {
 namespace graph {
 
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index eb9038d619..8585b35a19 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // DEPRECATED: Use the C++ API defined in tensorflow/cc instead.
 
-#ifndef TENSORFLOW_GRAPH_TESTLIB_H_
-#define TENSORFLOW_GRAPH_TESTLIB_H_
+#ifndef TENSORFLOW_CORE_GRAPH_TESTLIB_H_
+#define TENSORFLOW_CORE_GRAPH_TESTLIB_H_
 
 #include <string>
 #include <vector>
@@ -213,4 +213,4 @@ Node* DiagPart(Graph* g, Node* in, DataType type);
 }  // end namespace test
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_TESTLIB_H_
+#endif  // TENSORFLOW_CORE_GRAPH_TESTLIB_H_
diff --git a/tensorflow/core/graph/types.h b/tensorflow/core/graph/types.h
index c707809927..ac5a7f8229 100644
--- a/tensorflow/core/graph/types.h
+++ b/tensorflow/core/graph/types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_TYPES_H_
-#define TENSORFLOW_GRAPH_TYPES_H_
+#ifndef TENSORFLOW_CORE_GRAPH_TYPES_H_
+#define TENSORFLOW_CORE_GRAPH_TYPES_H_
 
 #include "tensorflow/core/lib/gtl/int_type.h"
 #include "tensorflow/core/platform/types.h"
@@ -32,4 +32,4 @@ TF_LIB_GTL_DEFINE_INT_TYPE(Bytes, int64);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_TYPES_H_
+#endif  // TENSORFLOW_CORE_GRAPH_TYPES_H_
diff --git a/tensorflow/core/graph/while_context.h b/tensorflow/core/graph/while_context.h
index 2a83eb7bd8..5405e62be2 100644
--- a/tensorflow/core/graph/while_context.h
+++ b/tensorflow/core/graph/while_context.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_GRAPH_WHILE_CONTEXT_H_
-#define TENSORFLOW_GRAPH_WHILE_CONTEXT_H_
+#ifndef TENSORFLOW_CORE_GRAPH_WHILE_CONTEXT_H_
+#define TENSORFLOW_CORE_GRAPH_WHILE_CONTEXT_H_
 
 #include "tensorflow/core/graph/graph.h"
 
@@ -73,4 +73,4 @@ class WhileContext {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_GRAPH_GRAPH_H_
+#endif  // TENSORFLOW_CORE_GRAPH_WHILE_CONTEXT_H_
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index 12e3e46f65..f543dca49e 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -45,6 +45,8 @@ VirtualCluster::VirtualCluster(const DeviceSet* device_set)
   for (const auto& device : device_set_->devices()) {
     DeviceProperties props = GetDeviceInfo(device->parsed_name());
     if (props.type() == "UNKNOWN") continue;
+    auto attrs = device->attributes();
+    props.set_memory_size(attrs.memory_limit());
     devices_[device->name()] = props;
   }
 }
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index a60e3c7a9f..0690640ffa 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <limits>
 #include <unordered_map>
 
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/types.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
diff --git a/tensorflow/core/grappler/costs/graph_memory.cc b/tensorflow/core/grappler/costs/graph_memory.cc
index a5736d40b1..b01aca610a 100644
--- a/tensorflow/core/grappler/costs/graph_memory.cc
+++ b/tensorflow/core/grappler/costs/graph_memory.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 231c7c63be..6710ff9df3 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -804,8 +805,9 @@ class SymbolicShapeRefiner {
         CHECK_NOTNULL(function_library_.Find(function_node->op()));
 
     GrapplerFunctionItem grappler_function_item;
-    TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
-        *function_def, function_library_, &grappler_function_item));
+    TF_RETURN_IF_ERROR(
+        MakeGrapplerFunctionItem(*function_def, function_library_,
+                                 graph_def_version_, &grappler_function_item));
 
     if (grappler_function_item.inputs().size() > function_node->input_size()) {
       return errors::FailedPrecondition(
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 5acfb56b05..8938b7c32e 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -18,8 +18,10 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
@@ -783,6 +785,46 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
   EXPECT_EQ("float: [128,256]", PropToString(prop));
 }
 
+TEST_F(GraphPropertiesTest, FunctionWithScalarInputTest) {
+  // Create graph with a function that takes a scalar value so that we use
+  // Placeholder with scalar as for input to the function shape inference.
+  // Placeholder -> Identity -> MyFunc, where MyFunc simply takes Identity of
+  // the input; all tensors are scalars.
+  FunctionDefLibrary library;
+  *library.add_function() = FunctionDefHelper::Create(
+      "MyFunc",                                                   // Name
+      {"x: float"},                                               // Inputs
+      {"out: float"},                                             // Outputs
+      {},                                                         // Attrs
+      {{{"a"}, "Identity", {"x"}, {{"T", DataType::DT_FLOAT}}}},  // Nodes
+      {{"out", "a:output:0"}});                                   // Returns
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  TF_CHECK_OK(s.graph()->AddFunctionLibrary(library));
+  Output placeholder =
+      ops::Placeholder(s.WithOpName("Placeholder"), DataType::DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({})));
+  Output identity = ops::Identity(s.WithOpName("Identity"), placeholder);
+  auto _identity = tensorflow::ops::AsNodeOut(s, identity);
+  auto builder =
+      tensorflow::NodeBuilder("MyFunc", "MyFunc", s.graph()->op_registry());
+  tensorflow::Node* func_op;
+  TF_CHECK_OK(builder.Input(_identity).Finalize(s.graph(), &func_op));
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Tensorflow version < 21 infers output shape of Placeholder with empty shape
+  // as unknown, instead of scalar.
+  EXPECT_GT(item.graph.versions().producer(), 21);
+
+  // MyFunc output shouldn't be unknown rank.
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+  const auto out_props = properties.GetOutputProperties("MyFunc");
+  const OpInfo::TensorProperties out_prop0 = out_props[0];
+  EXPECT_EQ(DT_FLOAT, out_prop0.dtype());
+  EXPECT_FALSE(out_prop0.shape().unknown_rank());
+}
+
 TEST_F(GraphPropertiesTest, SimpleFunctionStaticShapeInference) {
   // Test graph produced in python using:
   /*
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 6406a4bdbf..71f4d9fd05 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
@@ -175,14 +176,24 @@ int64 CwiseOutputElementCount(const TensorShapeProto& input_shape_1,
 TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
                                       int rank, bool* found_unknown_shapes) {
   auto shape = original_shape;
-  if (shape.unknown_rank() || shape.dim_size() < rank) {
+  bool is_scalar = !shape.unknown_rank() && shape.dim_size() == 0;
+
+  if (shape.unknown_rank() || (!is_scalar && shape.dim_size() < rank)) {
     *found_unknown_shapes = true;
-    TensorShapeProto::Dim dim;
     VLOG(2) << "Use minimum shape because the rank is unknown.";
     // The size of each dimension is at least 1, if unknown.
-    dim.set_size(1);
+    for (int i = shape.dim_size(); i < rank; i++) {
+      shape.add_dim()->set_size(1);
+    }
+  } else if (is_scalar) {
+    for (int i = 0; i < rank; i++) {
+      shape.add_dim()->set_size(1);
+    }
+  } else if (shape.dim_size() > rank) {
+    *found_unknown_shapes = true;
+    shape.clear_dim();
     for (int i = 0; i < rank; i++) {
-      *shape.add_dim() = dim;
+      shape.add_dim()->set_size(original_shape.dim(i).size());
     }
   } else {
     for (int i = 0; i < shape.dim_size(); i++) {
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 7271a29319..998bd59dce 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
@@ -1126,5 +1127,77 @@ TEST_F(OpLevelCostEstimatorTest, PredictFusedBatchNormGrad) {
     EXPECT_EQ(0, costs.num_ops_with_unknown_shapes);
   }
 }
+
+TEST_F(OpLevelCostEstimatorTest, MaybeGetMinimumShape) {
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(true);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 4, &unknown_shapes);
+    EXPECT_TRUE(unknown_shapes);
+    ExpectTensorShape({1, 1, 1, 1}, y);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 1, &unknown_shapes);
+    EXPECT_FALSE(unknown_shapes);
+    ExpectTensorShape({1}, y);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 2, &unknown_shapes);
+    EXPECT_FALSE(unknown_shapes);
+    ExpectTensorShape({1, 1}, y);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    x.add_dim()->set_size(10);
+    x.add_dim()->set_size(20);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 2, &unknown_shapes);
+    EXPECT_FALSE(unknown_shapes);
+    ExpectTensorShape({10, 20}, y);
+
+    unknown_shapes = false;
+    TensorShapeProto z = MaybeGetMinimumShape(x, 4, &unknown_shapes);
+    EXPECT_TRUE(unknown_shapes);
+    EXPECT_EQ(4, z.dim_size());
+    ExpectTensorShape({10, 20, 1, 1}, z);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    x.add_dim()->set_size(10);
+    x.add_dim()->set_size(20);
+    x.add_dim()->set_size(-1);
+    x.add_dim()->set_size(20);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 4, &unknown_shapes);
+    EXPECT_TRUE(unknown_shapes);
+    ExpectTensorShape({10, 20, 1, 20}, y);
+  }
+
+  {
+    TensorShapeProto x;
+    x.set_unknown_rank(false);
+    x.add_dim()->set_size(10);
+    x.add_dim()->set_size(20);
+    x.add_dim()->set_size(30);
+    x.add_dim()->set_size(20);
+    bool unknown_shapes = false;
+    TensorShapeProto y = MaybeGetMinimumShape(x, 2, &unknown_shapes);
+    EXPECT_TRUE(unknown_shapes);
+    ExpectTensorShape({10, 20}, y);
+  }
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index be54d98534..aad00ce039 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -99,7 +99,7 @@ static void ExtractExtraProperties(
       continue;
     }
     TensorId input_tensor_id = ParseTensorName(input_name);
-    const string input_node_name = input_tensor_id.first.ToString();
+    const string input_node_name(input_tensor_id.first);
 
     auto iter = name_to_node.find(input_node_name);
     if (iter == name_to_node.end()) continue;
@@ -172,7 +172,7 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
   for (const auto& input_name : node.input()) {
     CHECK(!input_name.empty());
     TensorId input_tensor_id = ParseTensorName(input_name);
-    const string input_node_name = input_tensor_id.first.ToString();
+    const string input_node_name(input_tensor_id.first);
     const int output_index = input_tensor_id.second;
 
     // Skip control inputs.
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 6e3ebdee12..037a823096 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -880,10 +880,15 @@ Costs VirtualScheduler::Summary() const {
   // Print per device summary
   VLOG(1) << "Devices:";
   Costs critical_path_costs = Costs::ZeroCosts();
+  std::vector<string> device_names;
+  device_names.reserve(device_.size());
+  for (auto& it : device_) {
+    device_names.push_back(it.first);
+  }
+  std::sort(device_names.begin(), device_names.end());
 
-  for (const auto& device : device_) {
-    const auto& name = device.first;
-    const auto& state = device.second;
+  for (const auto& name : device_names) {
+    const auto& state = device_.at(name);
 
     std::map<string, int64> op_to_memory;
     // First profile only persistent memory usage.
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index b1373d8317..02a379fca8 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 288587ce9b..029515ad3c 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/variable.pb.h"
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index caaa5ac8db..70ad9f9a9b 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -110,10 +110,10 @@ cc_library(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "constant_folding_test",
     srcs = ["constant_folding_test.cc"],
-    shard_count = 5,
+    tags = ["requires-gpu-sm35"],
     deps = [
         ":constant_folding",
         "//tensorflow/cc:cc_ops",
@@ -827,11 +827,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
-        "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/costs:graph_properties",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 889445bbd6..4fb2fe6883 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index f2ac3a44c0..815bd23307 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -852,7 +852,19 @@ DataType GetDataTypeFromNodeOrProps(const NodeDef& node,
   }
   return dtype;
 }
-
+bool IsValidConstShapeForNCHW(const TensorShapeProto& shape) {
+  if (shape.dim_size() != 4) {
+    return false;
+  }
+  int num_dim_larger_than_one = 0;
+  for (const auto& dim : shape.dim()) {
+    if (dim.size() > 1) ++num_dim_larger_than_one;
+  }
+  return num_dim_larger_than_one <= 1;
+}
+const string& GetShape(const NodeDef& node) {
+  return node.attr().at("data_format").s();
+}
 }  // namespace
 
 // static
@@ -1699,7 +1711,7 @@ Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node,
     return Status::OK();
   }
 
-  if (MulConvPushDown(node, *properties)) {
+  if (MulConvPushDown(*properties, optimized_graph, node)) {
     graph_modified_ = true;
     return Status::OK();
   }
@@ -2541,8 +2553,9 @@ bool ConstantFolding::ConstantPushDown(NodeDef* node) {
   return false;
 }
 
-bool ConstantFolding::MulConvPushDown(NodeDef* node,
-                                      const GraphProperties& properties) {
+bool ConstantFolding::MulConvPushDown(const GraphProperties& properties,
+                                      GraphDef* optimized_graph,
+                                      NodeDef* node) {
   // Push down multiplication on ConvND.
   //                       *                  ConvND
   //                     /   \                /    \
@@ -2618,12 +2631,14 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node,
     }
     const auto& const_shape = const_props[0].shape();
 
-    TensorShapeProto new_filter_shape;
-    if (!ShapeAfterBroadcast(filter_shape, const_shape, &new_filter_shape)) {
-      return false;
-    }
-    if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
-      return false;
+    if (GetShape(*conv_node) == "NHWC") {
+      TensorShapeProto new_filter_shape;
+      if (!ShapeAfterBroadcast(filter_shape, const_shape, &new_filter_shape)) {
+        return false;
+      }
+      if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
+        return false;
+      }
     }
 
     string mul_new_name =
@@ -2657,6 +2672,69 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node,
     }
     node_map_->AddNode(mul_new_name, node);
 
+    if (GetShape(*conv_node) == "NCHW") {
+      if (const_node->attr().at("value").tensor().tensor_shape().dim_size() <=
+          1) {
+        // Broadcast should work for scalar or 1D. No need to reshape.
+        return true;
+      }
+      if (!IsValidConstShapeForNCHW(
+              const_node->attr().at("value").tensor().tensor_shape())) {
+        return false;
+      }
+      // Adds Const node for Reshape.
+      auto* shape_const_node = optimized_graph->add_node();
+      const string shape_const_node_name =
+          OptimizedNodeName(*const_node, "_new_shape");
+      shape_const_node->set_name(shape_const_node_name);
+      shape_const_node->set_op("Const");
+      shape_const_node->set_device(const_node->device());
+      (*shape_const_node->mutable_attr())["dtype"].set_type(DT_INT32);
+      Tensor t(DT_INT32, {4});
+      t.flat<int32>()(0) = 1;
+      t.flat<int32>()(1) = 1;
+      t.flat<int32>()(2) = 1;
+      t.flat<int32>()(3) = const_node->attr()
+                               .at("value")
+                               .tensor()
+                               .tensor_shape()
+                               .dim(1)  // IsValidConstShapeForNCHW guarantees
+                                        // dim 1 is the dim to reshape
+                               .size();
+      t.AsProtoTensorContent(
+          (*shape_const_node->mutable_attr())["value"].mutable_tensor());
+      node_map_->AddNode(shape_const_node_name, shape_const_node);
+
+      // Adds Reshape node.
+      auto* reshape_node = optimized_graph->add_node();
+      const string reshape_node_name =
+          OptimizedNodeName(*const_node, "_reshape");
+      reshape_node->set_op("Reshape");
+      reshape_node->set_name(reshape_node_name);
+      reshape_node->set_device(const_node->device());
+      (*reshape_node->mutable_attr())["T"].set_type(
+          const_node->attr().at("dtype").type());
+      (*reshape_node->mutable_attr())["Tshape"].set_type(DT_INT32);
+      node_map_->AddNode(reshape_node_name, reshape_node);
+
+      // const_node -> reshape_node
+      node_map_->RemoveOutput(const_node->name(), node->name());
+      *reshape_node->add_input() = const_node->name();
+      node_map_->AddOutput(const_node->name(), reshape_node_name);
+
+      // shape_const_node -> reshape_node
+      *reshape_node->add_input() = shape_const_node_name;
+      node_map_->AddOutput(shape_const_node_name, reshape_node_name);
+
+      // reshape_node -> node (Mul)
+      node_map_->AddOutput(reshape_node_name, node->name());
+      if (left_child_is_constant) {
+        node->set_input(0, reshape_node_name);
+      } else {
+        node->set_input(1, reshape_node_name);
+      }
+    }
+
     return true;
   }
   return false;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index b42d5f201e..051dfb681e 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -125,7 +125,8 @@ class ConstantFolding : public GraphOptimizer {
 
   // Aggregate constants present around a conv operator. Returns true if the
   // transformation was applied successfully.
-  bool MulConvPushDown(NodeDef* node, const GraphProperties& properties);
+  bool MulConvPushDown(const GraphProperties& properties,
+                       GraphDef* optimized_graph, NodeDef* node);
 
   // Strength reduces floating point division by a constant Div(x, const) to
   // multiplication by the reciprocal Mul(x, Reciprocal(const)).
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index b9765b9292..0683572dcc 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -240,7 +240,7 @@ TEST_F(ConstantFoldingTest, AddTree) {
   }
 }
 
-TEST_F(ConstantFoldingTest, ConvPushDownTest) {
+TEST_F(ConstantFoldingTest, ConvPushDownTestNHWC) {
   // Tests if the following rewrite is performed:
   //
   //         *                       Conv2D
@@ -3047,6 +3047,143 @@ TEST_F(ConstantFoldingTest, TensorArraySize) {
   test::ExpectTensorEqual<int32>(tensors_expected[1], tensors_actual[1]);
 }
 
+TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) {
+  // Multiplying min() with 0.1 gives a denormal without FTZ and zero with FTZ.
+  // Make sure constant folding behaves the same way as TensorFlow.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a =
+      ops::Const(s.WithOpName("a"), std::numeric_limits<float>::min(), {1});
+  Output b = ops::Const(s.WithOpName("b"), 0.1f, {1});
+  Output c = ops::Mul(s.WithOpName("c"), a, b);
+
+  GrapplerItem item;
+  item.fetch.push_back("c");
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(1, output.node_size());
+
+  const NodeDef& node_d = output.node(0);
+  EXPECT_EQ("c", node_d.name());
+  EXPECT_EQ("Const", node_d.op());
+
+  std::vector<string> fetch = {"c"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+#if GOOGLE_CUDA
+TEST_F(ConstantFoldingTest, ConvPushDownTestNCHW) {
+  // Tests if the following rewrite is performed:
+  //
+  //         *                       Conv2D
+  //        / \                       / \
+  //       c  Conv2D        -->      x  (c * filter)
+  //           / \
+  //          x  filter
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  int input_channel = 1;
+  int output_channel = 2;
+  int filter_size = 1;
+
+  TensorShape filter_shape(
+      {filter_size, filter_size, input_channel, output_channel});
+
+  // Filter shape: [1, 1, 1, 2]
+  // Filter for output channel 0 = {2.f}
+  // Filter for output channel 1 = {-2.f}
+  // clang-format off
+  Output filter =
+      ops::Const(s.WithOpName("filter"), {
+          {
+              {{2.f, -2.f}}
+          }
+      });
+  // clang-format on
+
+  int batch_size = 1;
+  int matrix_size = 3;
+  // input shape: [1,1,3,3]
+  TensorShape input_shape(
+      {batch_size, input_channel, matrix_size, matrix_size});
+  Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                  ops::Placeholder::Shape(input_shape));
+
+  Output conv = ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1},
+                            "VALID", ops::Conv2D::DataFormat("NCHW"));
+  Output c = ops::Const(s.WithOpName("c"), 2.0f, /* shape */ {1, 2, 1, 1});
+  Output mul = ops::Mul(s.WithOpName("mul"), c, conv);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding fold(nullptr);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // Here only op/IO are checked. The values are verified by EvaluateNodes
+  // below.
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "mul") {
+      ++found;
+      EXPECT_EQ("Conv2D", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("conv/merged_input", node.input(1));
+    } else if (node.name() == "conv/merged_input") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(0, node.input_size());
+    }
+  }
+  EXPECT_EQ(2, found);
+
+  // Check that const folded multiplication node has the expected value.
+  std::vector<string> fetch = {"mul"};
+  // Input shape (NCHW) is [1,1,3,3], filter is [1,1,1,2] output shape should be
+  // (NCHW) [1,2,3,3]
+  ::tensorflow::Input::Initializer x{
+      {
+          {
+              {1.f, 2.f, 3.f},  // H = 0
+              {4.f, 5.f, 6.f},  // H = 1
+              {7.f, 8.f, 9.f}   // H = 2
+          }                     // C = 0
+      }                         // N = 0
+  };
+
+  //       |1,2,3|
+  // conv( |4,5,6|,       // input
+  //       |7,8,9|
+  //      [[[2,-2]]])     // filter
+  //    *  [1,2,1,1]      // mul by const
+  //          =
+  //     [
+  //       |4, 8, 12|
+  //       |16,20,24|   ==> output channel 0
+  //       |28,32,36|
+  //
+  //       | -4, -8,-12|
+  //       |-16,-20,-24|   ==> output channel 1
+  //       |-28,-32,-36|
+  //     ]
+  auto actual = EvaluateNodes(output, fetch, {{"x", x.tensor}});
+  auto expected = EvaluateNodes(item.graph, fetch, {{"x", x.tensor}});
+  test::ExpectTensorEqual<float>(expected[0], actual[0]);
+}
+#endif
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index b8e69787e3..979c437c02 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -4,6 +4,44 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
 
 cc_library(
+    name = "filter_fusion",
+    srcs = ["filter_fusion.cc"],
+    hdrs = [
+        "filter_fusion.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_utils",
+        ":fusion_utils",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/kernels:cast_op",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+    ] + tf_protos_all(),
+)
+
+tf_cc_test(
+    name = "filter_fusion_test",
+    srcs = ["filter_fusion_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":filter_fusion",
+        ":graph_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
+)
+
+cc_library(
     name = "function_rename",
     srcs = ["function_rename.cc"],
     hdrs = [
@@ -46,11 +84,13 @@ cc_library(
     deps = [
         ":graph_utils",
         "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:cast_op",
+        "//tensorflow/core/kernels:functional_ops",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
@@ -125,6 +165,43 @@ cc_library(
 )
 
 cc_library(
+    name = "map_vectorization",
+    srcs = ["map_vectorization.cc"],
+    hdrs = [
+        "map_vectorization.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core:lib_internal",
+    ] + tf_protos_all(),
+)
+
+tf_cc_test(
+    name = "map_vectorization_test",
+    srcs = ["map_vectorization_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_utils",
+        ":map_vectorization",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/kernels:cast_op",  # Must be linked for the testlib functions to work.
+    ],
+)
+
+cc_library(
     name = "map_and_batch_fusion",
     srcs = ["map_and_batch_fusion.cc"],
     hdrs = [
@@ -306,11 +383,13 @@ cc_library(
     name = "data",
     visibility = ["//visibility:public"],
     deps = [
+        ":filter_fusion",
         ":function_rename",
         ":latency_all_edges",
         ":map_and_batch_fusion",
         ":map_and_filter_fusion",
         ":map_fusion",
+        ":map_vectorization",
         ":noop_elimination",
         ":shuffle_and_repeat_fusion",
     ],
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
new file mode 100644
index 0000000000..c71aa6e804
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/filter_fusion.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/fusion_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+NodeDef MakeFusedFilterNode(const NodeDef& first_filter_node,
+                            const NodeDef& second_filter_node,
+                            const FunctionDef& fused_function,
+                            MutableGraphView* graph) {
+  NodeDef fused_node;
+  graph_utils::SetUniqueGraphNodeName("fused_filter", graph->GetGraph(),
+                                      &fused_node);
+
+  fused_node.set_op("FilterDataset");
+  fused_node.add_input(first_filter_node.input(0));
+
+  auto copy_attribute = [](const string& attribute_name, const NodeDef& from,
+                           NodeDef* to) {
+    (*to->mutable_attr())[attribute_name] = from.attr().at(attribute_name);
+  };
+
+  auto attr = first_filter_node.attr().at("predicate");
+  *attr.mutable_func()->mutable_name() = fused_function.signature().name();
+  (*fused_node.mutable_attr())["predicate"] = std::move(attr);
+
+  copy_attribute("Targuments", first_filter_node, &fused_node);
+
+  for (auto key : {"output_shapes", "output_types"})
+    copy_attribute(key, second_filter_node, &fused_node);
+
+  return fused_node;
+}
+
+}  // namespace
+
+Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
+                              GraphDef* output) {
+  GraphDef sorted_old_graph = item.graph;
+  TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
+  *output = sorted_old_graph;
+
+  MutableGraphView graph(output);
+  std::set<string> nodes_to_delete;
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
+                                             output->library());
+
+  auto get_filter_node = [](const NodeDef& node) -> const NodeDef* {
+    if (node.op() == "FilterDataset") return &node;
+    return nullptr;
+  };
+
+  auto get_fused_predicate =
+      [&](const NodeDef* first_filter_node,
+          const NodeDef* second_filter_node) -> FunctionDef* {
+    const auto& parent_fun = first_filter_node->attr().at("predicate");
+    const FunctionDef* first_func =
+        function_library.Find(parent_fun.func().name());
+    const auto& fun = second_filter_node->attr().at("predicate");
+    const FunctionDef* second_func = function_library.Find(fun.func().name());
+
+    if (!fusion_utils::HasSameSignature(first_func->signature(),
+                                        second_func->signature())) {
+      VLOG(1) << "Can't fuse Filters because they have different signature\n";
+      return nullptr;
+    }
+
+    return fusion_utils::FuseFunctions(
+        *first_func, *second_func, "fused_predicate",
+        fusion_utils::SameSignature, fusion_utils::SameInput,
+        fusion_utils::LazyConjunctionOutput, fusion_utils::LazyConjunctionNodes,
+        output->mutable_library());
+  };
+
+  for (const NodeDef& node : sorted_old_graph.node()) {
+    const NodeDef* second_filter_node = get_filter_node(node);
+    if (!second_filter_node) continue;
+
+    const NodeDef* first_filter_node =
+        get_filter_node(*graph_utils::GetInputNode(*second_filter_node, graph));
+    if (!first_filter_node) continue;
+
+    const auto* fused_predicate =
+        get_fused_predicate(first_filter_node, second_filter_node);
+    if (!fused_predicate) continue;
+    const auto* fused_filter_node = graph.AddNode(MakeFusedFilterNode(
+        *first_filter_node, *second_filter_node, *fused_predicate, &graph));
+
+    graph.ReplaceInput(*second_filter_node, *fused_filter_node);
+
+    // TODO(prazek): we should run some optimizations on the fused filter
+    // functions, or make sure that optimization passes run after filter
+    // fusion.
+    TF_RETURN_IF_ERROR(function_library.AddFunctionDef(*fused_predicate));
+    // TODO(prazek): we could also remove map functions from library if they
+    // are not used anymore.
+    nodes_to_delete.insert(first_filter_node->name());
+    nodes_to_delete.insert(second_filter_node->name());
+  }
+
+  graph.DeleteNodes(nodes_to_delete);
+  return Status::OK();
+}
+
+void FilterFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
+                            const GraphDef& optimize_output, double result) {
+  // no-op
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(FilterFusion, "filter_fusion");
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.h b/tensorflow/core/grappler/optimizers/data/filter_fusion.h
new file mode 100644
index 0000000000..91a0364a46
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimization fuses filter transformations.
+class FilterFusion : public CustomGraphOptimizer {
+ public:
+  FilterFusion() = default;
+  ~FilterFusion() override = default;
+
+  string name() const override { return "filter_fusion"; };
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc
new file mode 100644
index 0000000000..5a289e60d0
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion_test.cc
@@ -0,0 +1,91 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/filter_fusion.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+NodeDef MakeFilterNode(StringPiece name, StringPiece input_node_name) {
+  return test::function::NDef(
+      name, "FilterDataset", {input_node_name.ToString()},
+      {{"predicate", FunctionDefHelper::FunctionRef("IsZero")},
+       {"Targuments", {}},
+       {"output_shapes", {}},
+       {"output_types", {}}});
+}
+
+TEST(FilterFusionTest, FuseTwoFilterIntoOne) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       MakeFilterNode("filter1", "range"),
+       MakeFilterNode("filter2", "filter1")},
+      // FunctionLib
+      {
+          test::function::IsZero(),
+      });
+
+  FilterFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("FilterDataset", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter1", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter2", output));
+}
+
+TEST(FilterFusionTest, FuseThreeNodesIntoOne) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("filename", "Const", {}, {{"value", ""}, {"dtype", DT_STRING}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       MakeFilterNode("filter1", "range"), MakeFilterNode("filter2", "filter1"),
+       MakeFilterNode("filter3", "filter2"),
+       NDef("cache", "CacheDataset", {"filter3", "filename"}, {})},
+      // FunctionLib
+      {
+          test::function::IsZero(),
+      });
+
+  FilterFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("FilterDataset", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter1", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter2", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter3", output));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
index f84f109af6..01a78c04b0 100644
--- a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/fusion_utils.h"
 
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_def.pb.h"
-
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -52,6 +52,12 @@ string GetOutputNode(const FunctionDef& function, int output_idx) {
   return function.ret().at(ret_output_name);
 }
 
+string& GetMutableOutputNode(FunctionDef* function, int output_idx) {
+  const auto& ret_output_name =
+      function->signature().output_arg(output_idx).name();
+  return function->mutable_ret()->at(ret_output_name);
+}
+
 template <typename Iterable>
 StringCollection GetNames(const Iterable& iterable, int allocate_size) {
   StringCollection names;
@@ -106,7 +112,6 @@ gtl::FlatMap<string, string> GetUniqueNames(const Iterable& first_iterable,
 // Nodes that will be added to the function can have the same name as the nodes
 // from parent function.
 void RenameFunctionNodes(const FunctionDef& first_function,
-                         FunctionDef* fused_function,
                          protobuf::RepeatedPtrField<NodeDef>* nodes_to_fuse,
                          protobuf::Map<string, string>* rets_to_fuse) {
   const gtl::FlatMap<string, string> changed_node_names =
@@ -149,6 +154,7 @@ OpDef GetUniqueSignature(const OpDef& first_signature,
   const gtl::FlatMap<string, string> changed_input_names =
       GetUniqueNames(first_signature.input_arg(), second_signature.input_arg());
   OpDef signature;
+  signature.set_name(second_signature.name());
 
   for (const auto& input_arg : second_signature.input_arg()) {
     auto& input = *signature.add_input_arg();
@@ -221,12 +227,13 @@ void FuseFunctionNodes(const StringCollection& first_inputs,
 }
 
 // This function looks for direct edges from input to return and rewrites
-// them to the coresponding input of the return of `first_function`.
+// them to the corresponding input of the return of `first_function`.
 void FuseReturns(const StringCollection& first_inputs,
                  const StringCollection& second_inputs,
                  const StringCollection& first_outputs,
-                 const SetInputFn& set_input, FunctionDef* fused_function) {
-  for (auto& ret : *fused_function->mutable_ret()) {
+                 const SetInputFn& set_input,
+                 protobuf::Map<string, string>* fused_ret) {
+  for (auto& ret : *fused_ret) {
     auto return_input = ParseNodeConnection(ret.second);
     auto input_it =
         std::find(second_inputs.begin(), second_inputs.end(), return_input);
@@ -249,6 +256,33 @@ StringCollection GetFunctionOutputs(const FunctionDef& function) {
   return outputs;
 }
 
+FunctionDef* CreateFalsePredicate(
+    const protobuf::RepeatedPtrField<OpDef_ArgDef>& fake_args,
+    FunctionDefLibrary* library) {
+  GraphDef graph;
+  MutableGraphView graph_view(&graph);
+  auto* node = graph_utils::AddScalarConstNode(false, &graph_view);
+  auto* false_predicate = library->add_function();
+  graph_utils::SetUniqueGraphFunctionName("false_predicate", library,
+                                          false_predicate);
+
+  int num = 0;
+  for (const auto& fake_arg : fake_args) {
+    auto* arg = false_predicate->mutable_signature()->add_input_arg();
+    arg->set_type(fake_arg.type());
+    arg->set_name(strings::StrCat("fake_arg", num));
+    num++;
+  }
+
+  auto* output = false_predicate->mutable_signature()->add_output_arg();
+  output->set_name("false_out");
+  output->set_type(DT_BOOL);
+
+  (*false_predicate->mutable_ret())["false_out"] = node->name() + ":output:0";
+  *false_predicate->mutable_node_def() = std::move(*graph.mutable_node());
+  return false_predicate;
+}
+
 void CheckIfCanCompose(const OpDef& first_signature,
                        const OpDef& second_signature) {
   CHECK(CanCompose(first_signature, second_signature))
@@ -259,6 +293,15 @@ void CheckIfCanCompose(const OpDef& first_signature,
 
 }  // namespace
 
+void MergeNodes(const FunctionDef& first_function,
+                const FunctionDef& second_function, FunctionDef* fused_function,
+                FunctionDefLibrary* library) {
+  // Copy all nodes from first_function.
+  fused_function->mutable_node_def()->CopyFrom(first_function.node_def());
+  // Copy transformed nodes from the second function.
+  fused_function->mutable_node_def()->MergeFrom(second_function.node_def());
+}
+
 bool CanCompose(const OpDef& first_signature, const OpDef& second_signature) {
   // TODO(prazek): Functions can have additional inputs being placeholders
   // for a values used in function.  We should be able to also fuse these
@@ -285,8 +328,8 @@ void ComposeSignature(const OpDef& first_signature,
 
 void ComposeOutput(const protobuf::Map<string, string>& first_ret,
                    const protobuf::Map<string, string>& second_ret,
-                   FunctionDef* fused_function) {
-  *fused_function->mutable_ret() = second_ret;
+                   protobuf::Map<string, string>* fused_ret) {
+  *fused_ret = second_ret;
 }
 
 void CombineSignature(const OpDef& first_signature,
@@ -302,41 +345,110 @@ void CombineSignature(const OpDef& first_signature,
 
 void CombineOutput(const protobuf::Map<string, string>& first_ret,
                    const protobuf::Map<string, string>& second_ret,
-                   FunctionDef* fused_function) {
-  *fused_function->mutable_ret() = first_ret;
-  fused_function->mutable_ret()->insert(second_ret.begin(), second_ret.end());
+                   protobuf::Map<string, string>* fused_ret) {
+  *fused_ret = first_ret;
+  fused_ret->insert(second_ret.begin(), second_ret.end());
+}
+
+string SameInput(const StringCollection& first_inputs,
+                 const StringCollection& second_inputs,
+                 const StringCollection& first_outputs, int arg_num) {
+  return first_inputs.at(arg_num);
+}
+
+bool HasSameSignature(const OpDef& first_signature,
+                      const OpDef& second_signature) {
+  return first_signature.input_arg_size() ==
+             second_signature.input_arg_size() &&
+         first_signature.output_arg_size() ==
+             second_signature.output_arg_size();
+}
+
+void SameSignature(const OpDef& first_signature, const OpDef& second_signature,
+                   OpDef* fused_signature) {
+  CHECK(HasSameSignature(first_signature, second_signature))
+      << "Functions do not have the same signature";
+  // Copy signature from first function.
+  *fused_signature = first_signature;
+}
+
+void LazyConjunctionNodes(const FunctionDef& first_function,
+                          const FunctionDef& second_function,
+                          FunctionDef* fused_function,
+                          FunctionDefLibrary* library) {
+  fused_function->mutable_node_def()->CopyFrom(first_function.node_def());
+
+  NodeDefBuilder if_builder("", "If");
+  if_builder.Input(GetOutputNode(first_function, 0), 0, DT_BOOL);
+  DataTypeVector in_arg_types;
+  std::vector<NodeDefBuilder::NodeOut> inputs;
+  for (const auto& input_arg : first_function.signature().input_arg()) {
+    inputs.push_back({input_arg.name(), 0, input_arg.type()});
+    in_arg_types.push_back(input_arg.type());
+  }
+  if_builder.Attr("Tin", in_arg_types);
+
+  if_builder.Attr("Tcond", DT_BOOL);
+  if_builder.Attr("Tout", DataTypeVector{DT_BOOL});
+  if_builder.Attr("_lower_using_switch_merge", true);
+
+  NameAttrList then_branch;
+  then_branch.set_name(second_function.signature().name());
+  if_builder.Attr("then_branch", then_branch);
+
+  auto* false_predicate =
+      CreateFalsePredicate(first_function.signature().input_arg(), library);
+
+  NameAttrList else_branch;
+  else_branch.set_name(false_predicate->signature().name());
+  if_builder.Attr("else_branch", else_branch);
+  if_builder.Input(inputs);
+
+  auto* if_node = fused_function->add_node_def();
+  // This is guaranteed to succeed.
+  TF_CHECK_OK(if_builder.Finalize(if_node));
+  graph_utils::SetUniqueFunctionNodeName("cond", fused_function, if_node);
+
+  GetMutableOutputNode(fused_function, 0) = if_node->name() + ":output:0";
+}
+
+void LazyConjunctionOutput(const protobuf::Map<string, string>& first_ret,
+                           const protobuf::Map<string, string>& second_ret,
+                           protobuf::Map<string, string>* fused_ret) {
+  CHECK_EQ(first_ret.size(), 1);
+  CHECK_EQ(second_ret.size(), 1);
+  // Temporarily copy returns from first_ret.  We are going to change the
+  // output node after creating it.
+  *fused_ret = first_ret;
 }
 
-FunctionDef* FuseFunctions(const FunctionDef& first_function,
-                           const FunctionDef& function,
-                           StringPiece fused_name_prefix,
-                           const SetFunctionSignatureFn& set_signature,
-                           const SetInputFn& set_input,
-                           const SetOutputFn& set_output,
-                           FunctionDefLibrary* library) {
-  if (first_function.attr_size() != 0 || function.attr_size() != 0)
+FunctionDef* FuseFunctions(
+    const FunctionDef& first_function, const FunctionDef& second_function,
+    StringPiece fused_name_prefix, const SetFunctionSignatureFn& set_signature,
+    const SetInputFn& set_input, const SetOutputFn& set_output,
+    const SetNodesFn& set_nodes, FunctionDefLibrary* library) {
+  if (first_function.attr_size() != 0 || second_function.attr_size() != 0)
     return nullptr;  // Functions with attributes are currently not supported
 
   // This function will be used as a clone of second function, having unique
   // names.
-  FunctionDef setup_function = function;
+  FunctionDef setup_function = second_function;
   *setup_function.mutable_signature() = GetUniqueSignature(
       first_function.signature(), setup_function.signature(),
       setup_function.mutable_ret(), setup_function.mutable_node_def());
 
   FunctionDef* fused_function = library->add_function();
-  // Copy all nodes from first_function.
-  fused_function->mutable_node_def()->CopyFrom(first_function.node_def());
+
   set_signature(first_function.signature(), setup_function.signature(),
                 fused_function->mutable_signature());
 
   graph_utils::SetUniqueGraphFunctionName(fused_name_prefix, library,
                                           fused_function);
 
-  RenameFunctionNodes(first_function, fused_function,
-                      setup_function.mutable_node_def(),
+  RenameFunctionNodes(first_function, setup_function.mutable_node_def(),
                       setup_function.mutable_ret());
-  set_output(first_function.ret(), setup_function.ret(), fused_function);
+  set_output(first_function.ret(), setup_function.ret(),
+             fused_function->mutable_ret());
 
   CHECK(fused_function->signature().output_arg_size() ==
         fused_function->ret_size())
@@ -351,10 +463,10 @@ FunctionDef* FuseFunctions(const FunctionDef& first_function,
   FuseFunctionNodes(first_inputs, second_inputs, first_outputs, set_input,
                     setup_function.mutable_node_def());
   FuseReturns(first_inputs, second_inputs, first_outputs, set_input,
-              fused_function);
+              fused_function->mutable_ret());
+
+  set_nodes(first_function, setup_function, fused_function, library);
 
-  // Copy transformed nodes from the second function.
-  fused_function->mutable_node_def()->MergeFrom(setup_function.node_def());
   return fused_function;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils.h b/tensorflow/core/grappler/optimizers/data/fusion_utils.h
index 41f13f6cb8..19b7002dcd 100644
--- a/tensorflow/core/grappler/optimizers/data/fusion_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/fusion_utils.h
@@ -48,14 +48,20 @@ using SetInputFn =
                          const StringCollection& second_function_inputs,
                          const StringCollection& parent_outputs, int arg_num)>;
 
-// This function is invoked with first function ret. It is used to set up
-// returns of fused function.  If you need to combine outputs
-// of first and second function, then this is a right place to create a new
-// nodes.
+// This function is invoked with first and second function ret. It is used to
+// set up returns of fused function.
 using SetOutputFn =
     std::function<void(const protobuf::Map<string, string>& parent_ret,
                        const protobuf::Map<string, string>& second_function_ret,
-                       FunctionDef* fused_function)>;
+                       protobuf::Map<string, string>* fused_ret)>;
+
+using SetNodesFn = std::function<void(
+    const FunctionDef& first_function, const FunctionDef& second_function,
+    FunctionDef* fused_function, FunctionDefLibrary* library)>;
+
+void MergeNodes(const FunctionDef& first_function,
+                const FunctionDef& second_function, FunctionDef* fused_function,
+                FunctionDefLibrary* library);
 
 // Returns true if functions can be composed.
 bool CanCompose(const OpDef& first_signature, const OpDef& second_signature);
@@ -71,7 +77,7 @@ string ComposeInput(const StringCollection& first_inputs,
 // second_function(first_function(args...)).
 void ComposeOutput(const protobuf::Map<string, string>& first_ret,
                    const protobuf::Map<string, string>& second_ret,
-                   FunctionDef* fused_function);
+                   protobuf::Map<string, string>* fused_ret);
 
 // Set input signature to `first_function_signature` and output signature
 // to `first_function_signature` + `second_function_signature`
@@ -83,7 +89,32 @@ void CombineSignature(const OpDef& first_signature,
 // return *first_function(...), *second_function(...)
 void CombineOutput(const protobuf::Map<string, string>& first_ret,
                    const protobuf::Map<string, string>& second_ret,
-                   FunctionDef* fused_function);
+                   protobuf::Map<string, string>* fused_ret);
+
+// Returns true if both signatures have the same number of input and output
+// args.
+bool HasSameSignature(const OpDef& first_signature,
+                      const OpDef& second_signature);
+
+// Check if both signatures are same and copy it from `first_signature`.
+void SameSignature(const OpDef& first_signature, const OpDef& second_signature,
+                   OpDef* fused_signature);
+
+// Take the same input as first function.
+string SameInput(const StringCollection& first_inputs,
+                 const StringCollection& second_inputs,
+                 const StringCollection& first_outputs, int arg_num);
+
+// Create a fused function that computes the short-circuit logical AND of the
+// result of the first function and the result of the second function.
+void LazyConjunctionOutput(const protobuf::Map<string, string>& first_ret,
+                           const protobuf::Map<string, string>& second_ret,
+                           protobuf::Map<string, string>* fused_ret);
+
+void LazyConjunctionNodes(const FunctionDef& first_function,
+                          const FunctionDef& second_function,
+                          FunctionDef* fused_function,
+                          FunctionDefLibrary* library);
 
 // Fuse `first_function` with `second_function`, setting `fused_name_prefix` as
 // a name prefix.  The nodes from `first_function` are copied unmodified.  All
@@ -91,13 +122,11 @@ void CombineOutput(const protobuf::Map<string, string>& first_ret,
 // that are not conflicting with first function.  This means that copied nodes
 // from  second function can end up having different names.  For explanation of
 // set up functions see the documentation of the functions types.
-FunctionDef* FuseFunctions(const FunctionDef& first_function,
-                           const FunctionDef& second_function,
-                           StringPiece fused_name_prefix,
-                           const SetFunctionSignatureFn& set_signature,
-                           const SetInputFn& set_input,
-                           const SetOutputFn& set_output,
-                           FunctionDefLibrary* library);
+FunctionDef* FuseFunctions(
+    const FunctionDef& first_function, const FunctionDef& second_function,
+    StringPiece fused_name_prefix, const SetFunctionSignatureFn& set_signature,
+    const SetInputFn& set_input, const SetOutputFn& set_output,
+    const SetNodesFn& set_nodes, FunctionDefLibrary* library);
 
 }  // namespace fusion_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc b/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc
index 7ad5d63bf6..d5c6466080 100644
--- a/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/fusion_utils_test.cc
@@ -57,10 +57,10 @@ TEST(FusionUtilsTest, FuseFunctionsByComposition) {
   auto *function = graph.mutable_library()->add_function();
   *function = test::function::XTimesTwo();
 
-  auto *fused_function =
-      FuseFunctions(*parent_function, *function, "fused_maps",
-                    fusion_utils::ComposeSignature, fusion_utils::ComposeInput,
-                    fusion_utils::ComposeOutput, graph.mutable_library());
+  auto *fused_function = FuseFunctions(
+      *parent_function, *function, "fused_maps", fusion_utils::ComposeSignature,
+      fusion_utils::ComposeInput, fusion_utils::ComposeOutput,
+      fusion_utils::MergeNodes, graph.mutable_library());
 
   EXPECT_EQ(fused_function->signature().name(), "fused_maps");
   EXPECT_EQ(fused_function->signature().input_arg_size(), 1);
@@ -98,7 +98,8 @@ TEST(FusionUtilsTest, FuseFunctionWithPredicate) {
   auto *fused_function =
       FuseFunctions(*xtimes_two, *is_zero, "fused_map_and_filter_function",
                     fusion_utils::CombineSignature, fusion_utils::ComposeInput,
-                    fusion_utils::CombineOutput, graph.mutable_library());
+                    fusion_utils::CombineOutput, fusion_utils::MergeNodes,
+                    graph.mutable_library());
 
   EXPECT_EQ(fused_function->signature().name(),
             "fused_map_and_filter_function");
@@ -134,10 +135,10 @@ TEST(FusionUtilsTest, FuseSameFunctionWithExtraOutput) {
   auto *function = graph.mutable_library()->add_function();
   *function = test::function::XTimesTwo();
 
-  auto *fused_function =
-      FuseFunctions(*parent_function, *function, "fused_maps",
-                    fusion_utils::CombineSignature, fusion_utils::ComposeInput,
-                    fusion_utils::CombineOutput, graph.mutable_library());
+  auto *fused_function = FuseFunctions(
+      *parent_function, *function, "fused_maps", fusion_utils::CombineSignature,
+      fusion_utils::ComposeInput, fusion_utils::CombineOutput,
+      fusion_utils::MergeNodes, graph.mutable_library());
 
   EXPECT_EQ(fused_function->signature().input_arg_size(), 1);
   EXPECT_EQ(fused_function->signature().output_arg_size(), 2);
@@ -169,7 +170,8 @@ TEST(FusionUtilsTest, ZipFusion) {
 
   auto *fused_function =
       FuseFunctions(*function, *function, "zip_maps", zip_signature, zip_input,
-                    fusion_utils::CombineOutput, graph.mutable_library());
+                    fusion_utils::CombineOutput, fusion_utils::MergeNodes,
+                    graph.mutable_library());
 
   EXPECT_EQ(fused_function->signature().input_arg_size(), 2);
   EXPECT_EQ(fused_function->signature().output_arg_size(), 2);
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index 0eceaf4017..883037173b 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -108,6 +108,26 @@ NodeDef* AddNode(StringPiece name, StringPiece op,
   return graph->AddNode(std::move(node));
 }
 
+NodeDef* AddNode(StringPiece name, StringPiece op,
+                 const std::vector<string>& inputs,
+                 const std::vector<std::pair<string, AttrValue>>& attributes,
+                 FunctionDef* fd) {
+  NodeDef* node = fd->add_node_def();
+  if (!name.empty()) {
+    node->set_name(name.ToString());
+  } else {
+    SetUniqueFunctionNodeName(op, fd, node);
+  }
+  node->set_op(op.ToString());
+  for (const string& input : inputs) {
+    node->add_input(input);
+  }
+  for (auto attr : attributes) {
+    (*node->mutable_attr())[attr.first] = attr.second;
+  }
+  return node;
+}
+
 template <>
 NodeDef* AddScalarConstNode(bool v, MutableGraphView* graph) {
   return AddScalarConstNodeHelper(
@@ -181,7 +201,7 @@ bool ContainsGraphNodeWithName(StringPiece name, const GraphDef& graph) {
 }
 
 bool ContainsNodeWithOp(StringPiece op, const GraphDef& graph) {
-  return FindNodeWithOp(op, graph) != -1;
+  return FindGraphNodeWithOp(op, graph) != -1;
 }
 
 bool ContainsGraphFunctionWithName(StringPiece name,
@@ -205,7 +225,7 @@ int FindGraphNodeWithName(StringPiece name, const GraphDef& graph) {
   return indices.empty() ? -1 : indices.front();
 }
 
-int FindNodeWithOp(StringPiece op, const GraphDef& graph) {
+int FindGraphNodeWithOp(StringPiece op, const GraphDef& graph) {
   std::vector<int> indices = GetElementIndicesWithPredicate(
       [&op](const NodeDef& node) { return node.op() == op; }, graph.node());
   return indices.empty() ? -1 : indices.front();
@@ -242,6 +262,12 @@ int FindFunctionNodeWithOp(StringPiece op, const FunctionDef& function) {
   return indices.empty() ? -1 : indices.front();
 }
 
+NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph) {
+  if (node.input_size() == 0) return nullptr;
+  GraphView::InputPort input_port = graph.GetInputPort(node.name(), 0);
+  return graph.GetRegularFanin(input_port).node;
+}
+
 void SetUniqueGraphNodeName(StringPiece prefix, GraphDef* graph,
                             NodeDef* node) {
   string name = prefix.ToString();
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index 28a1aff877..6f431c232d 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -37,6 +37,12 @@ NodeDef* AddNode(StringPiece name, StringPiece op,
                  const std::vector<std::pair<string, AttrValue>>& attributes,
                  MutableGraphView* graph);
 
+// Adds a node to a FunctionDef.
+NodeDef* AddNode(StringPiece name, StringPiece op,
+                 const std::vector<string>& inputs,
+                 const std::vector<std::pair<string, AttrValue>>& attributes,
+                 FunctionDef* fd);
+
 // Adds a Const node with the given value to the graph.
 template <typename T>
 NodeDef* AddScalarConstNode(T v, MutableGraphView* graph) {
@@ -99,7 +105,10 @@ int FindFunctionNodeWithOp(StringPiece op, const FunctionDef& function);
 
 // Returns the index of the first node with the given op or -1 if no such  node
 // exists.
-int FindNodeWithOp(StringPiece op, const GraphDef& graph);
+int FindGraphNodeWithOp(StringPiece op, const GraphDef& graph);
+
+// Gets the 0th input to a node in the graph.
+NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph);
 
 // Returns the list of indices of all nodes with the given op or empty list if
 // no such node exists.
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index 0a3af1a914..c19ac7b880 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -176,25 +176,25 @@ TEST(GraphUtilsTest, FindGraphFunctionWithName) {
       FindGraphFunctionWithName(new_function->signature().name(), library), -1);
 }
 
-TEST(GraphUtilsTest, FindNodeWithOp) {
+TEST(GraphUtilsTest, FindGraphNodeWithOp) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
-  EXPECT_EQ(FindNodeWithOp("OpA", *graph.GetGraph()), -1);
+  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.GetGraph()), -1);
 
   AddNode("A", "OpA", {}, {}, &graph);
   AddNode("B", "OpB", {"A"}, {}, &graph);
   AddNode("A2", "OpA", {"B"}, {}, &graph);
-  EXPECT_EQ(FindNodeWithOp("OpA", *graph.GetGraph()), 0);
+  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.GetGraph()), 0);
 
   graph.DeleteNodes({"B"});
-  EXPECT_EQ(FindNodeWithOp("OpB", *graph.GetGraph()), -1);
+  EXPECT_EQ(FindGraphNodeWithOp("OpB", *graph.GetGraph()), -1);
   EXPECT_EQ(FindGraphNodeWithName("A2", *graph.GetGraph()), 1);
 }
 
 TEST(GraphUtilsTest, FindAllGraphNodesWithOp) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
-  EXPECT_EQ(FindNodeWithOp("OpA", *graph.GetGraph()), -1);
+  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.GetGraph()), -1);
 
   AddNode("A", "OpA", {}, {}, &graph);
   AddNode("B", "OpB", {"A"}, {}, &graph);
@@ -251,6 +251,54 @@ TEST(GraphUtilsTest, SetUniqueGraphFunctionName) {
             other_function->signature().name());
 }
 
+TEST(GraphUtilsTest, AddNodeToFunctionDef) {
+  FunctionDef func;
+  const char* op_name = "xxx";
+  AddNode(op_name, op_name, {}, {}, &func);
+
+  const NodeDef& node1 = func.node_def(FindFunctionNodeWithName("xxx", func));
+  EXPECT_EQ(node1.op(), op_name);
+  EXPECT_EQ(node1.input_size(), 0);
+  EXPECT_EQ(node1.attr_size(), 0);
+
+  const std::vector<string> inputs({"input1", "input2"});
+  AddNode("", op_name, inputs, {}, &func);
+  const NodeDef& node2 =
+      func.node_def(FindFunctionNodeWithName("xxx/_2", func));
+  EXPECT_EQ(node2.op(), op_name);
+  EXPECT_EQ(node2.attr_size(), 0);
+  EXPECT_EQ(node2.input_size(), inputs.size());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    EXPECT_EQ(node2.input(i), inputs[i]);
+  }
+
+  AttrValue a1, a2;
+  a1.set_type(DT_INT32);
+  a2.set_type(DT_INT64);
+  const std::vector<std::pair<string, AttrValue>> attrs(
+      {{"attr1", a1}, {"attr2", a2}});
+  AddNode("", op_name, {}, attrs, &func);
+  const NodeDef& node3 =
+      func.node_def(FindFunctionNodeWithName("xxx/_3", func));
+  EXPECT_EQ(node3.op(), op_name);
+  EXPECT_EQ(node3.input_size(), 0);
+  EXPECT_EQ(node3.attr_size(), attrs.size());
+  for (size_t i = 0; i < attrs.size(); ++i) {
+    EXPECT_EQ(attrs[i].second.type(), node3.attr().at(attrs[i].first).type());
+  }
+}
+
+TEST(GraphUtilsTest, GetInputNode) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node1 = AddNode("", "A", {}, {}, &graph);
+  NodeDef* node2 = AddNode("", "A", {node1->name()}, {}, &graph);
+
+  EXPECT_EQ(GetInputNode(*node2, graph), node1);
+  EXPECT_EQ(GetInputNode(*node1, graph), nullptr);
+}
+
 }  // namespace
 }  // namespace graph_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index 3ce238a30a..e9ad6f1b8a 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -104,8 +104,8 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     // Use a more descriptive variable name now that we know the node type.
     const NodeDef& batch_node = node;
-    GraphView::InputPort input_port = graph.GetInputPort(batch_node.name(), 0);
-    NodeDef* node2 = graph.GetRegularFanin(input_port).node;
+    NodeDef* node2 = graph_utils::GetInputNode(batch_node, graph);
+
     if (node2->op() != "MapDataset" && node2->op() != "ParallelMapDataset") {
       continue;
     }
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index a46c504ac4..b676246b31 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -85,8 +85,8 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node =
-      output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
+  NodeDef map_and_batch_node = output.node(
+      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -170,8 +170,8 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node =
-      output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
+  NodeDef map_and_batch_node = output.node(
+      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -253,8 +253,8 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node =
-      output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
+  NodeDef map_and_batch_node = output.node(
+      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
index 5e76c9f819..f1844a141c 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
@@ -116,22 +116,25 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto& fun = filter_node->attr().at("predicate");
     const FunctionDef* filter_func = function_library.Find(fun.func().name());
     if (!fusion_utils::CanCompose(map_func->signature(),
-                                  filter_func->signature()))
+                                  filter_func->signature())) {
+      VLOG(1) << "Can't fuse map and filter because the output signature of "
+                 "the map function does not match the input signature of the "
+                 "filter function\n";
       return nullptr;
+    }
     return fusion_utils::FuseFunctions(
         *map_func, *filter_func, "fused_map_and_filter_function",
         fusion_utils::CombineSignature, fusion_utils::ComposeInput,
-        fusion_utils::CombineOutput, output->mutable_library());
+        fusion_utils::CombineOutput, fusion_utils::MergeNodes,
+        output->mutable_library());
   };
 
   for (const NodeDef& node : sorted_old_graph.node()) {
     const NodeDef* filter_node = get_filter_node(node);
     if (!filter_node) continue;
 
-    GraphView::InputPort input_port =
-        graph.GetInputPort(filter_node->name(), 0);
     const NodeDef* map_node =
-        get_map_node(*graph.GetRegularFanin(input_port).node);
+        get_map_node(*graph_utils::GetInputNode(*filter_node, graph));
     if (!map_node) continue;
 
     const auto* fused_function = make_fused_function(map_node, filter_node);
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
index 027e0c1590..3b6829ade3 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
@@ -101,18 +101,18 @@ TEST(MapAndFilterFusionTest, FuseMapAndFilterWithExtraChild) {
       graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output));
   ASSERT_TRUE(graph_utils::ContainsNodeWithOp("CacheDataset", output));
 
-  int map_id = graph_utils::FindNodeWithOp("MapDataset", output);
+  int map_id = graph_utils::FindGraphNodeWithOp("MapDataset", output);
   auto& map_node = output.node(map_id);
   ASSERT_EQ(map_node.input_size(), 1);
   EXPECT_EQ(map_node.input(0), "range");
 
   int filter_by_component_id =
-      graph_utils::FindNodeWithOp("FilterByLastComponentDataset", output);
+      graph_utils::FindGraphNodeWithOp("FilterByLastComponentDataset", output);
   auto& filter_by_component = output.node(filter_by_component_id);
   ASSERT_EQ(filter_by_component.input_size(), 1);
   EXPECT_EQ(filter_by_component.input(0), map_node.name());
 
-  int cache_id = graph_utils::FindNodeWithOp("CacheDataset", output);
+  int cache_id = graph_utils::FindGraphNodeWithOp("CacheDataset", output);
   auto& cache_node = output.node(cache_id);
   ASSERT_EQ(cache_node.input_size(), 2);
   EXPECT_EQ(cache_node.input(0), filter_by_component.name());
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
index feb370eb9d..a78ecb09f7 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -90,21 +90,25 @@ Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto& fun = map_node->attr().at("f");
     const FunctionDef* func = function_library.Find(fun.func().name());
 
-    if (!fusion_utils::CanCompose(parent_func->signature(), func->signature()))
+    if (!fusion_utils::CanCompose(parent_func->signature(),
+                                  func->signature())) {
+      VLOG(1) << "Can't fuse two maps because the output signature of the "
+                 "first map function does not match the input signature of the "
+                 "second function\n";
       return nullptr;
+    }
     return fusion_utils::FuseFunctions(
         *parent_func, *func, "fused_map", fusion_utils::ComposeSignature,
         fusion_utils::ComposeInput, fusion_utils::ComposeOutput,
-        output->mutable_library());
+        fusion_utils::MergeNodes, output->mutable_library());
   };
 
   for (const NodeDef& node : sorted_old_graph.node()) {
     const NodeDef* map_node = get_map_node(node);
     if (!map_node) continue;
 
-    GraphView::InputPort input_port = graph.GetInputPort(map_node->name(), 0);
     const NodeDef* parent_map_node =
-        get_map_node(*graph.GetRegularFanin(input_port).node);
+        get_map_node(*graph_utils::GetInputNode(*map_node, graph));
     if (!parent_map_node) continue;
 
     const auto* fused_function = get_fused_function(parent_map_node, map_node);
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
new file mode 100644
index 0000000000..92551a0459
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -0,0 +1,258 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/map_vectorization.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+void CopyAttribute(const string& attr_name, const NodeDef& from, NodeDef* to) {
+  (*to->mutable_attr())[attr_name] = from.attr().at(attr_name);
+}
+
+FunctionDef* AddVectorizedFunction(const NodeDef& map_node,
+                                   const FunctionDef& orig_func,
+                                   FunctionDefLibrary* library) {
+  // If we decide to use a different method of vectorization, we can just
+  // swap out this part.
+  FunctionDef* vectorized_func = library->add_function();
+  // Function inputs and outputs are the same as original, just
+  // with different shapes.
+  *vectorized_func->mutable_signature() = orig_func.signature();
+  graph_utils::SetUniqueGraphFunctionName("vectorized_function", library,
+                                          vectorized_func);
+
+  // Add MapDefun node
+  NodeDef* map_defun_node = vectorized_func->mutable_node_def()->Add();
+  map_defun_node->set_op("MapDefun");
+  graph_utils::SetUniqueFunctionNodeName(map_defun_node->op(), vectorized_func,
+                                         map_defun_node);
+
+  // Set attrs and inputs
+  for (const string& k : {"f", "output_types", "output_shapes"}) {
+    // Function, output types and (unbatched) shapes are the same as the
+    // original map node.
+    CopyAttribute(k, map_node, map_defun_node);
+  }
+
+  // Get types of input arguments from original map function
+  AttrValue t_args;
+  for (const auto& input : vectorized_func->signature().input_arg()) {
+    t_args.mutable_list()->add_type(input.type());
+    map_defun_node->add_input(input.name());
+  }
+  (*map_defun_node->mutable_attr())["Targuments"] = t_args;
+
+  // Set return values to match output names
+  string output_prefix = strings::StrCat(map_defun_node->name(), ":output:");
+  for (size_t i = 0; i < vectorized_func->signature().output_arg_size(); ++i) {
+    const auto& output_arg = vectorized_func->signature().output_arg(i);
+    (*vectorized_func->mutable_ret())[output_arg.name()] =
+        strings::StrCat(output_prefix, i);
+  }
+
+  return vectorized_func;
+}
+
+bool IsOutputShapesFullyDefined(const NodeDef& node) {
+  auto* shapes_attr = gtl::FindOrNull(node.attr(), "output_shapes");
+  if (shapes_attr == nullptr) return false;
+  const auto& shapes = shapes_attr->list().shape();
+
+  for (const TensorShapeProto& shape : shapes) {
+    for (const auto& dim : shape.dim()) {
+      if (dim.size() == -1) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool IsStatefulFn(const FunctionLibraryDefinition& library,
+                  const FunctionDef& function_def) {
+  for (const NodeDef& node_def : function_def.node_def()) {
+    const OpDef* op_def;
+    Status s = library.LookUpOpDef(node_def.op(), &op_def);
+    if (!s.ok() || op_def->is_stateful()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool HasCapturedInputs(const NodeDef& map_node) {
+  return map_node.attr().at("Targuments").list().type_size() > 0;
+}
+
+NodeDef make_new_batch_node(const NodeDef& old_batch_node,
+                            const NodeDef& input_node,
+                            const FunctionDef& vectorized_func,
+                            MutableGraphView* graph) {
+  NodeDef batch_node;
+  batch_node.set_op(old_batch_node.op());
+  graph_utils::SetUniqueGraphNodeName(batch_node.op(), graph->GetGraph(),
+                                      &batch_node);
+
+  // Set the `input_dataset` input argument
+  batch_node.add_input(input_node.name());
+  // Set the `batch_size` input_argument
+  batch_node.add_input(old_batch_node.input(1));
+  if (batch_node.op() == "BatchDatasetV2") {
+    // Set the `drop_remainder` input argument
+    batch_node.add_input(old_batch_node.input(2));
+  }
+
+  // Set attrs
+  AttrValue output_types;
+  for (const auto& input : vectorized_func.signature().input_arg()) {
+    output_types.mutable_list()->add_type(input.type());
+  }
+  (*batch_node.mutable_attr())["output_types"] = output_types;
+
+  auto& output_shapes_attr = (*batch_node.mutable_attr())["output_shapes"];
+  const auto& input_shapes =
+      input_node.attr().at("output_shapes").list().shape();
+  int64 batch_size =
+      old_batch_node.attr().at("output_shapes").list().shape()[0].dim(0).size();
+  for (size_t i = 0; i < input_shapes.size(); ++i) {
+    TensorShapeProto* shape = output_shapes_attr.mutable_list()->add_shape();
+    TensorShapeProto_Dim* dim = shape->add_dim();
+    dim->set_size(batch_size);
+    shape->MergeFrom(input_shapes.Get(i));
+  }
+  return batch_node;
+}
+
+NodeDef make_new_map_node(const NodeDef& old_map_node,
+                          const NodeDef& old_batch_node,
+                          const NodeDef& new_batch_node,
+                          const FunctionDef& vectorized_func,
+                          MutableGraphView* graph) {
+  NodeDef map_node;
+  map_node.set_op(old_map_node.op());
+  graph_utils::SetUniqueGraphNodeName(map_node.op(), graph->GetGraph(),
+                                      &map_node);
+
+  // Set the `input_dataset` input argument
+  map_node.add_input(new_batch_node.name());
+  for (int i = 1; i < old_map_node.input_size(); i++) {
+    // Set the `other_arguments` and `num_parallel_calls` input arguments
+    map_node.add_input(old_map_node.input(i));
+  }
+
+  // Set attrs
+  CopyAttribute("Targuments", old_map_node, &map_node);
+  auto& func_attr = (*map_node.mutable_attr())["f"];
+  func_attr.mutable_func()->set_name(vectorized_func.signature().name());
+
+  for (auto key : {"output_shapes", "output_types"}) {
+    CopyAttribute(key, old_batch_node, &map_node);
+  }
+  return map_node;
+}
+
+}  // namespace
+
+Status MapVectorization::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                  GraphDef* output) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+  std::set<string> nodes_to_delete;
+
+  for (const NodeDef& node : item.graph.node()) {
+    // Find Map->Batch nodes.
+    // TODO(rachelim): Optimize MapAndBatchDataset[V2] as well.
+    if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
+      continue;
+    }
+
+    const NodeDef& batch_node(node);
+    NodeDef* node2 = graph_utils::GetInputNode(batch_node, graph);
+    if (node2->op() != "MapDataset" && node2->op() != "ParallelMapDataset") {
+      continue;
+    }
+
+    // Use a more descriptive variable name now that we know the node type.
+    NodeDef* map_node = node2;
+    // Input to the map node
+    NodeDef* input_node = graph_utils::GetInputNode(*map_node, graph);
+    CHECK_NOTNULL(input_node);
+
+    FunctionDefLibrary* library = output->mutable_library();
+
+    FunctionLibraryDefinition function_library(OpRegistry::Global(), *library);
+    const FunctionDef* orig_func =
+        function_library.Find(map_node->attr().at("f").func().name());
+
+    // Check that this is a valid optimization.
+    if (!IsOutputShapesFullyDefined(*input_node) ||
+        !IsOutputShapesFullyDefined(*map_node) ||
+        IsStatefulFn(function_library, *orig_func) ||
+        HasCapturedInputs(*map_node)) {
+      // 1. If any of the inputs have an unknown shape, don't optimize, since
+      // inputs might not be batchable.
+      // 2. If any of the map func outputs have an unknown shape, don't
+      // optimize, so that batching errors surface as before.
+      // 3. If the function is stateful, don't vectorize it.
+      // 4. TODO(rachelim): Make this work for MapDataset with captured inputs
+      // by tiling inputs or modifying the signature of MapDefun.
+      continue;
+    }
+
+    FunctionDef* vectorized_func =
+        AddVectorizedFunction(*map_node, *orig_func, library);
+    CHECK_NOTNULL(vectorized_func);
+
+    auto* new_batch_node = graph.AddNode(
+        make_new_batch_node(batch_node, *input_node, *vectorized_func, &graph));
+
+    auto* new_map_node = graph.AddNode(make_new_map_node(
+        *map_node, batch_node, *new_batch_node, *vectorized_func, &graph));
+    graph.ReplaceInput(batch_node, *new_map_node);
+
+    // Mark the `Map` and `Batch` nodes for removal.
+    nodes_to_delete.insert(map_node->name());
+    nodes_to_delete.insert(batch_node.name());
+  }
+  graph.DeleteNodes(nodes_to_delete);
+  return Status::OK();
+}
+
+void MapVectorization::Feedback(Cluster* cluster, const GrapplerItem& item,
+                                const GraphDef& optimize_output,
+                                double result) {
+  // no-op
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(MapVectorization, "map_vectorization");
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.h b/tensorflow/core/grappler/optimizers/data/map_vectorization.h
new file mode 100644
index 0000000000..cc56a8ee5e
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class MapVectorization : public CustomGraphOptimizer {
+ public:
+  MapVectorization() = default;
+  ~MapVectorization() override = default;
+
+  string name() const override { return "map_vectorization"; };
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
new file mode 100644
index 0000000000..be2475bae8
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
@@ -0,0 +1,201 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/map_vectorization.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+using test::function::GDef;
+using test::function::NDef;
+
+void MakeTensorShapeProtoHelper(const gtl::ArraySlice<int> dims,
+                                TensorShapeProto* t) {
+  for (size_t i = 0; i < dims.size(); ++i) {
+    auto* d = t->add_dim();
+    d->set_size(dims[i]);
+  }
+}
+
+AttrValue MakeShapeListAttr(
+    const gtl::ArraySlice<const gtl::ArraySlice<int>>& shapes) {
+  AttrValue shapes_attr;
+  for (size_t i = 0; i < shapes.size(); ++i) {
+    MakeTensorShapeProtoHelper(shapes[i],
+                               shapes_attr.mutable_list()->add_shape());
+  }
+
+  return shapes_attr;
+}
+
+NodeDef MakeMapNodeHelper(
+    StringPiece name, StringPiece input_node_name, StringPiece function_name,
+    StringPiece map_op_name,
+    const gtl::ArraySlice<const gtl::ArraySlice<int>>& output_shapes,
+    const gtl::ArraySlice<DataType>& output_types) {
+  return test::function::NDef(
+      name, map_op_name, {input_node_name.ToString()},
+      {{"f", FunctionDefHelper::FunctionRef(function_name.ToString())},
+       {"Targuments", {}},
+       {"output_shapes", MakeShapeListAttr(output_shapes)},
+       {"output_types", output_types}});
+}
+
+NodeDef MakeMapNode(
+    StringPiece name, StringPiece input_node_name, StringPiece function_name,
+    const gtl::ArraySlice<const gtl::ArraySlice<int>>& output_shapes,
+    const gtl::ArraySlice<DataType>& output_types) {
+  return MakeMapNodeHelper(name, input_node_name, function_name, "MapDataset",
+                           output_shapes, output_types);
+}
+
+NodeDef MakeBatchNode(
+    StringPiece name, StringPiece input_node_name,
+    StringPiece input_batch_size_name,
+    const gtl::ArraySlice<const gtl::ArraySlice<int>>& output_shapes,
+    const gtl::ArraySlice<DataType>& output_types) {
+  return NDef(name, "BatchDataset",
+              {input_node_name.ToString(), input_batch_size_name.ToString()},
+              {{"output_types", output_types},
+               {"output_shapes", MakeShapeListAttr(output_shapes)}});
+}
+
+NodeDef MakeBatchV2Node(
+    StringPiece name, StringPiece input_node_name,
+    StringPiece input_batch_size_name, StringPiece input_drop_remainder_name,
+    const gtl::ArraySlice<const gtl::ArraySlice<int>>& output_shapes,
+    const gtl::ArraySlice<DataType>& output_types) {
+  return NDef(name, "BatchDatasetV2",
+              {input_node_name.ToString(), input_batch_size_name.ToString(),
+               input_drop_remainder_name.ToString()},
+              {{"output_types", output_types},
+               {"output_shapes", MakeShapeListAttr(output_shapes)}});
+}
+
+NodeDef MakeRangeNode(StringPiece name, const gtl::ArraySlice<string>& inputs) {
+  return NDef(name, "RangeDataset", inputs,
+              {{"output_shapes", MakeShapeListAttr({{}})},
+               {"output_types", gtl::ArraySlice<DataType>({DT_INT64})}});
+}
+
+TEST(MapVectorizationTest, VectorizeMapWithBatch) {
+  GrapplerItem item;
+  item.graph = GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       MakeRangeNode("range", {"start", "stop", "step"}),
+       MakeMapNode("map", "range", "XTimesTwo", {{}}, {DT_INT32}),
+       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
+            1);
+  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("BatchDataset", output).size(),
+            1);
+  const NodeDef& map_node =
+      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
+  const NodeDef& batch_node =
+      output.node(graph_utils::FindGraphNodeWithOp("BatchDataset", output));
+  EXPECT_EQ(map_node.input(0), batch_node.name());
+  EXPECT_EQ(batch_node.input(0), "range");
+}
+
+TEST(MapVectorizationTest, VectorizeMapWithBatchV2) {
+  GrapplerItem item;
+  item.graph = GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("drop_remainder", "Const", {},
+            {{"value", false}, {"dtype", DT_BOOL}}),
+       MakeRangeNode("range", {"start", "stop", "step"}),
+       MakeMapNode("map", "range", "XTimesTwo", {{}}, {DT_INT32}),
+       MakeBatchV2Node("batch", "map", "batch_size", "drop_remainder", {{-1}},
+                       {DT_INT32})},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
+            1);
+  EXPECT_EQ(
+      graph_utils::FindAllGraphNodesWithOp("BatchDatasetV2", output).size(), 1);
+  const NodeDef& map_node =
+      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
+  const NodeDef& batch_node =
+      output.node(graph_utils::FindGraphNodeWithOp("BatchDatasetV2", output));
+  EXPECT_EQ(map_node.input(0), batch_node.name());
+  EXPECT_EQ(batch_node.input(0), "range");
+}
+
+TEST(MapVectorizationTest, VectorizeWithUndefinedOutputShape) {
+  GrapplerItem item;
+  item.graph = GDef(
+      {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("input", "InputDataset", {},
+            {{"output_types", gtl::ArraySlice<DataType>({DT_INT32})}}),
+       MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
+       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+}
+
+TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
+  GrapplerItem item;
+  item.graph = GDef(
+      {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("input", "InputDataset", {},
+            {{"output_shapes", MakeShapeListAttr({{}})}}),
+       MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
+       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
index 55d57b3b97..a26f1000a3 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
@@ -69,8 +69,7 @@ Status NoOpElimination::Optimize(Cluster* cluster, const GrapplerItem& item,
   for (const NodeDef& node : item.graph.node()) {
     if (!IsNoOp(node, graph)) continue;
 
-    GraphView::InputPort input_port = graph.GetInputPort(node.name(), 0);
-    NodeDef* const parent = graph.GetRegularFanin(input_port).node;
+    NodeDef* const parent = graph_utils::GetInputNode(node, graph);
     graph.ReplaceInput(node, *parent);
 
     nodes_to_delete.insert(node.name());
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index 7c7161c5b2..cb0ff670e8 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -76,8 +76,8 @@ Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
 
     // Use a more descriptive variable name now that we know the node type.
     const NodeDef& repeat_node = node;
-    GraphView::InputPort input_port = graph.GetInputPort(repeat_node.name(), 0);
-    NodeDef* node2 = graph.GetRegularFanin(input_port).node;
+    NodeDef* node2 = graph_utils::GetInputNode(repeat_node, graph);
+
     if (node2->op() != "ShuffleDataset") {
       continue;
     }
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
index a2e470e511..f0696eb76d 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
@@ -78,7 +78,7 @@ TEST(ShuffleAndRepeatFusionTest, FuseShuffleAndRepeatNodesIntoOne) {
   EXPECT_TRUE(
       graph_utils::ContainsNodeWithOp("ShuffleAndRepeatDataset", output));
   NodeDef shuffle_and_repeat_node = output.node(
-      graph_utils::FindNodeWithOp("ShuffleAndRepeatDataset", output));
+      graph_utils::FindGraphNodeWithOp("ShuffleAndRepeatDataset", output));
   EXPECT_EQ(shuffle_and_repeat_node.input_size(), 5);
   EXPECT_EQ(shuffle_and_repeat_node.input(0), shuffle_node->input(0));
   EXPECT_EQ(shuffle_and_repeat_node.input(1), shuffle_node->input(1));
diff --git a/tensorflow/core/grappler/optimizers/evaluation_utils.cc b/tensorflow/core/grappler/optimizers/evaluation_utils.cc
index 00ad7494f4..79d9ea1608 100644
--- a/tensorflow/core/grappler/optimizers/evaluation_utils.cc
+++ b/tensorflow/core/grappler/optimizers/evaluation_utils.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
 
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/denormal.h"
diff --git a/tensorflow/core/grappler/optimizers/evaluation_utils.h b/tensorflow/core/grappler/optimizers/evaluation_utils.h
index 8414b5b8ca..c9dfb6dc0b 100644
--- a/tensorflow/core/grappler/optimizers/evaluation_utils.h
+++ b/tensorflow/core/grappler/optimizers/evaluation_utils.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace Eigen {
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 645e4c2087..56364f0095 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -453,6 +453,7 @@ Status InitializeFunctionSpecializationSignature(
 }
 
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
+                          const int graph_def_version,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
   VLOG(2) << "Specialize function instantiation: "
@@ -492,7 +493,8 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // Make a GrapplerFunctionItem and convert it back to FunctionDef after
   // pushing all constant inputs into the function body.
   GrapplerFunctionItem item;
-  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                              graph_def_version, &item));
 
   // Push const inputs into the function body, and keep track of their control
   // dependencies.
@@ -576,15 +578,15 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
 
 Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
                       const FunctionOptimizerContext& ctx,
-                      GraphDef* optimized_graph) {
+                      const int graph_def_version, GraphDef* optimized_graph) {
   VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
 
   const std::unordered_map<string, AttrValue> func_attr(
       func_node.attr().begin(), func_node.attr().end());
 
   GrapplerFunctionItem item;
-  Status item_status =
-      MakeGrapplerFunctionItem(func, func_attr, ctx.function_library(), &item);
+  Status item_status = MakeGrapplerFunctionItem(
+      func, func_attr, ctx.function_library(), graph_def_version, &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -645,7 +647,8 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
     if (func_body_node_func != nullptr) {
       // Recursively inline function calls.
       TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
-                                        ctx, optimized_graph));
+                                        ctx, graph_def_version,
+                                        optimized_graph));
     } else {
       // Annotate the node with the function attributes.
       for (const auto& attr : func.attr()) {
@@ -824,7 +827,8 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       if (inline_func && ctx.IsInlinedFunction(func_name)) {
         // Inline function body into the optimized graph}
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            InlineFunction(node, *func, ctx, optimized_graph));
+            InlineFunction(node, *func, ctx, item.graph.versions().producer(),
+                           optimized_graph));
         continue;
       }
 
@@ -837,7 +841,8 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
         // TODO(ezhulenev): Specialize function call if input has a known shape.
         // Specialize function body for its instantiation attributes and inputs.
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            SpecializeFunction(node, *func, &ctx, optimized_graph));
+            SpecializeFunction(node, *func, item.graph.versions().producer(),
+                               &ctx, optimized_graph));
         continue;
       }
     }
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 1be5f8dcc2..91794cefe5 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index e778b7879d..5fd34efeb1 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -361,7 +361,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
       // Make a GrapplerItem from a FunctionDef.
       GrapplerFunctionItem func_item;
-      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, flib, &func_item));
+      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+          func, flib, item.graph.versions().producer(), &func_item));
 
       // Optimize function body graph.
       GraphDef optimized_func_graph;
diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer_test.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer_test.cc
index 89847f83d4..b033cff8e6 100644
--- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/testlib.h"
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.cc b/tensorflow/core/grappler/optimizers/shape_optimizer.cc
index 26c54df56b..caa0b7b0cb 100644
--- a/tensorflow/core/grappler/optimizers/shape_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
 
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/graph_view.h"
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 462b752316..a2c363ea6e 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -307,8 +308,8 @@ GrapplerFunctionItem::GrapplerFunctionItem(
     const AttrValueMap& func_attr,
     const std::vector<InputArgExpansion>& input_arg_expansions,
     const std::vector<OutputArgExpansion>& output_arg_expansions,
-    const std::vector<string>& keep_nodes, bool is_stateful,
-    GraphDef&& function_body)
+    const std::vector<string>& keep_nodes, const int graph_def_version,
+    bool is_stateful, GraphDef&& function_body)
     : description_(description),
       func_attr_(func_attr),
       input_arg_expansions_(input_arg_expansions),
@@ -318,6 +319,7 @@ GrapplerFunctionItem::GrapplerFunctionItem(
   keep_ops = keep_nodes;
   // Swap the graph body.
   graph.Swap(&function_body);
+  graph.mutable_versions()->set_producer(graph_def_version);
   // Fill the feed nodes with input placeholders.
   for (const InputArgExpansion& input_arg : input_arg_expansions_) {
     for (const string& placeholder : input_arg.placeholders) {
@@ -472,6 +474,7 @@ Status InstantiationBodyParameters(
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const AttrValueMap& func_instantiation_attr,
                                 const FunctionLibraryDefinition& flib,
+                                const int graph_def_version,
                                 GrapplerFunctionItem* item) {
   const OpDef& signature = func.signature();
 
@@ -595,14 +598,17 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
   *item = GrapplerFunctionItem(
       /*func_name=*/signature.name(), /*description=*/signature.description(),
       /*func_attr=*/AttrValueMap(func.attr().begin(), func.attr().end()),
-      inputs, outputs, keep_nodes, is_stateful, std::move(function_body));
+      inputs, outputs, keep_nodes, graph_def_version, is_stateful,
+      std::move(function_body));
   return Status::OK();
 }
 
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const FunctionLibraryDefinition& flib,
+                                const int graph_def_version,
                                 GrapplerFunctionItem* item) {
-  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, item);
+  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, graph_def_version,
+                                  item);
 }
 
 // Register GrapplerFunctionItem input arg expansion and function body outputs
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 9f607dc2ee..61588ceb83 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -141,8 +141,8 @@ class GrapplerFunctionItem : public GrapplerItem {
       const AttrValueMap& func_attr,
       const std::vector<InputArgExpansion>& input_arg_expansions,
       const std::vector<OutputArgExpansion>& output_arg_expansions,
-      const std::vector<string>& keep_nodes, bool is_stateful,
-      GraphDef&& function_body);
+      const std::vector<string>& keep_nodes, const int versions,
+      bool is_stateful, GraphDef&& function_body);
 
   const string& description() const;
 
@@ -222,6 +222,7 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const AttrValueMap& func_instantiation_attr,
                                 const FunctionLibraryDefinition& flib,
+                                const int graph_def_version,
                                 GrapplerFunctionItem* item);
 
 // Make a GrapplerFunction item from the function definition. Function must be
@@ -231,6 +232,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 // without specializing it to it's instantiation attributes (at least types)?
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const FunctionLibraryDefinition& flib,
+                                const int graph_def_version,
                                 GrapplerFunctionItem* item);
 
 // Make a FunctionDef from the GrapplerFunctionItem. Use function library
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index b2d059e0ac..b51f2781b8 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -239,7 +240,8 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("XTimesTwo", item.id);
   EXPECT_EQ(4, item.function_body().node_size());
@@ -314,7 +316,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("SubGrad", item.id);
   EXPECT_EQ(12, item.function_body().node_size());
@@ -395,7 +398,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
   func_attr["T"].set_type(DT_FLOAT);
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -456,7 +460,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(1, item.output_size());
   EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
@@ -499,7 +504,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("ForwardInputs", item.id);
   EXPECT_EQ(5, item.function_body().node_size());
@@ -545,7 +551,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(0, item.input_size());
   EXPECT_EQ(1, item.output_size());
@@ -584,7 +591,8 @@ TEST_F(FunctionsTest, MakeFunctionDef) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   FunctionDef specialized;
   TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
@@ -622,7 +630,8 @@ TEST_F(FunctionsTest, ReplaceInputWithConst) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(2, item.input_size());
   EXPECT_EQ(1, item.output_size());
@@ -713,7 +722,8 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), lib_def);
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   // Replace function body with identity function
   item.SwapFunctionBody(std::move(id_func_body));
@@ -754,7 +764,8 @@ TEST_F(FunctionsTest, FunctionDefGrapplerFunctionItemRoundTrip) {
   GrapplerFunctionItem item;
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_INT32);
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
+                                        TF_GRAPH_DEF_VERSION, &item));
 
   FunctionDef func2;
   TF_EXPECT_OK(MakeFunctionDef(item, flib, &func2));
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 95e988e5fc..3690fd4362 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -52,6 +52,8 @@ load(
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
+    "if_mkl_ml",
+    "mkl_deps",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
@@ -493,16 +495,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "warn_about_ints",
-    srcs = ["warn_about_ints.cc"],
-    hdrs = ["warn_about_ints.h"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 # Private support libraries ---------------------------------------------------
 
 cc_header_only_library(
@@ -628,6 +620,7 @@ cc_library(
         ":gather_nd_op",
         ":gather_op",
         ":guarantee_const_op",
+        ":host_constant_op",
         ":identity_n_op",
         ":identity_op",
         ":inplace_ops",
@@ -702,6 +695,12 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
+    name = "host_constant_op",
+    prefix = "host_constant_op",
+    deps = ARRAY_DEPS,
+)
+
+tf_kernel_library(
     name = "diag_op",
     prefix = "diag_op",
     deps = ARRAY_DEPS,
@@ -902,10 +901,7 @@ if_mkl(
             "transpose_op.cc",
         ],
         hdrs = ["transpose_op.h"],
-        deps = ARRAY_DEPS + if_mkl([
-            "//third_party/mkl:intel_binary_blob",
-            "@mkl_dnn",
-        ]),
+        deps = ARRAY_DEPS + mkl_deps(),
     )],
     [tf_kernel_library(
         name = "transpose_op",
@@ -1302,6 +1298,7 @@ tf_cuda_cc_test(
     srcs = ["gather_nd_op_test.cc"],
     deps = [
         ":gather_nd_op",
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         "//tensorflow/core:core_cpu",
@@ -2868,7 +2865,7 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "batch_matmul_op",
-    srcs = [] + if_mkl([
+    srcs = if_mkl_ml([
         "mkl_batch_matmul_op.cc",
     ]),
     # <prefix>*impl.h are excluded by default from the CPU build, add explicitly.
@@ -2877,7 +2874,7 @@ tf_kernel_library(
     # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
     copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     prefix = "batch_matmul_op",
-    deps = MATH_DEPS + if_mkl([
+    deps = MATH_DEPS + if_mkl_ml([
         "//third_party/mkl:intel_binary_blob",
     ]),
 )
@@ -2960,10 +2957,7 @@ tf_kernel_library(
             "@libxsmm_archive//:xsmm_avx",
         ],
         "//conditions:default": [],
-    }) + if_mkl([
-        "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ]) + if_cuda([
+    }) + mkl_deps() + if_cuda([
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
     ]),
 )
@@ -3173,6 +3167,7 @@ tf_cuda_cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         ":reduction_ops",
@@ -3308,6 +3303,7 @@ tf_cuda_cc_test(
     srcs = ["diag_op_test.cc"],
     deps = [
         ":diag_op",
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         "//tensorflow/core:core_cpu",
@@ -3528,13 +3524,13 @@ tf_kernel_library(
 tf_kernel_library(
     name = "softplus_op",
     prefix = "softplus_op",
-    deps = NN_DEPS + [":warn_about_ints"],
+    deps = NN_DEPS,
 )
 
 tf_kernel_library(
     name = "softsign_op",
     prefix = "softsign_op",
-    deps = NN_DEPS + [":warn_about_ints"],
+    deps = NN_DEPS,
 )
 
 tf_kernel_library(
@@ -3635,6 +3631,7 @@ tf_cuda_cc_test(
     name = "nn_ops_test",
     srcs = ["nn_ops_test.cc"],
     deps = [
+        ":host_constant_op",
         ":nn",
         ":ops_testutil",
         ":ops_util",
@@ -3768,7 +3765,7 @@ tf_kernel_library(
         "spacetobatch_functor.h",
         "spacetobatch_functor_gpu.cu.cc",
     ],
-    visibility = ["//visibility:private"],
+    visibility = [":friends"],
     deps = [
         ":bounds_check",
         "//tensorflow/core:framework",
@@ -3782,6 +3779,7 @@ tf_cuda_cc_test(
     srcs = ["spacetobatch_benchmark_test.cc"],
     deps = [
         ":batch_space_ops",
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         "//tensorflow/core:core_cpu",
@@ -3921,6 +3919,7 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["random_op_test.cc"],
     deps = [
+        ":host_constant_op",
         ":random_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4175,6 +4174,7 @@ tf_cuda_cc_tests(
         "sparse_xent_op_test.cc",
     ],
     deps = [
+        ":host_constant_op",
         ":ops_testutil",
         ":ops_util",
         ":sparse",
@@ -4388,6 +4388,7 @@ cc_library(
         ":regex_full_match_op",
         ":regex_replace_op",
         ":string_join_op",
+        ":string_length_op",
         ":string_split_op",
         ":string_strip_op",
         ":string_to_hash_bucket_op",
@@ -4423,6 +4424,12 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
+    name = "string_length_op",
+    prefix = "string_length_op",
+    deps = STRING_DEPS,
+)
+
+tf_kernel_library(
     name = "regex_full_match_op",
     prefix = "regex_full_match_op",
     deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
@@ -4434,12 +4441,48 @@ tf_kernel_library(
     deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
 )
 
+tf_cc_test(
+    name = "regex_replace_op_test",
+    size = "small",
+    srcs = ["regex_replace_op_test.cc"],
+    deps = [
+        ":regex_replace_op",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
+
 tf_kernel_library(
     name = "string_split_op",
     prefix = "string_split_op",
     deps = STRING_DEPS,
 )
 
+tf_cc_test(
+    name = "string_split_op_test",
+    size = "small",
+    srcs = ["string_split_op_test.cc"],
+    deps = [
+        ":string_split_op",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
+
 tf_kernel_library(
     name = "string_strip_op",
     prefix = "string_strip_op",
@@ -4513,6 +4556,7 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["multinomial_op_test.cc"],
     deps = [
+        ":host_constant_op",
         ":multinomial_op",
         ":ops_util",
         "//tensorflow/core:core_cpu",
@@ -4540,6 +4584,7 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["parameterized_truncated_normal_op_test.cc"],
     deps = [
+        ":host_constant_op",
         ":ops_util",
         ":parameterized_truncated_normal_op",
         "//tensorflow/core:core_cpu",
@@ -5049,7 +5094,6 @@ filegroup(
         "training_ops.h",
         "transpose_functor.h",
         "transpose_op.h",
-        "warn_about_ints.h",
         "where_op.h",
         "xent_op.h",
     ],
@@ -5226,7 +5270,6 @@ filegroup(
         "transpose_functor_cpu.cc",
         "transpose_op.cc",
         "unique_op.cc",
-        "warn_about_ints.cc",
         "where_op.cc",
         "xent_op.cc",
         ":android_extended_ops_headers",
@@ -6153,8 +6196,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6168,8 +6210,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6184,8 +6225,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6204,8 +6244,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6220,8 +6259,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6236,56 +6274,43 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    ] + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_fused_batch_norm_op",
     srcs = ["mkl_fused_batch_norm_op.cc"],
-    deps = NN_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = NN_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_aggregate_ops",
     prefix = "mkl_aggregate_ops",
-    deps = MATH_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = MATH_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_concat_op",
     prefix = "mkl_concat_op",
-    deps = ARRAY_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = ARRAY_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_reshape_op",
     prefix = "mkl_reshape_op",
-    deps = ARRAY_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = ARRAY_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_identity_op",
     prefix = "mkl_identity_op",
-    deps = ARRAY_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = ARRAY_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
     name = "mkl_lrn_op",
     prefix = "mkl_lrn_op",
-    deps = NN_DEPS + [
-        "//third_party/mkl:intel_binary_blob",
-    ] + if_mkl(["@mkl_dnn"]),
+    deps = NN_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6296,10 +6321,7 @@ tf_mkl_kernel_library(
         "cwise_ops_gradients.h",
     ],
     prefix = "mkl_cwise_ops_common",
-    deps = NN_DEPS + [
-        "cwise_op",
-        "//third_party/mkl:intel_binary_blob",
-    ],
+    deps = NN_DEPS + mkl_deps() + [":cwise_op"],
 )
 
 # NOTE(lespeholt): This rule is deprecated, please use:
diff --git a/tensorflow/core/kernels/adjust_contrast_op.h b/tensorflow/core/kernels/adjust_contrast_op.h
index 7689c04214..f4a53c2ef9 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.h
+++ b/tensorflow/core/kernels/adjust_contrast_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
-#define TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_ADJUST_CONTRAST_OP_H_
+#define TENSORFLOW_CORE_KERNELS_ADJUST_CONTRAST_OP_H_
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 
@@ -153,4 +153,4 @@ struct AdjustContrastv2 {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_ADJUST_CONTRAST_OP_H_
diff --git a/tensorflow/core/kernels/adjust_hue_op.h b/tensorflow/core/kernels/adjust_hue_op.h
index 03d52a9e77..983a4072bf 100644
--- a/tensorflow/core/kernels/adjust_hue_op.h
+++ b/tensorflow/core/kernels/adjust_hue_op.h
@@ -11,8 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H
-#define _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H
+#ifndef TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_
 
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
@@ -37,4 +37,4 @@ struct AdjustHueGPU {
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
-#endif  // _TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H
+#endif  // TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_
diff --git a/tensorflow/core/kernels/adjust_saturation_op.h b/tensorflow/core/kernels/adjust_saturation_op.h
index 05c45c07c3..fd28ba536f 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.h
+++ b/tensorflow/core/kernels/adjust_saturation_op.h
@@ -11,8 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef _TENSORFLOW_CORE_KERNELS_ADJUST_SATURATION_OP_H
-#define _TENSORFLOW_CORE_KERNELS_ADJUST_SATURATION_OP_H
+#ifndef TENSORFLOW_CORE_KERNELS_ADJUST_SATURATION_OP_H_
+#define TENSORFLOW_CORE_KERNELS_ADJUST_SATURATION_OP_H_
 
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
@@ -37,4 +37,4 @@ struct AdjustSaturationGPU {
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
-#endif  // _TENSORFLOW_CORE_KERNELS_ADJUST_SATURATION_OP_H
+#endif  // TENSORFLOW_CORE_KERNELS_ADJUST_SATURATION_OP_H_
diff --git a/tensorflow/core/kernels/aggregate_ops.h b/tensorflow/core/kernels/aggregate_ops.h
index 9ea49fc34b..e074d0c2d9 100644
--- a/tensorflow/core/kernels/aggregate_ops.h
+++ b/tensorflow/core/kernels/aggregate_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
-#define TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_H_
 
 // Functor definitions for Aggregate ops, must be compilable by nvcc.
 
@@ -223,4 +223,4 @@ struct Add9EigenImpl {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_AGGREGATE_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_H_
diff --git a/tensorflow/core/kernels/aggregate_ops_cpu.h b/tensorflow/core/kernels/aggregate_ops_cpu.h
index aa1cead928..3e87917b64 100644
--- a/tensorflow/core/kernels/aggregate_ops_cpu.h
+++ b/tensorflow/core/kernels/aggregate_ops_cpu.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_AGGREGATE_OPS_CPU_H_
-#define TENSORFLOW_KERNELS_AGGREGATE_OPS_CPU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_CPU_H_
+#define TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_CPU_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -250,4 +250,4 @@ struct Add9Functor<SYCLDevice, T> {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_AGGREGATE_OPS_CPU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_AGGREGATE_OPS_CPU_H_
diff --git a/tensorflow/core/kernels/argmax_op.h b/tensorflow/core/kernels/argmax_op.h
index b8bc41e089..224aa4654d 100644
--- a/tensorflow/core/kernels/argmax_op.h
+++ b/tensorflow/core/kernels/argmax_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_ARGMAX_OP_H_
-#define TENSORFLOW_KERNELS_ARGMAX_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_
+#define TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_
 // Generator definition for ArgMaxOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -65,4 +65,4 @@ struct ArgMin {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_ARGMAX_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_ARGMAX_OP_H_
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index a450b1d1ee..74f926bdc8 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_ASSIGN_OP_H_
-#define TENSORFLOW_KERNELS_ASSIGN_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_ASSIGN_OP_H_
+#define TENSORFLOW_CORE_KERNELS_ASSIGN_OP_H_
 
 #define EIGEN_USE_THREADS
 
@@ -143,4 +143,4 @@ class AssignOp : public OpKernel {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_ASSIGN_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_ASSIGN_OP_H_
diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h
index f5e81dbc09..1e49a66af9 100644
--- a/tensorflow/core/kernels/avgpooling_op.h
+++ b/tensorflow/core/kernels/avgpooling_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
-#define TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_AVGPOOLING_OP_H_
+#define TENSORFLOW_CORE_KERNELS_AVGPOOLING_OP_H_
 // Functor definition for AvgPoolingOp, must be compilable by nvcc.
 
 #include "tensorflow/core/framework/tensor_types.h"
@@ -76,4 +76,4 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_AVGPOOLING_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_AVGPOOLING_OP_H_
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 475bda848d..766713a338 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -15,6 +15,9 @@ limitations under the License.
 
 // See docs in ../ops/math_ops.cc.
 
+#ifndef TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_IMPL_H_
+
 #define EIGEN_USE_THREADS
 
 #include <vector>
@@ -613,3 +616,5 @@ class BatchMatMul : public OpKernel {
       BatchMatMul<SYCLDevice, TYPE>)
 #endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCH_MATMUL_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/batch_norm_op.h b/tensorflow/core/kernels/batch_norm_op.h
index 48e73c8757..76b156f8fd 100644
--- a/tensorflow/core/kernels/batch_norm_op.h
+++ b/tensorflow/core/kernels/batch_norm_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
-#define TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BATCH_NORM_OP_H_
+#define TENSORFLOW_CORE_KERNELS_BATCH_NORM_OP_H_
 // Functor definition for BatchNormOp, must be compilable by nvcc.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -153,4 +153,4 @@ struct BatchNormGrad {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_BATCH_NORM_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCH_NORM_OP_H_
diff --git a/tensorflow/core/kernels/betainc_op.h b/tensorflow/core/kernels/betainc_op.h
index c4aa9543ab..b941b27ad3 100644
--- a/tensorflow/core/kernels/betainc_op.h
+++ b/tensorflow/core/kernels/betainc_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_BETAINC_OP_H_
-#define TENSORFLOW_KERNELS_BETAINC_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BETAINC_OP_H_
+#define TENSORFLOW_CORE_KERNELS_BETAINC_OP_H_
 // Functor definition for BetaincOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -48,4 +48,4 @@ struct Betainc {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_BETAINC_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BETAINC_OP_H_
diff --git a/tensorflow/core/kernels/bias_op.h b/tensorflow/core/kernels/bias_op.h
index 065934c709..77f683455d 100644
--- a/tensorflow/core/kernels/bias_op.h
+++ b/tensorflow/core/kernels/bias_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_BIAS_OP_H_
-#define TENSORFLOW_KERNELS_BIAS_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BIAS_OP_H_
+#define TENSORFLOW_CORE_KERNELS_BIAS_OP_H_
 // Functor definition for BiasOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -52,4 +52,4 @@ struct Bias {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_BIAS_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BIAS_OP_H_
diff --git a/tensorflow/core/kernels/bincount_op.h b/tensorflow/core/kernels/bincount_op.h
index cd3d560cd1..54cfb79de7 100644
--- a/tensorflow/core/kernels/bincount_op.h
+++ b/tensorflow/core/kernels/bincount_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_BINCOUNT_OP_H_
-#define TENSORFLOW_BINCOUNT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BINCOUNT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_BINCOUNT_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -38,4 +38,4 @@ struct BincountFunctor {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_BINCOUNT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BINCOUNT_OP_H_
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/BUILD b/tensorflow/core/kernels/boosted_trees/quantiles/BUILD
new file mode 100644
index 0000000000..3163c63949
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/BUILD
@@ -0,0 +1,63 @@
+# Description:
+#   This directory contains common utilities used in boosted_trees.
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+# Quantiles
+
+cc_library(
+    name = "weighted_quantiles",
+    srcs = [],
+    hdrs = [
+        "weighted_quantiles_buffer.h",
+        "weighted_quantiles_stream.h",
+        "weighted_quantiles_summary.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+tf_cc_test(
+    name = "weighted_quantiles_buffer_test",
+    size = "small",
+    srcs = ["weighted_quantiles_buffer_test.cc"],
+    deps = [
+        ":weighted_quantiles",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "weighted_quantiles_summary_test",
+    size = "small",
+    srcs = ["weighted_quantiles_summary_test.cc"],
+    deps = [
+        ":weighted_quantiles",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "weighted_quantiles_stream_test",
+    size = "small",
+    srcs = ["weighted_quantiles_stream_test.cc"],
+    deps = [
+        ":weighted_quantiles",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer.h b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer.h
new file mode 100644
index 0000000000..07aa9831c4
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer.h
@@ -0,0 +1,132 @@
+// Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
+#define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace quantiles {
+
+// Buffering container ideally suited for scenarios where we need
+// to sort and dedupe/compact fixed chunks of a stream of weighted elements.
+template <typename ValueType, typename WeightType,
+          typename CompareFn = std::less<ValueType>>
+class WeightedQuantilesBuffer {
+ public:
+  struct BufferEntry {
+    BufferEntry(ValueType v, WeightType w)
+        : value(std::move(v)), weight(std::move(w)) {}
+    BufferEntry() : value(), weight(0) {}
+
+    bool operator<(const BufferEntry& other) const {
+      return kCompFn(value, other.value);
+    }
+    bool operator==(const BufferEntry& other) const {
+      return value == other.value && weight == other.weight;
+    }
+    friend std::ostream& operator<<(std::ostream& strm,
+                                    const BufferEntry& entry) {
+      return strm << "{" << entry.value << ", " << entry.weight << "}";
+    }
+    ValueType value;
+    WeightType weight;
+  };
+
+  explicit WeightedQuantilesBuffer(int64 block_size, int64 max_elements)
+      : max_size_(std::min(block_size << 1, max_elements)) {
+    QCHECK(max_size_ > 0) << "Invalid buffer specification: (" << block_size
+                          << ", " << max_elements << ")";
+    vec_.reserve(max_size_);
+  }
+
+  // Disallow copying as it's semantically non-sensical in the Squawd algorithm
+  // but enable move semantics.
+  WeightedQuantilesBuffer(const WeightedQuantilesBuffer& other) = delete;
+  WeightedQuantilesBuffer& operator=(const WeightedQuantilesBuffer&) = delete;
+  WeightedQuantilesBuffer(WeightedQuantilesBuffer&& other) = default;
+  WeightedQuantilesBuffer& operator=(WeightedQuantilesBuffer&& other) = default;
+
+  // Push entry to buffer and maintain a compact representation within
+  // pre-defined size limit.
+  void PushEntry(ValueType value, WeightType weight) {
+    // Callers are expected to act on a full compacted buffer after the
+    // PushEntry call returns.
+    QCHECK(!IsFull()) << "Buffer already full: " << max_size_;
+
+    // Ignore zero and negative weight entries.
+    if (weight <= 0) {
+      return;
+    }
+
+    // Push back the entry to the buffer.
+    vec_.push_back(BufferEntry(std::move(value), std::move(weight)));
+  }
+
+  // Returns a sorted vector view of the base buffer and clears the buffer.
+  // Callers should minimize how often this is called, ideally only right after
+  // the buffer becomes full.
+  std::vector<BufferEntry> GenerateEntryList() {
+    std::vector<BufferEntry> ret;
+    if (vec_.size() == 0) {
+      return ret;
+    }
+    ret.swap(vec_);
+    vec_.reserve(max_size_);
+    std::sort(ret.begin(), ret.end());
+    size_t num_entries = 0;
+    for (size_t i = 1; i < ret.size(); ++i) {
+      if (ret[i].value != ret[i - 1].value) {
+        BufferEntry tmp = ret[i];
+        ++num_entries;
+        ret[num_entries] = tmp;
+      } else {
+        ret[num_entries].weight += ret[i].weight;
+      }
+    }
+    ret.resize(num_entries + 1);
+    return ret;
+  }
+
+  int64 Size() const { return vec_.size(); }
+  bool IsFull() const { return vec_.size() >= max_size_; }
+  void Clear() { vec_.clear(); }
+
+ private:
+  using BufferVector = typename std::vector<BufferEntry>;
+
+  // Comparison function.
+  static constexpr decltype(CompareFn()) kCompFn = CompareFn();
+
+  // Base buffer.
+  size_t max_size_;
+  BufferVector vec_;
+};
+
+template <typename ValueType, typename WeightType, typename CompareFn>
+constexpr decltype(CompareFn())
+    WeightedQuantilesBuffer<ValueType, WeightType, CompareFn>::kCompFn;
+
+}  // namespace quantiles
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer_test.cc b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer_test.cc
new file mode 100644
index 0000000000..75f05d64f3
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer_test.cc
@@ -0,0 +1,99 @@
+// Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+using Buffer =
+    boosted_trees::quantiles::WeightedQuantilesBuffer<double, double>;
+using BufferEntry =
+    boosted_trees::quantiles::WeightedQuantilesBuffer<double,
+                                                      double>::BufferEntry;
+
+class WeightedQuantilesBufferTest : public ::testing::Test {};
+
+TEST_F(WeightedQuantilesBufferTest, Invalid) {
+  EXPECT_DEATH(
+      ({
+        boosted_trees::quantiles::WeightedQuantilesBuffer<double, double>
+            buffer(2, 0);
+      }),
+      "Invalid buffer specification");
+  EXPECT_DEATH(
+      ({
+        boosted_trees::quantiles::WeightedQuantilesBuffer<double, double>
+            buffer(0, 2);
+      }),
+      "Invalid buffer specification");
+}
+
+TEST_F(WeightedQuantilesBufferTest, PushEntryNotFull) {
+  Buffer buffer(20, 100);
+  buffer.PushEntry(5, 9);
+  buffer.PushEntry(2, 3);
+  buffer.PushEntry(-1, 7);
+  buffer.PushEntry(3, 0);  // This entry will be ignored.
+
+  EXPECT_FALSE(buffer.IsFull());
+  EXPECT_EQ(buffer.Size(), 3);
+}
+
+TEST_F(WeightedQuantilesBufferTest, PushEntryFull) {
+  // buffer capacity is 4.
+  Buffer buffer(2, 100);
+  buffer.PushEntry(5, 9);
+  buffer.PushEntry(2, 3);
+  buffer.PushEntry(-1, 7);
+  buffer.PushEntry(2, 1);
+
+  std::vector<BufferEntry> expected;
+  expected.emplace_back(-1, 7);
+  expected.emplace_back(2, 4);
+  expected.emplace_back(5, 9);
+
+  // At this point, we have pushed 4 entries and we expect the buffer to be
+  // full.
+  EXPECT_TRUE(buffer.IsFull());
+  EXPECT_EQ(buffer.GenerateEntryList(), expected);
+  EXPECT_FALSE(buffer.IsFull());
+}
+
+TEST_F(WeightedQuantilesBufferTest, PushEntryFullDeath) {
+  // buffer capacity is 4.
+  Buffer buffer(2, 100);
+  buffer.PushEntry(5, 9);
+  buffer.PushEntry(2, 3);
+  buffer.PushEntry(-1, 7);
+  buffer.PushEntry(2, 1);
+
+  std::vector<BufferEntry> expected;
+  expected.emplace_back(-1, 7);
+  expected.emplace_back(2, 4);
+  expected.emplace_back(5, 9);
+
+  // At this point, we have pushed 4 entries and we expect the buffer to be
+  // full.
+  EXPECT_TRUE(buffer.IsFull());
+  // Can't push any more entries before clearing.
+  EXPECT_DEATH(({ buffer.PushEntry(6, 6); }), "Buffer already full");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream.h b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream.h
new file mode 100644
index 0000000000..525e2a6a64
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream.h
@@ -0,0 +1,330 @@
+// Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
+#define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer.h"
+#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace quantiles {
+
+// Class to compute approximate quantiles with error bound guarantees for
+// weighted data sets.
+// This implementation is an adaptation of techniques from the following papers:
+// * (2001) Space-efficient online computation of quantile summaries.
+// * (2004) Power-conserving computation of order-statistics over
+//          sensor networks.
+// * (2007) A fast algorithm for approximate quantiles in high speed
+//          data streams.
+// * (2016) XGBoost: A Scalable Tree Boosting System.
+//
+// The key ideas at play are the following:
+// - Maintain an in-memory multi-level quantile summary in a way to guarantee
+//   a maximum approximation error of eps * W per bucket where W is the total
+//   weight across all points in the input dataset.
+// - Two base operations are defined: MERGE and COMPRESS. MERGE combines two
+//   summaries guaranteeing a epsNew = max(eps1, eps2). COMPRESS compresses
+//   a summary to b + 1 elements guaranteeing epsNew = epsOld + 1/b.
+// - b * sizeof(summary entry) must ideally be small enough to fit in an
+//   average CPU L2 cache.
+// - To distribute this algorithm with maintaining error bounds, we need
+//   the worker-computed summaries to have no more than eps / h error
+//   where h is the height of the distributed computation graph which
+//   is 2 for an MR with no combiner.
+//
+// We mainly want to max out IO bw by ensuring we're not compute-bound and
+// using a reasonable amount of RAM.
+//
+// Complexity:
+// Compute: O(n * log(1/eps * log(eps * n))).
+// Memory: O(1/eps * log^2(eps * n)) <- for one worker streaming through the
+//                                      entire dataset.
+// An epsilon value of zero would make the algorithm extremely inefficent and
+// therefore, is disallowed.
+template <typename ValueType, typename WeightType,
+          typename CompareFn = std::less<ValueType>>
+class WeightedQuantilesStream {
+ public:
+  using Buffer = WeightedQuantilesBuffer<ValueType, WeightType, CompareFn>;
+  using BufferEntry = typename Buffer::BufferEntry;
+  using Summary = WeightedQuantilesSummary<ValueType, WeightType, CompareFn>;
+  using SummaryEntry = typename Summary::SummaryEntry;
+
+  explicit WeightedQuantilesStream(double eps, int64 max_elements)
+      : eps_(eps), buffer_(1LL, 2LL), finalized_(false) {
+    // See the class documentation. An epsilon value of zero could cause
+    // perfoamance issues.
+    QCHECK(eps > 0) << "An epsilon value of zero is not allowed.";
+    std::tie(max_levels_, block_size_) = GetQuantileSpecs(eps, max_elements);
+    buffer_ = Buffer(block_size_, max_elements);
+    summary_levels_.reserve(max_levels_);
+  }
+
+  // Disallow copy and assign but enable move semantics for the stream.
+  WeightedQuantilesStream(const WeightedQuantilesStream& other) = delete;
+  WeightedQuantilesStream& operator=(const WeightedQuantilesStream&) = delete;
+  WeightedQuantilesStream(WeightedQuantilesStream&& other) = default;
+  WeightedQuantilesStream& operator=(WeightedQuantilesStream&& other) = default;
+
+  // Pushes one entry while maintaining approximation error invariants.
+  void PushEntry(const ValueType& value, const WeightType& weight) {
+    // Validate state.
+    QCHECK(!finalized_) << "Finalize() already called.";
+
+    // Push element to base buffer.
+    buffer_.PushEntry(value, weight);
+
+    // When compacted buffer is full we need to compress
+    // and push weighted quantile summary up the level chain.
+    if (buffer_.IsFull()) {
+      PushBuffer(buffer_);
+    }
+  }
+
+  // Pushes full buffer while maintaining approximation error invariants.
+  void PushBuffer(Buffer& buffer) {
+    // Validate state.
+    QCHECK(!finalized_) << "Finalize() already called.";
+
+    // Create local compressed summary and propagate.
+    local_summary_.BuildFromBufferEntries(buffer.GenerateEntryList());
+    local_summary_.Compress(block_size_, eps_);
+    PropagateLocalSummary();
+  }
+
+  // Pushes full summary while maintaining approximation error invariants.
+  void PushSummary(const std::vector<SummaryEntry>& summary) {
+    // Validate state.
+    QCHECK(!finalized_) << "Finalize() already called.";
+
+    // Create local compressed summary and propagate.
+    local_summary_.BuildFromSummaryEntries(summary);
+    local_summary_.Compress(block_size_, eps_);
+    PropagateLocalSummary();
+  }
+
+  // Flushes approximator and finalizes state.
+  void Finalize() {
+    // Validate state.
+    QCHECK(!finalized_) << "Finalize() may only be called once.";
+
+    // Flush any remaining buffer elements.
+    PushBuffer(buffer_);
+
+    // Create final merged summary.
+    local_summary_.Clear();
+    for (auto& summary : summary_levels_) {
+      local_summary_.Merge(summary);
+      summary.Clear();
+    }
+    summary_levels_.clear();
+    summary_levels_.shrink_to_fit();
+    finalized_ = true;
+  }
+
+  // Generates requested number of quantiles after finalizing stream.
+  // The returned quantiles can be queried using std::lower_bound to get
+  // the bucket for a given value.
+  std::vector<ValueType> GenerateQuantiles(int64 num_quantiles) const {
+    // Validate state.
+    QCHECK(finalized_)
+        << "Finalize() must be called before generating quantiles.";
+    return local_summary_.GenerateQuantiles(num_quantiles);
+  }
+
+  // Generates requested number of boundaries after finalizing stream.
+  // The returned boundaries can be queried using std::lower_bound to get
+  // the bucket for a given value.
+  // The boundaries, while still guaranteeing approximation bounds, don't
+  // necessarily represent the actual quantiles of the distribution.
+  // Boundaries are preferable over quantiles when the caller is less
+  // interested in the actual quantiles distribution and more interested in
+  // getting a representative sample of boundary values.
+  std::vector<ValueType> GenerateBoundaries(int64 num_boundaries) const {
+    // Validate state.
+    QCHECK(finalized_)
+        << "Finalize() must be called before generating boundaries.";
+    return local_summary_.GenerateBoundaries(num_boundaries);
+  }
+
+  // Calculates approximation error for the specified level.
+  // If the passed level is negative, the approximation error for the entire
+  // summary is returned. Note that after Finalize is called, only the overall
+  // error is available.
+  WeightType ApproximationError(int64 level = -1) const {
+    if (finalized_) {
+      QCHECK(level <= 0) << "Only overall error is available after Finalize()";
+      return local_summary_.ApproximationError();
+    }
+
+    if (summary_levels_.empty()) {
+      // No error even if base buffer isn't empty.
+      return 0;
+    }
+
+    // If level is negative, we get the approximation error
+    // for the top-most level which is the max approximation error
+    // in all summaries by construction.
+    if (level < 0) {
+      level = summary_levels_.size() - 1;
+    }
+    QCHECK(level < summary_levels_.size()) << "Invalid level.";
+    return summary_levels_[level].ApproximationError();
+  }
+
+  size_t MaxDepth() const { return summary_levels_.size(); }
+
+  // Generates requested number of quantiles after finalizing stream.
+  const Summary& GetFinalSummary() const {
+    // Validate state.
+    QCHECK(finalized_)
+        << "Finalize() must be called before requesting final summary.";
+    return local_summary_;
+  }
+
+  // Helper method which, given the desired approximation error
+  // and an upper bound on the number of elements, computes the optimal
+  // number of levels and block size and returns them in the tuple.
+  static std::tuple<int64, int64> GetQuantileSpecs(double eps,
+                                                   int64 max_elements);
+
+  // Serializes the internal state of the stream.
+  std::vector<Summary> SerializeInternalSummaries() const {
+    // The buffer should be empty for serialize to work.
+    QCHECK_EQ(buffer_.Size(), 0);
+    std::vector<Summary> result;
+    result.reserve(summary_levels_.size() + 1);
+    for (const Summary& summary : summary_levels_) {
+      result.push_back(summary);
+    }
+    result.push_back(local_summary_);
+    return result;
+  }
+
+  // Resets the state of the stream with a serialized state.
+  void DeserializeInternalSummaries(const std::vector<Summary>& summaries) {
+    // Clear the state before deserializing.
+    buffer_.Clear();
+    summary_levels_.clear();
+    local_summary_.Clear();
+    QCHECK_GT(max_levels_, summaries.size() - 1);
+    for (int i = 0; i < summaries.size() - 1; ++i) {
+      summary_levels_.push_back(summaries[i]);
+    }
+    local_summary_ = summaries[summaries.size() - 1];
+  }
+
+ private:
+  // Propagates local summary through summary levels while maintaining
+  // approximation error invariants.
+  void PropagateLocalSummary() {
+    // Validate state.
+    QCHECK(!finalized_) << "Finalize() already called.";
+
+    // No-op if there's nothing to add.
+    if (local_summary_.Size() <= 0) {
+      return;
+    }
+
+    // Propagate summary through levels.
+    size_t level = 0;
+    for (bool settled = false; !settled; ++level) {
+      // Ensure we have enough depth.
+      if (summary_levels_.size() <= level) {
+        summary_levels_.emplace_back();
+      }
+
+      // Merge summaries.
+      Summary& current_summary = summary_levels_[level];
+      local_summary_.Merge(current_summary);
+
+      // Check if we need to compress and propagate summary higher.
+      if (current_summary.Size() == 0 ||
+          local_summary_.Size() <= block_size_ + 1) {
+        current_summary = std::move(local_summary_);
+        settled = true;
+      } else {
+        // Compress, empty current level and propagate.
+        local_summary_.Compress(block_size_, eps_);
+        current_summary.Clear();
+      }
+    }
+  }
+
+  // Desired approximation precision.
+  double eps_;
+  // Maximum number of levels.
+  int64 max_levels_;
+  // Max block size per level.
+  int64 block_size_;
+  // Base buffer.
+  Buffer buffer_;
+  // Local summary used to minimize memory allocation and cache misses.
+  // After the stream is finalized, this summary holds the final quantile
+  // estimates.
+  Summary local_summary_;
+  // Summary levels;
+  std::vector<Summary> summary_levels_;
+  // Flag indicating whether the stream is finalized.
+  bool finalized_;
+};
+
+template <typename ValueType, typename WeightType, typename CompareFn>
+inline std::tuple<int64, int64>
+WeightedQuantilesStream<ValueType, WeightType, CompareFn>::GetQuantileSpecs(
+    double eps, int64 max_elements) {
+  int64 max_level = 1LL;
+  int64 block_size = 2LL;
+  QCHECK(eps >= 0 && eps < 1);
+  QCHECK_GT(max_elements, 0);
+
+  if (eps <= std::numeric_limits<double>::epsilon()) {
+    // Exact quantile computation at the expense of RAM.
+    max_level = 1;
+    block_size = std::max(max_elements, int64{2});
+  } else {
+    // The bottom-most level will become full at most
+    // (max_elements / block_size) times, the level above will become full
+    // (max_elements / 2 * block_size) times and generally level l becomes
+    // full (max_elements / 2^l * block_size) times until the last
+    // level max_level becomes full at most once meaning when the inequality
+    // (2^max_level * block_size >= max_elements) is satisfied.
+    // In what follows, we jointly solve for max_level and block_size by
+    // gradually increasing the level until the inequality above is satisfied.
+    // We could alternatively set max_level = ceil(log2(eps * max_elements));
+    // and block_size = ceil(max_level / eps) + 1 but that tends to give more
+    // pessimistic bounds and wastes RAM needlessly.
+    for (max_level = 1, block_size = 2;
+         (1LL << max_level) * block_size < max_elements; ++max_level) {
+      // Update upper bound on block size at current level, we always
+      // increase the estimate by 2 to hold the min/max elements seen so far.
+      block_size = static_cast<size_t>(ceil(max_level / eps)) + 1;
+    }
+  }
+  return std::make_tuple(max_level, std::max(block_size, int64{2}));
+}
+
+}  // namespace quantiles
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream_test.cc b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream_test.cc
new file mode 100644
index 0000000000..6c5b9fd23b
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream_test.cc
@@ -0,0 +1,276 @@
+// Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_stream.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+using Tuple = std::tuple<int64, int64>;
+
+using Summary =
+    boosted_trees::quantiles::WeightedQuantilesSummary<double, double>;
+using SummaryEntry =
+    boosted_trees::quantiles::WeightedQuantilesSummary<double,
+                                                       double>::SummaryEntry;
+using Stream =
+    boosted_trees::quantiles::WeightedQuantilesStream<double, double>;
+
+TEST(GetQuantileSpecs, InvalidEps) {
+  EXPECT_DEATH({ Stream::GetQuantileSpecs(-0.01, 0L); }, "eps >= 0");
+  EXPECT_DEATH({ Stream::GetQuantileSpecs(1.01, 0L); }, "eps < 1");
+}
+
+TEST(GetQuantileSpecs, ZeroEps) {
+  EXPECT_DEATH({ Stream::GetQuantileSpecs(0.0, 0L); }, "max_elements > 0");
+  EXPECT_EQ(Stream::GetQuantileSpecs(0.0, 1LL), Tuple(1LL, 2LL));
+  EXPECT_EQ(Stream::GetQuantileSpecs(0.0, 20LL), Tuple(1LL, 20LL));
+}
+
+TEST(GetQuantileSpecs, NonZeroEps) {
+  EXPECT_DEATH({ Stream::GetQuantileSpecs(0.01, 0L); }, "max_elements > 0");
+  EXPECT_EQ(Stream::GetQuantileSpecs(0.1, 320LL), Tuple(4LL, 31LL));
+  EXPECT_EQ(Stream::GetQuantileSpecs(0.01, 25600LL), Tuple(6LL, 501LL));
+  EXPECT_EQ(Stream::GetQuantileSpecs(0.01, 104857600LL), Tuple(17LL, 1601LL));
+  EXPECT_EQ(Stream::GetQuantileSpecs(0.1, 104857600LL), Tuple(20LL, 191LL));
+  EXPECT_EQ(Stream::GetQuantileSpecs(0.01, 1LL << 40), Tuple(29LL, 2801LL));
+  EXPECT_EQ(Stream::GetQuantileSpecs(0.001, 1LL << 40), Tuple(26LL, 25001LL));
+}
+
+class WeightedQuantilesStreamTest : public ::testing::Test {};
+
+// Stream generators.
+void GenerateFixedUniformSummary(int32 worker_id, int64 max_elements,
+                                 double *total_weight, Stream *stream) {
+  for (int64 i = 0; i < max_elements; ++i) {
+    const double x = static_cast<double>(i) / max_elements;
+    stream->PushEntry(x, 1.0);
+    ++(*total_weight);
+  }
+  stream->Finalize();
+}
+
+void GenerateFixedNonUniformSummary(int32 worker_id, int64 max_elements,
+                                    double *total_weight, Stream *stream) {
+  for (int64 i = 0; i < max_elements; ++i) {
+    const double x = static_cast<double>(i) / max_elements;
+    stream->PushEntry(x, x);
+    (*total_weight) += x;
+  }
+  stream->Finalize();
+}
+
+void GenerateRandUniformFixedWeightsSummary(int32 worker_id, int64 max_elements,
+                                            double *total_weight,
+                                            Stream *stream) {
+  // Simulate uniform distribution stream.
+  random::PhiloxRandom philox(13 + worker_id);
+  random::SimplePhilox rand(&philox);
+  for (int64 i = 0; i < max_elements; ++i) {
+    const double x = rand.RandDouble();
+    stream->PushEntry(x, 1);
+    ++(*total_weight);
+  }
+  stream->Finalize();
+}
+
+void GenerateRandUniformRandWeightsSummary(int32 worker_id, int64 max_elements,
+                                           double *total_weight,
+                                           Stream *stream) {
+  // Simulate uniform distribution stream.
+  random::PhiloxRandom philox(13 + worker_id);
+  random::SimplePhilox rand(&philox);
+  for (int64 i = 0; i < max_elements; ++i) {
+    const double x = rand.RandDouble();
+    const double w = rand.RandDouble();
+    stream->PushEntry(x, w);
+    (*total_weight) += w;
+  }
+  stream->Finalize();
+}
+
+// Single worker tests.
+void TestSingleWorkerStreams(
+    double eps, int64 max_elements,
+    const std::function<void(int32, int64, double *, Stream *)>
+        &worker_summary_generator,
+    std::initializer_list<double> expected_quantiles,
+    double quantiles_matcher_epsilon) {
+  // Generate single stream.
+  double total_weight = 0;
+  Stream stream(eps, max_elements);
+  worker_summary_generator(0, max_elements, &total_weight, &stream);
+
+  // Ensure we didn't lose track of any elements and are
+  // within approximation error bound.
+  EXPECT_LE(stream.ApproximationError(), eps);
+  EXPECT_NEAR(stream.GetFinalSummary().TotalWeight(), total_weight, 1e-6);
+
+  // Verify expected quantiles.
+  int i = 0;
+  auto actuals = stream.GenerateQuantiles(expected_quantiles.size() - 1);
+  for (auto expected_quantile : expected_quantiles) {
+    EXPECT_NEAR(actuals[i], expected_quantile, quantiles_matcher_epsilon);
+    ++i;
+  }
+}
+
+// Stream generators.
+void GenerateOneValue(int32 worker_id, int64 max_elements, double *total_weight,
+                      Stream *stream) {
+  stream->PushEntry(10, 1);
+  ++(*total_weight);
+  stream->Finalize();
+}
+
+void GenerateOneZeroWeightedValue(int32 worker_id, int64 max_elements,
+                                  double *total_weight, Stream *stream) {
+  stream->PushEntry(10, 0);
+  stream->Finalize();
+}
+
+TEST(WeightedQuantilesStreamTest, OneValue) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(eps, max_elements, GenerateOneValue,
+                          {10.0, 10.0, 10.0, 10.0, 10.0}, 1e-2);
+}
+
+TEST(WeightedQuantilesStreamTest, OneZeroWeightValue) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(eps, max_elements, GenerateOneZeroWeightedValue, {},
+                          1e-2);
+}
+
+TEST(WeightedQuantilesStreamTest, FixedUniform) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(eps, max_elements, GenerateFixedUniformSummary,
+                          {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
+                          1e-2);
+}
+
+TEST(WeightedQuantilesStreamTest, FixedNonUniform) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(eps, max_elements, GenerateFixedNonUniformSummary,
+                          {0, std::sqrt(0.1), std::sqrt(0.2), std::sqrt(0.3),
+                           std::sqrt(0.4), std::sqrt(0.5), std::sqrt(0.6),
+                           std::sqrt(0.7), std::sqrt(0.8), std::sqrt(0.9), 1.0},
+                          1e-2);
+}
+
+TEST(WeightedQuantilesStreamTest, RandUniformFixedWeights) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(
+      eps, max_elements, GenerateRandUniformFixedWeightsSummary,
+      {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2);
+}
+
+TEST(WeightedQuantilesStreamTest, RandUniformRandWeights) {
+  const double eps = 0.01;
+  const int64 max_elements = 1 << 16;
+  TestSingleWorkerStreams(
+      eps, max_elements, GenerateRandUniformRandWeightsSummary,
+      {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2);
+}
+
+// Distributed tests.
+void TestDistributedStreams(
+    int32 num_workers, double eps, int64 max_elements,
+    const std::function<void(int32, int64, double *, Stream *)>
+        &worker_summary_generator,
+    std::initializer_list<double> expected_quantiles,
+    double quantiles_matcher_epsilon) {
+  // Simulate streams on each worker running independently
+  double total_weight = 0;
+  std::vector<std::vector<SummaryEntry>> worker_summaries;
+  for (int32 i = 0; i < num_workers; ++i) {
+    Stream stream(eps / 2, max_elements);
+    worker_summary_generator(i, max_elements / num_workers, &total_weight,
+                             &stream);
+    worker_summaries.push_back(stream.GetFinalSummary().GetEntryList());
+  }
+
+  // In the accumulation phase, we aggregate the summaries from each worker
+  // and build an overall summary while maintaining error bounds by ensuring we
+  // don't increase the error by more than eps / 2.
+  Stream reducer_stream(eps, max_elements);
+  for (const auto &summary : worker_summaries) {
+    reducer_stream.PushSummary(summary);
+  }
+  reducer_stream.Finalize();
+
+  // Ensure we didn't lose track of any elements and are
+  // within approximation error bound.
+  EXPECT_LE(reducer_stream.ApproximationError(), eps);
+  EXPECT_NEAR(reducer_stream.GetFinalSummary().TotalWeight(), total_weight,
+              total_weight);
+
+  // Verify expected quantiles.
+  int i = 0;
+  auto actuals =
+      reducer_stream.GenerateQuantiles(expected_quantiles.size() - 1);
+  for (auto expected_quantile : expected_quantiles) {
+    EXPECT_NEAR(actuals[i], expected_quantile, quantiles_matcher_epsilon);
+    ++i;
+  }
+}
+
+TEST(WeightedQuantilesStreamTest, FixedUniformDistributed) {
+  const int32 num_workers = 10;
+  const double eps = 0.01;
+  const int64 max_elements = num_workers * (1 << 16);
+  TestDistributedStreams(
+      num_workers, eps, max_elements, GenerateFixedUniformSummary,
+      {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2);
+}
+
+TEST(WeightedQuantilesStreamTest, FixedNonUniformDistributed) {
+  const int32 num_workers = 10;
+  const double eps = 0.01;
+  const int64 max_elements = num_workers * (1 << 16);
+  TestDistributedStreams(num_workers, eps, max_elements,
+                         GenerateFixedNonUniformSummary,
+                         {0, std::sqrt(0.1), std::sqrt(0.2), std::sqrt(0.3),
+                          std::sqrt(0.4), std::sqrt(0.5), std::sqrt(0.6),
+                          std::sqrt(0.7), std::sqrt(0.8), std::sqrt(0.9), 1.0},
+                         1e-2);
+}
+
+TEST(WeightedQuantilesStreamTest, RandUniformFixedWeightsDistributed) {
+  const int32 num_workers = 10;
+  const double eps = 0.01;
+  const int64 max_elements = num_workers * (1 << 16);
+  TestDistributedStreams(
+      num_workers, eps, max_elements, GenerateRandUniformFixedWeightsSummary,
+      {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2);
+}
+
+TEST(WeightedQuantilesStreamTest, RandUniformRandWeightsDistributed) {
+  const int32 num_workers = 10;
+  const double eps = 0.01;
+  const int64 max_elements = num_workers * (1 << 16);
+  TestDistributedStreams(
+      num_workers, eps, max_elements, GenerateRandUniformRandWeightsSummary,
+      {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, 1e-2);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h
new file mode 100644
index 0000000000..31d7fe25a4
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h
@@ -0,0 +1,344 @@
+// Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
+#define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
+
+#include <cstring>
+#include <vector>
+
+#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_buffer.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace quantiles {
+
+// Summary holding a sorted block of entries with upper bound guarantees
+// over the approximation error.
+template <typename ValueType, typename WeightType,
+          typename CompareFn = std::less<ValueType>>
+class WeightedQuantilesSummary {
+ public:
+  using Buffer = WeightedQuantilesBuffer<ValueType, WeightType, CompareFn>;
+  using BufferEntry = typename Buffer::BufferEntry;
+
+  struct SummaryEntry {
+    SummaryEntry(const ValueType& v, const WeightType& w, const WeightType& min,
+                 const WeightType& max) {
+      // Explicitly initialize all of memory (including padding from memory
+      // alignment) to allow the struct to be msan-resistant "plain old data".
+      //
+      // POD = http://en.cppreference.com/w/cpp/concept/PODType
+      memset(this, 0, sizeof(*this));
+
+      value = v;
+      weight = w;
+      min_rank = min;
+      max_rank = max;
+    }
+
+    SummaryEntry() {
+      memset(this, 0, sizeof(*this));
+
+      value = ValueType();
+      weight = 0;
+      min_rank = 0;
+      max_rank = 0;
+    }
+
+    bool operator==(const SummaryEntry& other) const {
+      return value == other.value && weight == other.weight &&
+             min_rank == other.min_rank && max_rank == other.max_rank;
+    }
+    friend std::ostream& operator<<(std::ostream& strm,
+                                    const SummaryEntry& entry) {
+      return strm << "{" << entry.value << ", " << entry.weight << ", "
+                  << entry.min_rank << ", " << entry.max_rank << "}";
+    }
+
+    // Max rank estimate for previous smaller value.
+    WeightType PrevMaxRank() const { return max_rank - weight; }
+
+    // Min rank estimate for next larger value.
+    WeightType NextMinRank() const { return min_rank + weight; }
+
+    ValueType value;
+    WeightType weight;
+    WeightType min_rank;
+    WeightType max_rank;
+  };
+
+  // Re-construct summary from the specified buffer.
+  void BuildFromBufferEntries(const std::vector<BufferEntry>& buffer_entries) {
+    entries_.clear();
+    entries_.reserve(buffer_entries.size());
+    WeightType cumulative_weight = 0;
+    for (const auto& entry : buffer_entries) {
+      WeightType current_weight = entry.weight;
+      entries_.emplace_back(entry.value, entry.weight, cumulative_weight,
+                            cumulative_weight + current_weight);
+      cumulative_weight += current_weight;
+    }
+  }
+
+  // Re-construct summary from the specified summary entries.
+  void BuildFromSummaryEntries(
+      const std::vector<SummaryEntry>& summary_entries) {
+    entries_.clear();
+    entries_.reserve(summary_entries.size());
+    entries_.insert(entries_.begin(), summary_entries.begin(),
+                    summary_entries.end());
+  }
+
+  // Merges two summaries through an algorithm that's derived from MergeSort
+  // for summary entries while guaranteeing that the max approximation error
+  // of the final merged summary is no greater than the approximation errors
+  // of each individual summary.
+  // For example consider summaries where each entry is of the form
+  // (element, weight, min rank, max rank):
+  // summary entries 1: (1, 3, 0, 3), (4, 2, 3, 5)
+  // summary entries 2: (3, 1, 0, 1), (4, 1, 1, 2)
+  // merged: (1, 3, 0, 3), (3, 1, 3, 4), (4, 3, 4, 7).
+  void Merge(const WeightedQuantilesSummary& other_summary) {
+    // Make sure we have something to merge.
+    const auto& other_entries = other_summary.entries_;
+    if (other_entries.empty()) {
+      return;
+    }
+    if (entries_.empty()) {
+      BuildFromSummaryEntries(other_summary.entries_);
+      return;
+    }
+
+    // Move current entries to make room for a new buffer.
+    std::vector<SummaryEntry> base_entries(std::move(entries_));
+    entries_.clear();
+    entries_.reserve(base_entries.size() + other_entries.size());
+
+    // Merge entries maintaining ranks. The idea is to stack values
+    // in order which we can do in linear time as the two summaries are
+    // already sorted. We keep track of the next lower rank from either
+    // summary and update it as we pop elements from the summaries.
+    // We handle the special case when the next two elements from either
+    // summary are equal, in which case we just merge the two elements
+    // and simultaneously update both ranks.
+    auto it1 = base_entries.cbegin();
+    auto it2 = other_entries.cbegin();
+    WeightType next_min_rank1 = 0;
+    WeightType next_min_rank2 = 0;
+    while (it1 != base_entries.cend() && it2 != other_entries.cend()) {
+      if (kCompFn(it1->value, it2->value)) {  // value1 < value2
+        // Take value1 and use the last added value2 to compute
+        // the min rank and the current value2 to compute the max rank.
+        entries_.emplace_back(it1->value, it1->weight,
+                              it1->min_rank + next_min_rank2,
+                              it1->max_rank + it2->PrevMaxRank());
+        // Update next min rank 1.
+        next_min_rank1 = it1->NextMinRank();
+        ++it1;
+      } else if (kCompFn(it2->value, it1->value)) {  // value1 > value2
+        // Take value2 and use the last added value1 to compute
+        // the min rank and the current value1 to compute the max rank.
+        entries_.emplace_back(it2->value, it2->weight,
+                              it2->min_rank + next_min_rank1,
+                              it2->max_rank + it1->PrevMaxRank());
+        // Update next min rank 2.
+        next_min_rank2 = it2->NextMinRank();
+        ++it2;
+      } else {  // value1 == value2
+        // Straight additive merger of the two entries into one.
+        entries_.emplace_back(it1->value, it1->weight + it2->weight,
+                              it1->min_rank + it2->min_rank,
+                              it1->max_rank + it2->max_rank);
+        // Update next min ranks for both.
+        next_min_rank1 = it1->NextMinRank();
+        next_min_rank2 = it2->NextMinRank();
+        ++it1;
+        ++it2;
+      }
+    }
+
+    // Fill in any residual.
+    while (it1 != base_entries.cend()) {
+      entries_.emplace_back(it1->value, it1->weight,
+                            it1->min_rank + next_min_rank2,
+                            it1->max_rank + other_entries.back().max_rank);
+      ++it1;
+    }
+    while (it2 != other_entries.cend()) {
+      entries_.emplace_back(it2->value, it2->weight,
+                            it2->min_rank + next_min_rank1,
+                            it2->max_rank + base_entries.back().max_rank);
+      ++it2;
+    }
+  }
+
+  // Compresses buffer into desired size. The size specification is
+  // considered a hint as we always keep the first and last elements and
+  // maintain strict approximation error bounds.
+  // The approximation error delta is taken as the max of either the requested
+  // min error or 1 / size_hint.
+  // After compression, the approximation error is guaranteed to increase
+  // by no more than that error delta.
+  // This algorithm is linear in the original size of the summary and is
+  // designed to be cache-friendly.
+  void Compress(int64 size_hint, double min_eps = 0) {
+    // No-op if we're already within the size requirement.
+    size_hint = std::max(size_hint, int64{2});
+    if (entries_.size() <= size_hint) {
+      return;
+    }
+
+    // First compute the max error bound delta resulting from this compression.
+    double eps_delta = TotalWeight() * std::max(1.0 / size_hint, min_eps);
+
+    // Compress elements ensuring approximation bounds and elements diversity
+    // are both maintained.
+    int64 add_accumulator = 0, add_step = entries_.size();
+    auto write_it = entries_.begin() + 1, last_it = write_it;
+    for (auto read_it = entries_.begin(); read_it + 1 != entries_.end();) {
+      auto next_it = read_it + 1;
+      while (next_it != entries_.end() && add_accumulator < add_step &&
+             next_it->PrevMaxRank() - read_it->NextMinRank() <= eps_delta) {
+        add_accumulator += size_hint;
+        ++next_it;
+      }
+      if (read_it == next_it - 1) {
+        ++read_it;
+      } else {
+        read_it = next_it - 1;
+      }
+      (*write_it++) = (*read_it);
+      last_it = read_it;
+      add_accumulator -= add_step;
+    }
+    // Write last element and resize.
+    if (last_it + 1 != entries_.end()) {
+      (*write_it++) = entries_.back();
+    }
+    entries_.resize(write_it - entries_.begin());
+  }
+
+  // To construct the boundaries we first run a soft compress over a copy
+  // of the summary and retrieve the values.
+  // The resulting boundaries are guaranteed to both contain at least
+  // num_boundaries unique elements and maintain approximation bounds.
+  std::vector<ValueType> GenerateBoundaries(int64 num_boundaries) const {
+    std::vector<ValueType> output;
+    if (entries_.empty()) {
+      return output;
+    }
+
+    // Generate soft compressed summary.
+    WeightedQuantilesSummary<ValueType, WeightType, CompareFn>
+        compressed_summary;
+    compressed_summary.BuildFromSummaryEntries(entries_);
+    // Set an epsilon for compression that's at most 1.0 / num_boundaries
+    // more than epsilon of original our summary since the compression operation
+    // adds ~1.0/num_boundaries to final approximation error.
+    float compression_eps = ApproximationError() + (1.0 / num_boundaries);
+    compressed_summary.Compress(num_boundaries, compression_eps);
+
+    // Return boundaries.
+    output.reserve(compressed_summary.entries_.size());
+    for (const auto& entry : compressed_summary.entries_) {
+      output.push_back(entry.value);
+    }
+    return output;
+  }
+
+  // To construct the desired n-quantiles we repetitively query n ranks from the
+  // original summary. The following algorithm is an efficient cache-friendly
+  // O(n) implementation of that idea which avoids the cost of the repetitive
+  // full rank queries O(nlogn).
+  std::vector<ValueType> GenerateQuantiles(int64 num_quantiles) const {
+    std::vector<ValueType> output;
+    if (entries_.empty()) {
+      return output;
+    }
+    num_quantiles = std::max(num_quantiles, int64{2});
+    output.reserve(num_quantiles + 1);
+
+    // Make successive rank queries to get boundaries.
+    // We always keep the first (min) and last (max) entries.
+    for (size_t cur_idx = 0, rank = 0; rank <= num_quantiles; ++rank) {
+      // This step boils down to finding the next element sub-range defined by
+      // r = (rmax[i + 1] + rmin[i + 1]) / 2 where the desired rank d < r.
+      WeightType d_2 = 2 * (rank * entries_.back().max_rank / num_quantiles);
+      size_t next_idx = cur_idx + 1;
+      while (next_idx < entries_.size() &&
+             d_2 >= entries_[next_idx].min_rank + entries_[next_idx].max_rank) {
+        ++next_idx;
+      }
+      cur_idx = next_idx - 1;
+
+      // Determine insertion order.
+      if (next_idx == entries_.size() ||
+          d_2 < entries_[cur_idx].NextMinRank() +
+                    entries_[next_idx].PrevMaxRank()) {
+        output.push_back(entries_[cur_idx].value);
+      } else {
+        output.push_back(entries_[next_idx].value);
+      }
+    }
+    return output;
+  }
+
+  // Calculates current approximation error which should always be <= eps.
+  double ApproximationError() const {
+    if (entries_.empty()) {
+      return 0;
+    }
+
+    WeightType max_gap = 0;
+    for (auto it = entries_.cbegin() + 1; it < entries_.end(); ++it) {
+      max_gap = std::max(max_gap,
+                         std::max(it->max_rank - it->min_rank - it->weight,
+                                  it->PrevMaxRank() - (it - 1)->NextMinRank()));
+    }
+    return static_cast<double>(max_gap) / TotalWeight();
+  }
+
+  ValueType MinValue() const {
+    return !entries_.empty() ? entries_.front().value
+                             : std::numeric_limits<ValueType>::max();
+  }
+  ValueType MaxValue() const {
+    return !entries_.empty() ? entries_.back().value
+                             : std::numeric_limits<ValueType>::max();
+  }
+  WeightType TotalWeight() const {
+    return !entries_.empty() ? entries_.back().max_rank : 0;
+  }
+  int64 Size() const { return entries_.size(); }
+  void Clear() { entries_.clear(); }
+  const std::vector<SummaryEntry>& GetEntryList() const { return entries_; }
+
+ private:
+  // Comparison function.
+  static constexpr decltype(CompareFn()) kCompFn = CompareFn();
+
+  // Summary entries.
+  std::vector<SummaryEntry> entries_;
+};
+
+template <typename ValueType, typename WeightType, typename CompareFn>
+constexpr decltype(CompareFn())
+    WeightedQuantilesSummary<ValueType, WeightType, CompareFn>::kCompFn;
+
+}  // namespace quantiles
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary_test.cc b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary_test.cc
new file mode 100644
index 0000000000..ccd1215cf4
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary_test.cc
@@ -0,0 +1,223 @@
+// Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+using Buffer = boosted_trees::quantiles::WeightedQuantilesBuffer<float, float>;
+using BufferEntry =
+    boosted_trees::quantiles::WeightedQuantilesBuffer<float,
+                                                      float>::BufferEntry;
+using Summary =
+    boosted_trees::quantiles::WeightedQuantilesSummary<float, float>;
+using SummaryEntry =
+    boosted_trees::quantiles::WeightedQuantilesSummary<float,
+                                                       float>::SummaryEntry;
+
+class WeightedQuantilesSummaryTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Constructs a buffer of 10 weighted unique entries.
+    buffer1_.reset(new Buffer(10, 1000));
+    buffer1_->PushEntry(5, 9);
+    buffer1_->PushEntry(2, 3);
+    buffer1_->PushEntry(-1, 7);
+    buffer1_->PushEntry(-7, 1);
+    buffer1_->PushEntry(3, 2);
+    buffer1_->PushEntry(-2, 3);
+    buffer1_->PushEntry(21, 8);
+    buffer1_->PushEntry(-13, 4);
+    buffer1_->PushEntry(8, 2);
+    buffer1_->PushEntry(-5, 6);
+
+    // Constructs a buffer of 7 weighted unique entries.
+    buffer2_.reset(new Buffer(7, 1000));
+    buffer2_->PushEntry(9, 2);
+    buffer2_->PushEntry(-7, 3);
+    buffer2_->PushEntry(2, 1);
+    buffer2_->PushEntry(4, 13);
+    buffer2_->PushEntry(0, 5);
+    buffer2_->PushEntry(-5, 3);
+    buffer2_->PushEntry(11, 3);
+  }
+
+  void TearDown() override { buffer1_->Clear(); }
+
+  std::unique_ptr<Buffer> buffer1_;
+  std::unique_ptr<Buffer> buffer2_;
+  const double buffer1_min_value_ = -13;
+  const double buffer1_max_value_ = 21;
+  const double buffer1_total_weight_ = 45;
+  const double buffer2_min_value_ = -7;
+  const double buffer2_max_value_ = 11;
+  const double buffer2_total_weight_ = 30;
+};
+
+TEST_F(WeightedQuantilesSummaryTest, BuildFromBuffer) {
+  Summary summary;
+  summary.BuildFromBufferEntries(buffer1_->GenerateEntryList());
+
+  // We expect no approximation error because no compress operation occurred.
+  EXPECT_EQ(summary.ApproximationError(), 0);
+
+  // Check first and last elements in the summary.
+  const auto& entries = summary.GetEntryList();
+  // First element's rmin should be zero.
+  EXPECT_EQ(summary.MinValue(), buffer1_min_value_);
+  EXPECT_EQ(entries.front(), SummaryEntry(-13, 4, 0, 4));
+  // Last element's rmax should be cumulative weight.
+  EXPECT_EQ(summary.MaxValue(), buffer1_max_value_);
+  EXPECT_EQ(entries.back(), SummaryEntry(21, 8, 37, 45));
+  // Check total weight.
+  EXPECT_EQ(summary.TotalWeight(), buffer1_total_weight_);
+}
+
+TEST_F(WeightedQuantilesSummaryTest, CompressSeparately) {
+  const auto entry_list = buffer1_->GenerateEntryList();
+  for (int new_size = 9; new_size >= 2; --new_size) {
+    Summary summary;
+    summary.BuildFromBufferEntries(entry_list);
+    summary.Compress(new_size);
+
+    // Expect a max approximation error of 1 / n
+    // ie. eps0 + 1/n but eps0 = 0.
+    EXPECT_TRUE(summary.Size() >= new_size && summary.Size() <= new_size + 2);
+    EXPECT_LE(summary.ApproximationError(), 1.0 / new_size);
+
+    // Min/Max elements and total weight should not change.
+    EXPECT_EQ(summary.MinValue(), buffer1_min_value_);
+    EXPECT_EQ(summary.MaxValue(), buffer1_max_value_);
+    EXPECT_EQ(summary.TotalWeight(), buffer1_total_weight_);
+  }
+}
+
+TEST_F(WeightedQuantilesSummaryTest, CompressSequentially) {
+  Summary summary;
+  summary.BuildFromBufferEntries(buffer1_->GenerateEntryList());
+  for (int new_size = 9; new_size >= 2; new_size -= 2) {
+    double prev_eps = summary.ApproximationError();
+    summary.Compress(new_size);
+
+    // Expect a max approximation error of prev_eps + 1 / n.
+    EXPECT_TRUE(summary.Size() >= new_size && summary.Size() <= new_size + 2);
+    EXPECT_LE(summary.ApproximationError(), prev_eps + 1.0 / new_size);
+
+    // Min/Max elements and total weight should not change.
+    EXPECT_EQ(summary.MinValue(), buffer1_min_value_);
+    EXPECT_EQ(summary.MaxValue(), buffer1_max_value_);
+    EXPECT_EQ(summary.TotalWeight(), buffer1_total_weight_);
+  }
+}
+
+TEST_F(WeightedQuantilesSummaryTest, CompressRandomized) {
+  // Check multiple size compressions and ensure approximation bounds
+  // are always respected.
+  int prev_size = 1;
+  int size = 2;
+  float max_value = 1 << 20;
+  while (size < (1 << 16)) {
+    // Create buffer of size from uniform random elements.
+    Buffer buffer(size, size << 4);
+    random::PhiloxRandom philox(13);
+    random::SimplePhilox rand(&philox);
+    for (int i = 0; i < size; ++i) {
+      buffer.PushEntry(rand.RandFloat() * max_value,
+                       rand.RandFloat() * max_value);
+    }
+
+    // Create summary and compress.
+    Summary summary;
+    summary.BuildFromBufferEntries(buffer.GenerateEntryList());
+    int new_size = std::max(rand.Uniform(size), 2u);
+    summary.Compress(new_size);
+
+    // Ensure approximation error is acceptable.
+    EXPECT_TRUE(summary.Size() >= new_size && summary.Size() <= new_size + 2);
+    EXPECT_LE(summary.ApproximationError(), 1.0 / new_size);
+
+    // Update size to next fib number.
+    size_t last_size = size;
+    size += prev_size;
+    prev_size = last_size;
+  }
+}
+
+TEST_F(WeightedQuantilesSummaryTest, MergeSymmetry) {
+  // Create two separate summaries and merge.
+  const auto list_1 = buffer1_->GenerateEntryList();
+  const auto list_2 = buffer2_->GenerateEntryList();
+  Summary summary1;
+  summary1.BuildFromBufferEntries(list_1);
+  Summary summary2;
+  summary2.BuildFromBufferEntries(list_2);
+
+  // Merge summary 2 into 1 and verify.
+  summary1.Merge(summary2);
+  EXPECT_EQ(summary1.ApproximationError(), 0.0);
+  EXPECT_EQ(summary1.MinValue(),
+            std::min(buffer1_min_value_, buffer2_min_value_));
+  EXPECT_EQ(summary1.MaxValue(),
+            std::max(buffer1_max_value_, buffer2_max_value_));
+  EXPECT_EQ(summary1.TotalWeight(),
+            buffer1_total_weight_ + buffer2_total_weight_);
+  EXPECT_EQ(summary1.Size(), 14);  // 14 unique values.
+
+  // Merge summary 1 into 2 and verify same result.
+  summary1.BuildFromBufferEntries(list_1);
+  summary2.Merge(summary1);
+  EXPECT_EQ(summary2.ApproximationError(), 0.0);
+  EXPECT_EQ(summary2.MinValue(),
+            std::min(buffer1_min_value_, buffer2_min_value_));
+  EXPECT_EQ(summary2.MaxValue(),
+            std::max(buffer1_max_value_, buffer2_max_value_));
+  EXPECT_EQ(summary2.TotalWeight(),
+            buffer1_total_weight_ + buffer2_total_weight_);
+  EXPECT_EQ(summary2.Size(), 14);  // 14 unique values.
+}
+
+TEST_F(WeightedQuantilesSummaryTest, CompressThenMerge) {
+  // Create two separate summaries and merge.
+  Summary summary1;
+  summary1.BuildFromBufferEntries(buffer1_->GenerateEntryList());
+  Summary summary2;
+  summary2.BuildFromBufferEntries(buffer2_->GenerateEntryList());
+
+  // Compress summaries.
+  summary1.Compress(5);  // max error is 1/5.
+  const auto eps1 = 1.0 / 5;
+  EXPECT_LE(summary1.ApproximationError(), eps1);
+  summary2.Compress(3);  // max error is 1/3.
+  const auto eps2 = 1.0 / 3;
+  EXPECT_LE(summary2.ApproximationError(), eps2);
+
+  // Merge guarantees an approximation error of max(eps1, eps2).
+  // Merge summary 2 into 1 and verify.
+  summary1.Merge(summary2);
+  EXPECT_LE(summary1.ApproximationError(), std::max(eps1, eps2));
+  EXPECT_EQ(summary1.MinValue(),
+            std::min(buffer1_min_value_, buffer2_min_value_));
+  EXPECT_EQ(summary1.MaxValue(),
+            std::max(buffer1_max_value_, buffer2_max_value_));
+  EXPECT_EQ(summary1.TotalWeight(),
+            buffer1_total_weight_ + buffer2_total_weight_);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h
index c8c60c5524..18727c0db3 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/kernels/bounds_check.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_BOUNDS_CHECK_H_
-#define TENSORFLOW_UTIL_BOUNDS_CHECK_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
+#define TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
 
 #include <type_traits>
 
@@ -51,4 +51,4 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
 }  // namespace internal
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_BOUNDS_CHECK_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
index 73fdd5d28e..a2327a7272 100644
--- a/tensorflow/core/kernels/broadcast_to_op.h
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
-#define TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BROADCAST_TO_OP_H_
+#define TENSORFLOW_CORE_KERNELS_BROADCAST_TO_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -239,4 +239,4 @@ struct BroadcastTo {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_BROADCAST_TO_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BROADCAST_TO_OP_H_
diff --git a/tensorflow/core/kernels/bucketize_op.h b/tensorflow/core/kernels/bucketize_op.h
index c8e461beb9..32be475f86 100644
--- a/tensorflow/core/kernels/bucketize_op.h
+++ b/tensorflow/core/kernels/bucketize_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_BUCKETIZE_OP_H_
-#define TENSORFLOW_BUCKETIZE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BUCKETIZE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_BUCKETIZE_OP_H_
 
 #include <vector>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -38,4 +38,4 @@ struct BucketizeFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_BUCKETIZE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BUCKETIZE_OP_H_
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index 0478c93280..3a72567655 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -98,7 +98,13 @@ void CastOpBase::Compute(OpKernelContext* ctx) {
     ctx->set_output(0, inp);
   } else {
     Tensor in;
-    in.UnsafeCopyFromInternal(inp, src_dtype_, inp.shape());
+    if (external_src_dtype_ != src_dtype_) {
+      // If the type is a quantized type we need to do an UnsafeCopyFromInternal
+      // since the src_dtype_ is different from external_src_type_.
+      in.UnsafeCopyFromInternal(inp, src_dtype_, inp.shape());
+    } else {
+      in = inp;
+    }
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in.shape(), &out));
     out->set_dtype(dst_dtype_);
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 527ab528c9..84c44f6b5e 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CAST_OP_H_
-#define TENSORFLOW_KERNELS_CAST_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CAST_OP_H_
+#define TENSORFLOW_CORE_KERNELS_CAST_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/bfloat16.h"
@@ -323,4 +323,4 @@ struct functor_traits<scalar_cast_op<float, ::tensorflow::bfloat16>> {
 }  // namespace internal
 }  // namespace Eigen
 
-#endif  // TENSORFLOW_KERNELS_CAST_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CAST_OP_H_
diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h
index 90bfce1419..4de14bc339 100644
--- a/tensorflow/core/kernels/colorspace_op.h
+++ b/tensorflow/core/kernels/colorspace_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_COLORSPACE_OP_H_
-#define TENSORFLOW_KERNELS_COLORSPACE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_COLORSPACE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_COLORSPACE_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -91,4 +91,4 @@ struct HSVToRGB {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_COLORSPACE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_COLORSPACE_OP_H_
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index 16784c4770..8b53ecf121 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONCAT_LIB_H_
-#define TENSORFLOW_KERNELS_CONCAT_LIB_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONCAT_LIB_H_
+#define TENSORFLOW_CORE_KERNELS_CONCAT_LIB_H_
 
 #include <vector>
 
@@ -66,4 +66,4 @@ void ConcatSYCL(
 #endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONCAT_LIB_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONCAT_LIB_H_
diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h
index 720b506537..29f3a427fe 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.h
+++ b/tensorflow/core/kernels/concat_lib_cpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_
+#define TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_
+
 #define EIGEN_USE_THREADS
 
 #include <vector>
@@ -162,3 +165,5 @@ void ConcatSYCLImpl(
 }
 #endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CONCAT_LIB_CPU_H_
diff --git a/tensorflow/core/kernels/conditional_accumulator.h b/tensorflow/core/kernels/conditional_accumulator.h
index 414891b142..a7836896c7 100644
--- a/tensorflow/core/kernels/conditional_accumulator.h
+++ b/tensorflow/core/kernels/conditional_accumulator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_H_
-#define TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_H_
+#define TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_H_
 
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/typed_conditional_accumulator_base.h"
@@ -133,4 +133,4 @@ class ConditionalAccumulator
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_H_
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index c7c7c98369..b7b7482a00 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_
-#define TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_
+#define TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_
 
 #include <deque>
 
@@ -199,4 +199,4 @@ class TypeConverter<Eigen::half, U> {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_H_
diff --git a/tensorflow/core/kernels/conditional_accumulator_base_op.h b/tensorflow/core/kernels/conditional_accumulator_base_op.h
index 33c2d596c8..012a0dcc12 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base_op.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_
-#define TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_
 
 #define EIGEN_USE_THREADS
 
@@ -234,4 +234,4 @@ class ConditionalAccumulatorBaseTakeGradientOp
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONDITIONAL_ACCUMULATOR_BASE_OP_H_
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index a888422d49..426c404f43 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -140,44 +140,6 @@ REGISTER_SYCL_KERNEL(SYCL, bool);
 #undef REGISTER_SYCL_KERNEL
 #endif
 
-HostConstantOp::HostConstantOp(OpKernelConstruction* ctx)
-    : OpKernel(ctx), tensor_(ctx->output_type(0)) {
-  const TensorProto* proto = nullptr;
-  AllocatorAttributes alloc_attr;
-  alloc_attr.set_on_host(true);
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
-  OP_REQUIRES_OK(
-      ctx, ctx->device()->MakeTensorFromProto(*proto, alloc_attr, &tensor_));
-  OP_REQUIRES(
-      ctx, ctx->output_type(0) == tensor_.dtype(),
-      errors::InvalidArgument("Type mismatch between value (",
-                              DataTypeString(tensor_.dtype()), ") and dtype (",
-                              DataTypeString(ctx->output_type(0)), ")"));
-}
-
-void HostConstantOp::Compute(OpKernelContext* ctx) {
-  ctx->set_output(0, tensor_);
-}
-
-#if GOOGLE_CUDA
-// A special GPU kernel for int32.
-// TODO(b/25387198): Also enable int32 in device memory. This kernel
-// registration requires all int32 inputs and outputs to be in host memory.
-REGISTER_KERNEL_BUILDER(Name("Const")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("output")
-                            .TypeConstraint<int32>("dtype"),
-                        HostConstantOp);
-#endif
-
-#ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("Const")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("output")
-                            .TypeConstraint<int32>("dtype"),
-                        HostConstantOp);
-#endif  // TENSORFLOW_USE_SYCL
-
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
@@ -297,8 +259,9 @@ class ZerosLikeOp : public OpKernel {
           errors::InvalidArgument("ZerosLike non-scalar Tensor with "
                                   "dtype=DT_VARIANT is not supported."));
       const Variant& v = input.scalar<Variant>()();
-      Tensor out(ctx->device()->GetAllocator(AllocatorAttributes()), DT_VARIANT,
-                 TensorShape({}));
+      // DT_VARIANT tensors must be allocated on CPU since they wrap C++
+      // objects which can not be efficiently represented in GPU memory.
+      Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({}));
       Variant* out_v = &(out.scalar<Variant>()());
       OP_REQUIRES_OK(ctx, UnaryOpVariant<Device>(
                               ctx, ZEROS_LIKE_VARIANT_UNARY_OP, v, out_v));
diff --git a/tensorflow/core/kernels/constant_op.h b/tensorflow/core/kernels/constant_op.h
index b98153e347..77ba441863 100644
--- a/tensorflow/core/kernels/constant_op.h
+++ b/tensorflow/core/kernels/constant_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONSTANT_OP_H_
-#define TENSORFLOW_KERNELS_CONSTANT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -36,20 +36,6 @@ class ConstantOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ConstantOp);
 };
 
-// HostConstantOp differs from ConstantOp in that its output is always
-// in host memory.
-class HostConstantOp : public OpKernel {
- public:
-  explicit HostConstantOp(OpKernelConstruction* ctx);
-  void Compute(OpKernelContext* ctx) override;
-  bool IsExpensive() override { return false; }
-  ~HostConstantOp() override {}
-
- private:
-  Tensor tensor_;
-  TF_DISALLOW_COPY_AND_ASSIGN(HostConstantOp);
-};
-
 class PlaceholderOp : public OpKernel {
  public:
   explicit PlaceholderOp(OpKernelConstruction* ctx);
@@ -61,4 +47,4 @@ class PlaceholderOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONSTANT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONSTANT_OP_H_
diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h
index 8edbcc9077..c607fcf298 100644
--- a/tensorflow/core/kernels/control_flow_ops.h
+++ b/tensorflow/core/kernels/control_flow_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
-#define TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONTROL_FLOW_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_CONTROL_FLOW_OPS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -115,4 +115,4 @@ class LoopCondOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONTROL_FLOW_OPS_H_
diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
index 6b7544fd4c..de9b69828e 100644
--- a/tensorflow/core/kernels/conv_2d.h
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONV_2D_H_
-#define TENSORFLOW_KERNELS_CONV_2D_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_H_
+#define TENSORFLOW_CORE_KERNELS_CONV_2D_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -298,4 +298,4 @@ template <>
 class ConvAlgorithmMap<Eigen::ThreadPoolDevice> {};
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONV_2D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONV_2D_H_
diff --git a/tensorflow/core/kernels/conv_3d.h b/tensorflow/core/kernels/conv_3d.h
index 083dec63cc..02e3655ad1 100644
--- a/tensorflow/core/kernels/conv_3d.h
+++ b/tensorflow/core/kernels/conv_3d.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Functors for 3d convolution.
 
-#ifndef TENSORFLOW_KERNELS_CONV_3D_H_
-#define TENSORFLOW_KERNELS_CONV_3D_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONV_3D_H_
+#define TENSORFLOW_CORE_KERNELS_CONV_3D_H_
 
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
@@ -45,4 +45,4 @@ struct CuboidConvolution<CPUDevice, T> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONV_3D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CONV_3D_H_
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 5bf709af08..fc0a2f123f 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -63,7 +63,7 @@ Status ConvBackpropExtractAndVerifyDimensionV2(
     return errors::InvalidArgument(
         label, ": Size of out_backprop doesn't match computed: ", "actual = ",
         dim->output_size, ", computed = ", out_size,
-        "spatial_dim: ", spatial_dim, " input: ", dim->input_size,
+        " spatial_dim: ", spatial_dim, " input: ", dim->input_size,
         " filter: ", dim->filter_size, " output: ", dim->output_size,
         " stride: ", dim->stride, " dilation: ", dim->dilation);
   }
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 09a3b78776..adf4601b43 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CONV_OPS_H_
-#define TENSORFLOW_KERNELS_CONV_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_CONV_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -68,4 +68,4 @@ struct Im2ColBufferResource : public ResourceBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CONV_OPS_H
+#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_H_
diff --git a/tensorflow/core/kernels/cross_op.h b/tensorflow/core/kernels/cross_op.h
index ca6beba52b..45bc46a921 100644
--- a/tensorflow/core/kernels/cross_op.h
+++ b/tensorflow/core/kernels/cross_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_COLORSPACE_OP_H_
-#define TENSORFLOW_KERNELS_COLORSPACE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CROSS_OP_H_
+#define TENSORFLOW_CORE_KERNELS_CROSS_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -51,4 +51,4 @@ struct Cross {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_COLORSPACE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CROSS_OP_H_
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index b2e8ee23a9..2c30d036df 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================
 */
 
+#ifndef TENSORFLOW_CORE_KERNELS_CUDA_SOLVERS_H_
+#define TENSORFLOW_CORE_KERNELS_CUDA_SOLVERS_H_
+
 // This header declares the class CudaSolver, which contains wrappers of linear
 // algebra solvers in the cuBlas and cuSolverDN libraries for use in TensorFlow
 // kernels.
@@ -433,3 +436,5 @@ inline DeviceLapackInfo CudaSolver::GetDeviceLapackInfo(
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_CUDA_SOLVERS_H_
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.h b/tensorflow/core/kernels/cudnn_pooling_gpu.h
index 280d697fc2..738e928246 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.h
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Helper functions to run 3d pooling on GPU using CuDNN.
 
-#ifndef TENSORFLOW_KERNELS_CUDNN_POOLING_GPU_H_
-#define TENSORFLOW_KERNELS_CUDNN_POOLING_GPU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CUDNN_POOLING_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_CUDNN_POOLING_GPU_H_
 
 #include <array>
 
@@ -67,4 +67,4 @@ class DnnPooling3dGradOp {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CUDNN_POOLING_GPU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CUDNN_POOLING_GPU_H_
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index d6a2403816..313d976e2c 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -24,8 +24,7 @@ REGISTER5(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8, uint16, int16,
           int32, int64);
 REGISTER6(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
           bfloat16, complex64, complex128);
-REGISTER5(BinaryOp, CPU, "UnsafeDiv", functor::unsafe_div, float, double, int16,
-          int32, int64);
+REGISTER2(BinaryOp, CPU, "DivNoNan", functor::div_no_nan, float, double);
 
 #if GOOGLE_CUDA
 REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
@@ -34,6 +33,7 @@ REGISTER4(BinaryOp, GPU, "TruncateDiv", functor::div, uint8, uint16, int16,
           int64);
 REGISTER5(BinaryOp, GPU, "RealDiv", functor::div, float, Eigen::half, double,
           complex64, complex128);
+REGISTER2(BinaryOp, GPU, "DivNoNan", functor::div_no_nan, float, double);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc
index 0b05416274..25ccdcfb00 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc
@@ -21,6 +21,7 @@ namespace tensorflow {
 namespace functor {
 DEFINE_BINARY10(div, Eigen::half, float, double, uint8, uint16, int16, int32,
                 int64, complex64, complex128);
+DEFINE_BINARY2(div_no_nan, float, double);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index 98df0844ea..d6988a562c 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -33,6 +33,11 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
+namespace functor {
+template <typename Device, typename T>
+struct SelectScalarHandler;
+}  // namespace functor
+
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
  public:
@@ -131,16 +136,8 @@ class SelectOp : public OpKernel {
             then->shape().DebugString(), " vs. ",
             else_->shape().DebugString()));
 
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
-                            {"t", "e"}, "output", then->shape(), &output));
-
-    if (output->NumElements() > 0) {
-      functor::SelectScalarFunctor<Device, T> func;
-      TTypes<bool>::ConstScalar cond_scalar = cond->scalar<bool>();
-      func(ctx->eigen_device<Device>(), output->flat<T>(), cond_scalar,
-           then->flat<T>(), else_->flat<T>());
-    }
+    functor::SelectScalarHandler<Device, T> handler;
+    handler(ctx, cond, then, else_);
   }
 
  private:
@@ -209,6 +206,40 @@ struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
 #endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
+struct SelectScalarHandler {
+  void operator()(OpKernelContext* ctx, const Tensor* cond, const Tensor* then,
+                  const Tensor* else_) {
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"t", "e"}, "output", then->shape(), &output));
+
+    if (output->NumElements() > 0) {
+      functor::SelectScalarFunctor<Device, T> func;
+      TTypes<bool>::ConstScalar cond_scalar = cond->scalar<bool>();
+      func(ctx->eigen_device<Device>(), output->flat<T>(), cond_scalar,
+           then->flat<T>(), else_->flat<T>());
+    }
+  }
+};
+
+// Specilization for CPU device. Forward input to output depending on the `cond`
+// value.
+// TODO(sjhwang): Consider specializing for GPUDevice as well by using
+// GPUDevice::memcpyDeviceToHost() to fetch bool value.
+template <typename T>
+struct SelectScalarHandler<CPUDevice, T> {
+  void operator()(OpKernelContext* ctx, const Tensor* cond, const Tensor* then,
+                  const Tensor* else_) {
+    if (cond->scalar<bool>()()) {
+      OP_REQUIRES_OK(ctx, ctx->set_output("output", *then));
+    } else {
+      OP_REQUIRES_OK(ctx, ctx->set_output("output", *else_));
+    }
+  }
+};
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename Device, typename T>
 struct SelectScalarFunctorBase {
   void operator()(const Device& d, typename TTypes<T>::Flat out,
                   TTypes<bool>::ConstScalar cond,
@@ -218,11 +249,6 @@ struct SelectScalarFunctorBase {
   }
 };
 
-// CPU Specializations of Select functors with scalar
-template <typename T>
-struct SelectScalarFunctor<CPUDevice, T>
-    : SelectScalarFunctorBase<CPUDevice, T> {};
-#ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SelectScalarFunctor<SYCLDevice, T>
     : SelectScalarFunctorBase<SYCLDevice, T> {};
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 1014519059..22eb66e979 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CWISE_OPS_H_
-#define TENSORFLOW_KERNELS_CWISE_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_H_
 
 #include <cmath>
 #include <functional>
@@ -154,8 +154,8 @@ struct functor_traits<safe_div_or_mod_op<T, DivOrMod>> {
 };
 
 template <typename T>
-struct unsafe_div_op {
-  EIGEN_EMPTY_STRUCT_CTOR(unsafe_div_op)
+struct div_no_nan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(div_no_nan_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
                                                            const T& b) const {
     if (b != 0) {
@@ -167,7 +167,7 @@ struct unsafe_div_op {
 };
 
 template <typename T>
-struct functor_traits<unsafe_div_op<T>> {
+struct functor_traits<div_no_nan_op<T>> {
   enum {
     Cost = functor_traits<scalar_quotient_op<T>>::Cost + NumTraits<T>::AddCost,
     PacketAccess = false,
@@ -742,7 +742,7 @@ struct safe_div : base<T, Eigen::internal::safe_div_or_mod_op<
 };
 
 template <typename T>
-struct unsafe_div : base<T, Eigen::internal::unsafe_div_op<T>> {};
+struct div_no_nan : base<T, Eigen::internal::div_no_nan_op<T>> {};
 
 template <typename T>
 struct fmod : base<T, Eigen::internal::scalar_fmod_op<T>> {};
@@ -1036,4 +1036,4 @@ struct BatchSelectFunctor {
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CWISE_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OPS_H_
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index e32eccf547..f77d7238af 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
-#define TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_COMMON_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_COMMON_H_
 
 // See docs in ../ops/math_ops.cc.
 
@@ -602,4 +602,4 @@ struct ApproximateEqual<CPUDevice, T> {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OPS_COMMON_H_
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
index 965e42dcce..cfae273bf4 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
@@ -17,8 +17,8 @@ limitations under the License.
 #error This file must only be included when building with Cuda support
 #endif
 
-#ifndef TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
-#define TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
 
 #define EIGEN_USE_GPU
 
@@ -188,4 +188,4 @@ struct ApproximateEqual<GPUDevice, T> {
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
index e81b840a50..15e5de0f72 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
@@ -17,8 +17,8 @@ limitations under the License.
 #error This file must only be included when building with Cuda support
 #endif
 
-#ifndef TENSORFLOW_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
-#define TENSORFLOW_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
 
 #define EIGEN_USE_GPU
 
@@ -68,4 +68,4 @@ struct SimpleBinaryFunctor<GPUDevice, Functor> {
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OPS_GPU_GRADIENTS_CU_H_
diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h
index 7a6f14babc..53b53cc277 100644
--- a/tensorflow/core/kernels/cwise_ops_gradients.h
+++ b/tensorflow/core/kernels/cwise_ops_gradients.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_CWISE_OPS_GRADIENTS_H_
-#define TENSORFLOW_KERNELS_CWISE_OPS_GRADIENTS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CWISE_OPS_GRADIENTS_H_
+#define TENSORFLOW_CORE_KERNELS_CWISE_OPS_GRADIENTS_H_
 
 #define EIGEN_USE_THREADS
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -208,4 +208,4 @@ struct igamma_grad_a : base<T, Eigen::internal::scalar_igamma_der_a_op<T>> {};
 }  // end namespace functor
 
 }  // end namespace tensorflow
-#endif  // TENSORFLOW_KERNELS_CWISE_OPS_GRADIENTS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CWISE_OPS_GRADIENTS_H_
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 607a694dba..7716043055 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -232,6 +232,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "parse_example_dataset_op",
+    srcs = ["parse_example_dataset_op.cc"],
+    deps = [
+        ":parallel_map_iterator",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+    ],
+)
+
 tf_kernel_library(
     name = "parallel_map_dataset_op",
     srcs = ["parallel_map_dataset_op.cc"],
@@ -668,6 +678,7 @@ tf_kernel_library(
         ":padded_batch_dataset_op",
         ":parallel_interleave_dataset_op",
         ":parallel_map_dataset_op",
+        ":parse_example_dataset_op",
         ":prefetch_dataset_op",
         ":random_dataset_op",
         ":range_dataset_op",
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 82da385405..abdf6ee4e8 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -172,31 +172,17 @@ class BorrowedArgsCallFrame : public CallFrameBase {
 
 }  // namespace
 
-Status CapturedFunction::MaybeInstantiate(
-    IteratorContext* ctx, FunctionLibraryRuntime::Handle* out_handle) {
-  mutex_lock l(mu_);
+Status CapturedFunction::GetHandle(IteratorContext* ctx,
+                                   FunctionLibraryRuntime::Handle* out_handle) {
+  tf_shared_lock l(mu_);
   if (lib_ == nullptr) {
-    // The context's runtime will be used for all subsequent calls.
-    lib_ = ctx->lib();
-    DCHECK(f_handle_ == kInvalidHandle);
-    FunctionLibraryRuntime::InstantiateOptions inst_opts;
-    inst_opts.overlay_lib = ctx->function_library().get();
-    inst_opts.state_handle = std::to_string(random::New64());
-    TF_RETURN_IF_ERROR(lib_->Instantiate(func_.name(), AttrSlice(&func_.attr()),
-                                         inst_opts, &f_handle_));
-    const FunctionBody* fbody = lib_->GetFunctionBody(f_handle_);
-    if (fbody == nullptr) {
-      return errors::Internal("Failed to instantiate function body.");
-    }
-    ret_types_ = fbody->ret_types;
-  } else {
-    // TODO(mrry): Consider moving this under a shared lock, as it is
-    // the common case.
-    if (ctx->lib() != lib_) {
-      return errors::Internal(
-          "Captured function was called with a different "
-          "FunctionLibraryRuntime*, which is not permitted.");
-    }
+    return errors::Internal("Captured function \"", func_.name(),
+                            "\" was called before it was instantiated.");
+  }
+  if (ctx->lib() != lib_) {
+    return errors::Internal("Captured function \"", func_.name(),
+                            "\" was called with a different "
+                            "FunctionLibraryRuntime*, which is not permitted.");
   }
   *out_handle = f_handle_;
   return Status::OK();
@@ -205,7 +191,7 @@ Status CapturedFunction::MaybeInstantiate(
 Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
                              std::vector<Tensor>* rets) {
   FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &handle));
+  TF_RETURN_IF_ERROR(GetHandle(ctx, &handle));
 
   FunctionLibraryRuntime::Options f_opts;
   f_opts.step_id = CapturedFunction::generate_step_id();
@@ -242,7 +228,7 @@ Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
                                              const std::vector<Tensor>& args,
                                              std::vector<Tensor>* rets) {
   FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &handle));
+  TF_RETURN_IF_ERROR(GetHandle(ctx, &handle));
 
   FunctionLibraryRuntime::Options f_opts;
   f_opts.step_id = CapturedFunction::generate_step_id();
@@ -277,9 +263,30 @@ Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
 }
 
 Status CapturedFunction::Instantiate(IteratorContext* ctx) {
-  FunctionLibraryRuntime::Handle unused_handle;
-  TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &unused_handle));
   mutex_lock l(mu_);
+  if (lib_ == nullptr) {
+    // The context's runtime will be used for all subsequent calls.
+    lib_ = ctx->lib();
+    DCHECK(f_handle_ == kInvalidHandle);
+    FunctionLibraryRuntime::InstantiateOptions inst_opts;
+    inst_opts.overlay_lib = ctx->function_library().get();
+    inst_opts.state_handle = std::to_string(random::New64());
+    inst_opts.create_kernels_eagerly = true;
+    Status s = (lib_->Instantiate(func_.name(), AttrSlice(&func_.attr()),
+                                  inst_opts, &f_handle_));
+    TF_RETURN_IF_ERROR(s);
+    const FunctionBody* fbody = lib_->GetFunctionBody(f_handle_);
+    if (fbody == nullptr) {
+      return errors::Internal("Failed to instantiate function body.");
+    }
+    ret_types_ = fbody->ret_types;
+  } else {
+    if (ctx->lib() != lib_) {
+      return errors::Internal(
+          "Captured function was called with a different "
+          "FunctionLibraryRuntime*, which is not permitted.");
+    }
+  }
   if (captured_runner_ == nullptr) {
     captured_runner_ = *ctx->runner();
   }
@@ -343,7 +350,7 @@ void CapturedFunction::RunAsync(IteratorContext* ctx,
   // be deleted before `done` is called. Take care not to capture `ctx` in any
   // code that may execute asynchronously in this function.
   FunctionLibraryRuntime::Handle handle;
-  Status s = MaybeInstantiate(ctx, &handle);
+  Status s = GetHandle(ctx, &handle);
   if (!s.ok()) {
     done(s);
     return;
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
index e9ad3e381d..c95f2b1c01 100644
--- a/tensorflow/core/kernels/data/captured_function.h
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -116,8 +116,8 @@ class CapturedFunction {
   CapturedFunction(const NameAttrList& func,
                    std::vector<Tensor> captured_inputs);
 
-  Status MaybeInstantiate(IteratorContext* ctx,
-                          FunctionLibraryRuntime::Handle* out_handle);
+  Status GetHandle(IteratorContext* ctx,
+                   FunctionLibraryRuntime::Handle* out_handle);
 
   mutex mu_;
   const NameAttrList func_;
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index a80e102ccf..f5c7d336a6 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -149,7 +149,9 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<FilterDatasetBase>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(ctx);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index 07bcb9d414..21e627a8e8 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -129,7 +129,9 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(ctx);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index 3c3d78b724..ccee690d7e 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
@@ -80,20 +81,20 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
       }
     }
 
+    Status Initialize(IteratorContext* ctx) override {
+      TF_RETURN_IF_ERROR(dataset()->init_func_->Instantiate(ctx));
+      TF_RETURN_IF_ERROR(dataset()->next_func_->Instantiate(ctx));
+      TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(ctx));
+      TF_RETURN_IF_ERROR(
+          dataset()->init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
+      return Status::OK();
+    }
+
     Status GetNextInternal(IteratorContext* ctx,
                            std::vector<Tensor>* out_tensors,
                            bool* end_of_sequence) override {
       mutex_lock l(mu_);
 
-      if (!initialized_) {
-        TF_RETURN_IF_ERROR(
-            dataset()->init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
-        // Explicitly instantiate the finalize function here so that
-        // we can invoke it in the destructor.
-        TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(ctx));
-        initialized_ = true;
-      }
-
       if (finalized_) {
         *end_of_sequence = true;
         return Status::OK();
@@ -121,7 +122,6 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
 
    private:
     mutex mu_;
-    bool initialized_ GUARDED_BY(mu_) = false;
     bool finalized_ GUARDED_BY(mu_) = false;
     std::vector<Tensor> state_ GUARDED_BY(mu_);
   };
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.h b/tensorflow/core/kernels/data/generator_dataset_op.h
index 3f84fa9c2e..8407543136 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.h
+++ b/tensorflow/core/kernels/data/generator_dataset_op.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_DATA_GENERATOR_DATASET_OP_H_
 
 #include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/kernels/data/captured_function.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
index be4132a064..4a388645f2 100644
--- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
@@ -190,7 +190,14 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(dataset()->captured_init_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(
+            dataset()->captured_finalize_func_->Instantiate(ctx));
+        return Status::OK();
       }
 
       Status GetNextInternal(IteratorContext* ctx,
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 288695f3cd..f993a68934 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -205,7 +205,13 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(
+            dataset()->captured_window_size_func_->Instantiate(ctx));
+        return Status::OK();
       }
 
       Status GetNextInternal(IteratorContext* ctx,
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index 58b79d6026..6bba667759 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -1,4 +1,3 @@
-
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -156,7 +155,9 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
             args_list_(params.dataset->cycle_length_) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(ctx);
       }
 
       void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 61a6c06135..4e9b280968 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -104,9 +104,8 @@ class IteratorResource : public ResourceBase {
                  bool* end_of_sequence) {
     std::shared_ptr<IteratorBase> captured_iterator(iterator_);
     if (captured_iterator) {
-      if (lib_ != nullptr) {
-        ctx->set_lib(lib_);
-      }
+      CHECK_NOTNULL(lib_);
+      ctx->set_lib(lib_);
       return captured_iterator->GetNext(ctx, out_tensors, end_of_sequence);
     } else {
       return errors::FailedPrecondition(
@@ -162,8 +161,10 @@ class IteratorResource : public ResourceBase {
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
     std::unique_ptr<IteratorBase> iterator;
+    IteratorContext iter_ctx(ctx);
+    iter_ctx.set_lib(lib);
     TF_RETURN_IF_ERROR(
-        dataset->MakeIterator(IteratorContext(ctx), "Iterator", &iterator));
+        dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
     TF_RETURN_IF_ERROR(set_iterator(std::move(iterator)));
     std::shared_ptr<IteratorBase> captured_iterator(iterator_);
 
@@ -198,6 +199,8 @@ class IteratorResource : public ResourceBase {
     return lib_def_;
   }
 
+  FunctionLibraryRuntime* function_library_runtime() { return lib_; }
+
   // Transfers ownership of iterator to this. This method is thread-safe.
   Status set_iterator(std::unique_ptr<IteratorBase> iterator) {
     if (iterator) {
@@ -258,7 +261,7 @@ class VariantTensorDataReader : public IteratorStateReader {
   }
 
   bool Contains(StringPiece key) override {
-    return map_.find(key.ToString()) != map_.end();
+    return map_.find(string(key)) != map_.end();
   }
 
  private:
@@ -279,18 +282,18 @@ class VariantTensorDataReader : public IteratorStateReader {
 
   template <typename T>
   Status ReadScalarInternal(StringPiece key, T* val) {
-    if (map_.find(key.ToString()) == map_.end()) {
+    if (map_.find(string(key)) == map_.end()) {
       return errors::NotFound(key);
     }
-    *val = data_->tensors(map_[key.ToString()]).scalar<T>()();
+    *val = data_->tensors(map_[string(key)]).scalar<T>()();
     return Status::OK();
   }
 
   Status ReadTensorInternal(StringPiece key, Tensor* val) {
-    if (map_.find(key.ToString()) == map_.end()) {
+    if (map_.find(string(key)) == map_.end()) {
       return errors::NotFound(key);
     }
-    *val = data_->tensors(map_[key.ToString()]);
+    *val = data_->tensors(map_[string(key)]);
     return Status::OK();
   }
 
@@ -339,7 +342,7 @@ class VariantTensorDataWriter : public IteratorStateWriter {
     // Write key to the metadata proto. This gets written to `data_`
     // when `Flush()` is called. We do this lazily to avoid multiple
     // serialization calls.
-    metadata_proto_.add_keys(key.ToString());
+    metadata_proto_.add_keys(string(key));
 
     // Update tensors.
     *(data_->add_tensors()) = val;
@@ -612,8 +615,10 @@ void MakeIteratorOp::Compute(OpKernelContext* ctx) {
   core::ScopedUnref unref(iterator_resource);
 
   std::unique_ptr<IteratorBase> iterator;
+  IteratorContext iter_ctx(ctx);
+  iter_ctx.set_lib(iterator_resource->function_library_runtime());
   OP_REQUIRES_OK(
-      ctx, dataset->MakeIterator(IteratorContext(ctx), "Iterator", &iterator));
+      ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
   OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(std::move(iterator)));
 }
 
@@ -837,8 +842,10 @@ class OneShotIteratorOp : public AsyncOpKernel {
     DatasetBase* dataset;
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(return_values[0], &dataset));
     std::unique_ptr<IteratorBase> iter;
+    IteratorContext iter_ctx(ctx);
+    iter_ctx.set_lib(lib);
     TF_RETURN_IF_ERROR(
-        dataset->MakeIterator(IteratorContext(ctx), "Iterator", &iter));
+        dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iter));
     TF_RETURN_IF_ERROR((*iterator)->set_iterator(std::move(iter)));
 
     (*iterator)->Ref();
@@ -922,39 +929,33 @@ void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
       std::move(done)));
 }
 
-class IteratorGetNextSyncOp : public OpKernel {
- public:
-  explicit IteratorGetNextSyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    IteratorResource* iterator;
-    OP_REQUIRES_OK(ctx,
-                   LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
-    core::ScopedUnref unref_iterator(iterator);
-
-    std::vector<Tensor> components;
-    bool end_of_sequence = false;
-
-    IteratorContext::Params params;
-    params.env = ctx->env();
-    params.runner = *(ctx->runner());
-    params.function_library = iterator->function_library();
-    DeviceBase* device = ctx->function_library()->device();
-    params.allocator_getter = [device](AllocatorAttributes attrs) {
-      return device->GetAllocator(attrs);
-    };
-    IteratorContext iter_ctx(std::move(params));
+void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) {
+  IteratorResource* iterator;
+  OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
+  core::ScopedUnref unref_iterator(iterator);
+
+  std::vector<Tensor> components;
+  bool end_of_sequence = false;
+
+  IteratorContext::Params params;
+  params.env = ctx->env();
+  params.runner = *(ctx->runner());
+  params.function_library = iterator->function_library();
+  DeviceBase* device = ctx->function_library()->device();
+  params.allocator_getter = [device](AllocatorAttributes attrs) {
+    return device->GetAllocator(attrs);
+  };
+  IteratorContext iter_ctx(std::move(params));
 
-    OP_REQUIRES_OK(ctx,
-                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
-    OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
+  OP_REQUIRES_OK(ctx,
+                 iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
+  OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
 
-    for (int i = 0; i < components.size(); ++i) {
-      // TODO(mrry): Check that the shapes match the shape attrs.
-      ctx->set_output(i, components[i]);
-    }
+  for (int i = 0; i < components.size(); ++i) {
+    // TODO(mrry): Check that the shapes match the shape attrs.
+    ctx->set_output(i, components[i]);
   }
-};
+}
 
 class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
  public:
diff --git a/tensorflow/core/kernels/data/iterator_ops.h b/tensorflow/core/kernels/data/iterator_ops.h
index e426febcce..723564286c 100644
--- a/tensorflow/core/kernels/data/iterator_ops.h
+++ b/tensorflow/core/kernels/data/iterator_ops.h
@@ -116,6 +116,13 @@ class IteratorGetNextOp : public AsyncOpKernel {
   BackgroundWorker background_worker_;
 };
 
+class IteratorGetNextSyncOp : public OpKernel {
+ public:
+  explicit IteratorGetNextSyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
 class IteratorToStringHandleOp : public OpKernel {
  public:
   explicit IteratorToStringHandleOp(OpKernelConstruction* ctx)
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 0e17011b05..c4df7f2756 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -204,7 +204,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(ctx);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index 294fb1c49a..26ae26a7fd 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -127,7 +127,9 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(ctx);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc
index d66716ef66..607d0ca028 100644
--- a/tensorflow/core/kernels/data/map_defun_op.cc
+++ b/tensorflow/core/kernels/data/map_defun_op.cc
@@ -74,7 +74,11 @@ class MapDefunOp : public AsyncOpKernel {
       arg_shapes->at(i).RemoveDim(0);  // Remove the first batch dimension
       OP_REQUIRES_ASYNC(
           ctx, batch_size == ctx->input(i).dim_size(0),
-          errors::InvalidArgument("All inputs must have the same dimension 0."),
+          errors::InvalidArgument(
+              "All inputs must have the same dimension 0. Input ", i,
+              " has leading dimension ", ctx->input(i).dim_size(0),
+              ", while all previous inputs have leading dimension ", batch_size,
+              "."),
           done);
     }
 
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index b097598cd9..9b14078407 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -97,19 +97,27 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       TF_RETURN_IF_ERROR(
           db.AddInputDataset(&serialization_ctx, input_, &input_node));
       string output_node = input_node->name();
+
       GraphDef graph_def;
       TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
       VLOG(3) << "Before optimization: " << graph_def.DebugString();
+
       TF_RETURN_IF_ERROR(ApplyOptimizations(ctx, &graph_def, &output_node));
       VLOG(3) << "After optimization: " << graph_def.DebugString();
-      flib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(),
-                                                    graph_def.library()));
+
+      // Instantiate the optimized input pipeline by running the optimized graph
+      // using the optimized function library.
+      TF_RETURN_IF_ERROR(
+          ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_));
+      TF_RETURN_IF_ERROR(flib_def_->AddLibrary(graph_def.library()));
+
       Graph graph(OpRegistry::Global());
       TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
       std::vector<Tensor> outputs;
       GraphRunner graph_runner(ctx->function_library()->device());
-      TF_RETURN_IF_ERROR(graph_runner.Run(&graph, ctx->function_library(), {},
-                                          {output_node}, &outputs));
+
+      TF_RETURN_IF_ERROR(
+          graph_runner.Run(&graph, lib_, {}, {output_node}, &outputs));
       TF_RETURN_IF_ERROR(
           GetDatasetFromVariantTensor(outputs[0], &optimized_input_));
       optimized_input_->Ref();
@@ -142,8 +150,14 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->optimized_input_->MakeIterator(ctx, prefix(),
-                                                         &input_impl_);
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        params.stats_aggregator_getter = ctx->stats_aggregator_getter();
+        params.lib = dataset()->lib_;
+        params.allocator_getter = ctx->allocator_getter();
+        return dataset()->optimized_input_->MakeIterator(
+            IteratorContext(params), prefix(), &input_impl_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -153,8 +167,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
         params.env = ctx->env();
         params.runner = *(ctx->runner());
         params.stats_aggregator_getter = ctx->stats_aggregator_getter();
-        params.lib = ctx->lib();
-        params.function_library = dataset()->flib_def_;
+        params.lib = dataset()->lib_;
         params.allocator_getter = ctx->allocator_getter();
         IteratorContext iter_ctx(params);
         return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence);
@@ -236,7 +249,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
     }
 
     DatasetBase* optimized_input_;
-    std::shared_ptr<FunctionLibraryDefinition> flib_def_;
+    FunctionLibraryRuntime* lib_ = nullptr;
+    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ = nullptr;
+    std::unique_ptr<FunctionLibraryDefinition> flib_def_ = nullptr;
     const DatasetBase* input_;
     const std::vector<string> optimizations_;
     const DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index e492a8215a..bf86361a71 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -251,7 +251,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       }
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(ctx);
       }
 
       // It is implemented so that it matches the deterministic interleave
@@ -279,7 +281,12 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
             if (!current_worker->outputs.empty()) {
               // We have an element!
               next_index_ = index;
-              if (i == 0) {
+              const bool element_acquired_sloppily =
+                  dataset()->sloppy_ && i > 1;
+              if (!element_acquired_sloppily) {
+                // If the element was acquired in the regular (non-sloppy)
+                // order, then advance the current block and cycle pointers to
+                // the next element in the regular order.
                 block_count_++;
                 if (block_count_ == dataset()->block_length_) {
                   next_index_ = (index + 1) % interleave_indices_.size();
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index a407abfce4..e03a4e353b 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -88,6 +88,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
+      auto init_func = [this](IteratorContext* ctx) {
+        return captured_func_->Instantiate(ctx);
+      };
+
       auto map_func = [this](IteratorContext* ctx,
                              std::vector<Tensor> input_element,
                              std::vector<Tensor>* result, StatusCallback done) {
@@ -97,7 +101,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
-          std::move(map_func), num_parallel_calls_);
+          std::move(init_func), std::move(map_func), num_parallel_calls_);
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 4d32b719a4..61f8139b9e 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -26,10 +26,12 @@ class ParallelMapIterator : public DatasetBaseIterator {
  public:
   explicit ParallelMapIterator(
       const typename DatasetBaseIterator::BaseParams& params,
-      const DatasetBase* input_dataset, ParallelMapIteratorFunction map_func,
-      int32 num_parallel_calls)
+      const DatasetBase* input_dataset,
+      std::function<Status(IteratorContext*)> init_func,
+      ParallelMapIteratorFunction map_func, int32 num_parallel_calls)
       : DatasetBaseIterator(params),
         input_dataset_(input_dataset),
+        init_func_(std::move(init_func)),
         map_func_(std::move(map_func)),
         num_parallel_calls_(num_parallel_calls) {}
 
@@ -50,7 +52,12 @@ class ParallelMapIterator : public DatasetBaseIterator {
   }
 
   Status Initialize(IteratorContext* ctx) override {
-    return input_dataset_->MakeIterator(ctx, prefix(), &input_impl_);
+    TF_RETURN_IF_ERROR(
+        input_dataset_->MakeIterator(ctx, prefix(), &input_impl_));
+    if (init_func_) {
+      TF_RETURN_IF_ERROR(init_func_(ctx));
+    }
+    return Status::OK();
   }
 
   Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
@@ -285,6 +292,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   }
 
   const DatasetBase* const input_dataset_;  // Not owned.
+  const std::function<Status(IteratorContext*)> init_func_;
   const ParallelMapIteratorFunction map_func_;
   const int32 num_parallel_calls_;
   // Used for coordination between the main thread and the runner thread.
@@ -311,8 +319,18 @@ std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBaseIterator::BaseParams& params,
     const DatasetBase* input_dataset, ParallelMapIteratorFunction map_func,
     int32 num_parallel_calls) {
-  return std::unique_ptr<IteratorBase>(new ParallelMapIterator(
-      params, input_dataset, std::move(map_func), num_parallel_calls));
+  return NewParallelMapIterator(params, input_dataset, nullptr,
+                                std::move(map_func), num_parallel_calls);
+}
+
+std::unique_ptr<IteratorBase> NewParallelMapIterator(
+    const DatasetBaseIterator::BaseParams& params,
+    const DatasetBase* input_dataset,
+    std::function<Status(IteratorContext*)> init_func,
+    ParallelMapIteratorFunction map_func, int32 num_parallel_calls) {
+  return std::unique_ptr<IteratorBase>(
+      new ParallelMapIterator(params, input_dataset, std::move(init_func),
+                              std::move(map_func), num_parallel_calls));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.h b/tensorflow/core/kernels/data/parallel_map_iterator.h
index 2ce36c3869..7e6cc586f3 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.h
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.h
@@ -33,7 +33,15 @@ using ParallelMapIteratorFunction =
                        std::vector<Tensor>*, StatusCallback)>;
 
 // Returns a new iterator that applies `map_func` to the elements of
-// `input_dataset` using the given degree of parallelism.
+// `input_dataset` using the given degree of parallelism. `init_func` (if
+// specified) will be executed when the iterator is initialized (see
+// `IteratorBase::Initialize()`) and enables the user to specify error checking
+// logic that can fail early.
+std::unique_ptr<IteratorBase> NewParallelMapIterator(
+    const DatasetBaseIterator::BaseParams& params,
+    const DatasetBase* input_dataset,
+    std::function<Status(IteratorContext*)> init_func,
+    ParallelMapIteratorFunction map_func, int32 num_parallel_calls);
 std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBaseIterator::BaseParams& params,
     const DatasetBase* input_dataset, ParallelMapIteratorFunction map_func,
diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
new file mode 100644
index 0000000000..9057800d94
--- /dev/null
+++ b/tensorflow/core/kernels/data/parse_example_dataset_op.cc
@@ -0,0 +1,372 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <deque>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/kernels/data/parallel_map_iterator.h"
+#include "tensorflow/core/util/example_proto_fast_parsing.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ParseExampleDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("sparse_keys", &sparse_keys_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dense_keys", &dense_keys_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("sparse_types", &sparse_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Tdense", &dense_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dense_shapes", &dense_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    for (int i = 0; i < dense_shapes_.size(); ++i) {
+      bool shape_ok = true;
+      if (dense_shapes_[i].dims() == -1) {
+        shape_ok = false;
+      } else {
+        for (int d = 1; d < dense_shapes_[i].dims(); ++d) {
+          if (dense_shapes_[i].dim_size(d) == -1) {
+            shape_ok = false;
+          }
+        }
+      }
+      OP_REQUIRES(ctx, shape_ok,
+                  errors::InvalidArgument(
+                      "dense_shapes[", i,
+                      "] has unknown rank or unknown inner dimensions: ",
+                      dense_shapes_[i].DebugString()));
+      TensorShape dense_shape;
+      if (dense_shapes_[i].dims() > 0 && dense_shapes_[i].dim_size(0) == -1) {
+        variable_length_.push_back(true);
+        for (int d = 1; d < dense_shapes_[i].dims(); ++d) {
+          dense_shape.AddDim(dense_shapes_[i].dim_size(d));
+        }
+      } else {
+        variable_length_.push_back(false);
+        dense_shapes_[i].AsTensorShape(&dense_shape);
+      }
+      elements_per_stride_.push_back(dense_shape.num_elements());
+    }
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 num_parallel_calls;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
+                                            &num_parallel_calls));
+    OP_REQUIRES(ctx, num_parallel_calls > 0,
+                errors::InvalidArgument(
+                    "num_parallel_calls must be greater than zero."));
+
+    OpInputList dense_default_tensors;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("dense_defaults", &dense_default_tensors));
+
+    OP_REQUIRES(ctx, dense_default_tensors.size() == dense_keys_.size(),
+                errors::InvalidArgument(
+                    "Expected len(dense_defaults) == len(dense_keys) but got: ",
+                    dense_default_tensors.size(), " vs. ", dense_keys_.size()));
+
+    std::vector<Tensor> dense_defaults;
+    dense_defaults.reserve(dense_default_tensors.size());
+    for (const Tensor& dense_default_t : dense_default_tensors) {
+      dense_defaults.push_back(dense_default_t);
+    }
+
+    for (int d = 0; d < dense_keys_.size(); ++d) {
+      const Tensor& def_value = dense_defaults[d];
+      if (variable_length_[d]) {
+        OP_REQUIRES(ctx, def_value.NumElements() == 1,
+                    errors::InvalidArgument(
+                        "dense_shape[", d, "] is a variable length shape: ",
+                        dense_shapes_[d].DebugString(),
+                        ", therefore "
+                        "def_value[",
+                        d,
+                        "] must contain a single element ("
+                        "the padding element).  But its shape is: ",
+                        def_value.shape().DebugString()));
+      } else if (def_value.NumElements() > 0) {
+        OP_REQUIRES(ctx, dense_shapes_[d].IsCompatibleWith(def_value.shape()),
+                    errors::InvalidArgument(
+                        "def_value[", d,
+                        "].shape() == ", def_value.shape().DebugString(),
+                        " is not compatible with dense_shapes_[", d,
+                        "] == ", dense_shapes_[d].DebugString()));
+      }
+      OP_REQUIRES(ctx, def_value.dtype() == dense_types_[d],
+                  errors::InvalidArgument(
+                      "dense_defaults[", d, "].dtype() == ",
+                      DataTypeString(def_value.dtype()), " != dense_types_[", d,
+                      "] == ", DataTypeString(dense_types_[d])));
+    }
+
+    example::FastParseExampleConfig config;
+    std::map<string, int> key_to_output_index;
+    for (int d = 0; d < dense_keys_.size(); ++d) {
+      config.dense.push_back({dense_keys_[d], dense_types_[d], dense_shapes_[d],
+                              dense_default_tensors[d], variable_length_[d],
+                              elements_per_stride_[d]});
+      auto result = key_to_output_index.insert({dense_keys_[d], 0});
+      OP_REQUIRES(ctx, result.second,
+                  errors::InvalidArgument("Duplicate key not allowed: ",
+                                          dense_keys_[d]));
+    }
+    for (int d = 0; d < sparse_keys_.size(); ++d) {
+      config.sparse.push_back({sparse_keys_[d], sparse_types_[d]});
+      auto result = key_to_output_index.insert({sparse_keys_[d], 0});
+      OP_REQUIRES(ctx, result.second,
+                  errors::InvalidArgument("Duplicate key not allowed: ",
+                                          sparse_keys_[d]));
+    }
+    int i = 0;
+    for (auto it = key_to_output_index.begin(); it != key_to_output_index.end();
+         it++) {
+      it->second = i++;
+    }
+
+    *output = new Dataset(ctx, input, std::move(dense_defaults),
+                          std::move(sparse_keys_), std::move(dense_keys_),
+                          std::move(key_to_output_index), std::move(config),
+                          num_parallel_calls, sparse_types_, dense_types_,
+                          dense_shapes_, output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            std::vector<Tensor> dense_defaults, std::vector<string> sparse_keys,
+            std::vector<string> dense_keys,
+            std::map<string, int> key_to_output_index,
+            example::FastParseExampleConfig config, int32 num_parallel_calls,
+            const DataTypeVector& sparse_types,
+            const DataTypeVector& dense_types,
+            const std::vector<PartialTensorShape>& dense_shapes,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          dense_defaults_(std::move(dense_defaults)),
+          sparse_keys_(std::move(sparse_keys)),
+          dense_keys_(std::move(dense_keys)),
+          key_to_output_index_(std::move(key_to_output_index)),
+          config_(std::move(config)),
+          num_parallel_calls_(num_parallel_calls),
+          sparse_types_(sparse_types),
+          dense_types_(dense_types),
+          dense_shapes_(dense_shapes),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      auto map_fn = [this](IteratorContext* ctx,
+                           std::vector<Tensor> input_element,
+                           std::vector<Tensor>* result, StatusCallback done) {
+        (*ctx->runner())([this, ctx, input_element, result, done]() {
+          thread::ThreadPool* device_threadpool =
+              ctx->lib()->device()->tensorflow_cpu_worker_threads()->workers;
+          std::vector<string> slice_vec;
+          for (Tensor t : input_element) {
+            auto serialized_t = t.flat<string>();
+            gtl::ArraySlice<string> slice(serialized_t.data(),
+                                          serialized_t.size());
+            for (auto it = slice.begin(); it != slice.end(); it++)
+              slice_vec.push_back(*it);
+          }
+          example::FastParseExampleConfig config = config_;
+          // local copy of config_ for modification.
+          auto stats_aggregator = ctx->stats_aggregator();
+          if (stats_aggregator) {
+            config.collect_feature_stats = true;
+          }
+          example::Result example_result;
+          Status s = FastParseExample(config, slice_vec, {}, device_threadpool,
+                                      &example_result);
+          if (s.ok()) {
+            (*result).resize(key_to_output_index_.size());
+            for (int d = 0; d < dense_keys_.size(); ++d) {
+              int output_index = key_to_output_index_.at(dense_keys_[d]);
+              CHECK(example_result.dense_values[d].dtype() ==
+                    output_dtypes()[output_index])
+                  << "Got wrong type for FastParseExample return value " << d
+                  << " (expected "
+                  << DataTypeString(output_dtypes()[output_index]) << ", got "
+                  << DataTypeString(example_result.dense_values[d].dtype())
+                  << ").";
+              CHECK(output_shapes()[output_index].IsCompatibleWith(
+                  example_result.dense_values[d].shape()))
+                  << "Got wrong shape for FastParseExample return value " << d
+                  << " (expected "
+                  << output_shapes()[output_index].DebugString() << ", got "
+                  << example_result.dense_values[d].shape().DebugString()
+                  << ").";
+              (*result)[output_index] = example_result.dense_values[d];
+            }
+            for (int d = 0; d < sparse_keys_.size(); ++d) {
+              Tensor serialized_sparse = Tensor(DT_VARIANT, TensorShape({3}));
+              auto serialized_sparse_t = serialized_sparse.vec<Variant>();
+              serialized_sparse_t(0) = example_result.sparse_indices[d];
+              serialized_sparse_t(1) = example_result.sparse_values[d];
+              serialized_sparse_t(2) = example_result.sparse_shapes[d];
+              int output_index = key_to_output_index_.at(sparse_keys_[d]);
+              CHECK(serialized_sparse.dtype() == output_dtypes()[output_index])
+                  << "Got wrong type for FastParseExample return value " << d
+                  << " (expected "
+                  << DataTypeString(output_dtypes()[output_index]) << ", got "
+                  << DataTypeString(serialized_sparse.dtype()) << ").";
+              CHECK(output_shapes()[output_index].IsCompatibleWith(
+                  serialized_sparse.shape()))
+                  << "Got wrong shape for FastParseExample return value " << d
+                  << " (expected "
+                  << output_shapes()[output_index].DebugString() << ", got "
+                  << serialized_sparse.shape().DebugString() << ").";
+              (*result)[output_index] = serialized_sparse;
+            }
+            // TODO(b/111553342): User provided tags instead of fixed tag.
+            if (stats_aggregator) {
+              stats_aggregator->IncrementCounter(
+                  "examples_count", "trainer",
+                  example_result.feature_stats.size());
+              for (example::PerExampleFeatureStats feature_stats :
+                   example_result.feature_stats) {
+                stats_aggregator->AddToHistogram(
+                    strings::StrCat("record_stats", ":features"),
+                    {static_cast<double>(feature_stats.features_count)});
+                stats_aggregator->IncrementCounter(
+                    "features_count", "trainer", feature_stats.features_count);
+                stats_aggregator->IncrementCounter(
+                    "feature_values_count", "trainer",
+                    feature_stats.feature_values_count);
+                stats_aggregator->AddToHistogram(
+                    strings::StrCat("record_stats", ":feature-values"),
+                    {static_cast<double>(feature_stats.feature_values_count)});
+              }
+            }
+          }
+          done(s);
+        });
+      };
+
+      return NewParallelMapIterator(
+          {this, strings::StrCat(prefix, "::ParseExample")}, input_,
+          std::move(map_fn), num_parallel_calls_);
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "ParseExampleDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+
+      Node* num_parallle_calls_node;
+      std::vector<Node*> dense_defaults_nodes;
+      dense_defaults_nodes.reserve(dense_defaults_.size());
+
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(num_parallel_calls_, &num_parallle_calls_node));
+
+      for (const Tensor& dense_default : dense_defaults_) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(dense_default, &node));
+        dense_defaults_nodes.emplace_back(node);
+      }
+
+      AttrValue sparse_keys_attr;
+      AttrValue dense_keys_attr;
+      AttrValue sparse_types_attr;
+      AttrValue dense_attr;
+      AttrValue dense_shapes_attr;
+
+      b->BuildAttrValue(sparse_keys_, &sparse_keys_attr);
+      b->BuildAttrValue(dense_keys_, &dense_keys_attr);
+      b->BuildAttrValue(sparse_types_, &sparse_types_attr);
+      b->BuildAttrValue(dense_types_, &dense_attr);
+      b->BuildAttrValue(dense_shapes_, &dense_shapes_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(this,
+                                       {
+                                           {0, input_graph_node},
+                                           {1, num_parallle_calls_node},
+                                       },
+                                       {{2, dense_defaults_nodes}},
+                                       {{"sparse_keys", sparse_keys_attr},
+                                        {"dense_keys", dense_keys_attr},
+                                        {"sparse_types", sparse_types_attr},
+                                        {"Tdense", dense_attr},
+                                        {"dense_shapes", dense_shapes_attr}},
+                                       output));
+      return Status::OK();
+    }
+
+   private:
+    const DatasetBase* const input_;
+    const std::vector<Tensor> dense_defaults_;
+    const std::vector<string> sparse_keys_;
+    const std::vector<string> dense_keys_;
+    const std::map<string, int> key_to_output_index_;
+    const example::FastParseExampleConfig config_;
+    const int64 num_parallel_calls_;
+    const DataTypeVector sparse_types_;
+    const DataTypeVector dense_types_;
+    const std::vector<PartialTensorShape> dense_shapes_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  std::vector<string> sparse_keys_;
+  std::vector<string> dense_keys_;
+  DataTypeVector sparse_types_;
+  DataTypeVector dense_types_;
+  std::vector<PartialTensorShape> dense_shapes_;
+  std::vector<bool> variable_length_;
+  std::vector<std::size_t> elements_per_stride_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParseExampleDataset").Device(DEVICE_CPU),
+                        ParseExampleDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index 5e9ace3486..299949b99f 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -172,32 +172,39 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
     class ForeverIterator : public DatasetIterator<Dataset> {
      public:
       explicit ForeverIterator(const Params& params)
-          : DatasetIterator<Dataset>(params), input_impl_(nullptr) {}
+          : DatasetIterator<Dataset>(params),
+            input_impl_(nullptr),
+            first_call_(true) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        mutex_lock l(mu_);
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
         do {
-          bool first_call = false;
           if (!input_impl_) {
-            first_call = true;
             TF_RETURN_IF_ERROR(
                 dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
           }
-          TF_RETURN_IF_ERROR(
-              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
-          if (!*end_of_sequence) {
+          Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+          if (first_call_ && *end_of_sequence) {
+            // If the first call to GetNext() fails because the end
+            // of sequence has been reached, we terminate the
+            // iteration immediately. (Otherwise, this iterator
+            // would loop infinitely and never produce a value.)
+            input_impl_.reset();
             return Status::OK();
+          }
+          first_call_ = false;
+          if (!*end_of_sequence) {
+            return s;
           } else {
             input_impl_.reset();
-            if (first_call) {
-              // If the first call to GetNext() fails because the end
-              // of sequence has been reached, we terminate the
-              // iteration immediately. (Otherwise, this iterator
-              // would loop infinitely and never produce a value.)
-              return Status::OK();
-            }
+            first_call_ = true;
           }
         } while (true);
       }
@@ -205,7 +212,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-        if (input_impl_)
+        if (!first_call_)
           TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         else
           TF_RETURN_IF_ERROR(
@@ -218,10 +225,12 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
         mutex_lock l(mu_);
         if (reader->Contains(full_name("uninitialized"))) {
           input_impl_.reset();
+          first_call_ = true;
         } else {
           TF_RETURN_IF_ERROR(
               dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
           TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+          first_call_ = false;
         }
         return Status::OK();
       }
@@ -229,6 +238,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
      private:
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      bool first_call_ GUARDED_BY(mu_);
     };
 
     const int64 count_;
diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc
index e4cb31e2b2..5d3319b19f 100644
--- a/tensorflow/core/kernels/data/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/scan_dataset_op.cc
@@ -153,7 +153,9 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
             state_(params.dataset->initial_state_) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(ctx);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
diff --git a/tensorflow/core/kernels/data_format_ops.h b/tensorflow/core/kernels/data_format_ops.h
index 1ca144cb40..bc416fa78b 100644
--- a/tensorflow/core/kernels/data_format_ops.h
+++ b/tensorflow/core/kernels/data_format_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
-#define TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_FORMAT_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_FORMAT_OPS_H_
 // Functor definition for data format dim mapping ops, must be compilable
 // by nvcc.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -83,4 +83,4 @@ struct DataFormatVecPermute {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_FORMAT_OPS_H_
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 53a23b1306..f7c68e8d47 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_DEBUG_OP_H_
-#define TENSORFLOW_KERNELS_DEBUG_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
@@ -389,4 +389,4 @@ class DebugNumericSummaryOp : public BaseDebugOp {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_DEBUG_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DEBUG_OPS_H_
diff --git a/tensorflow/core/kernels/dense_update_functor.h b/tensorflow/core/kernels/dense_update_functor.h
index 240c13261e..61b5731250 100644
--- a/tensorflow/core/kernels/dense_update_functor.h
+++ b/tensorflow/core/kernels/dense_update_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
-#define TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DENSE_UPDATE_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_DENSE_UPDATE_FUNCTOR_H_
 
 #define EIGEN_USE_THREADS
 
@@ -105,4 +105,4 @@ Status VariantCopyFn<GPUDevice>(OpKernelContext* context, const Tensor& from,
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DENSE_UPDATE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
index 099696105b..cb0a76dac4 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
@@ -499,4 +499,4 @@ SpatialConvolutionBackwardKernel(
 
 }  // end namespace Eigen
 
-#endif  // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/core/kernels/extract_image_patches_op.h b/tensorflow/core/kernels/extract_image_patches_op.h
index e430a23d20..64b8c0338b 100644
--- a/tensorflow/core/kernels/extract_image_patches_op.h
+++ b/tensorflow/core/kernels/extract_image_patches_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_EXTRACT_IMAGE_PATCHES_OP_H_
-#define TENSORFLOW_KERNELS_EXTRACT_IMAGE_PATCHES_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EXTRACT_IMAGE_PATCHES_OP_H_
+#define TENSORFLOW_CORE_KERNELS_EXTRACT_IMAGE_PATCHES_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -53,4 +53,4 @@ struct ExtractImagePatchesForward {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_EXTRACT_IMAGE_PATCHES_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EXTRACT_IMAGE_PATCHES_OP_H_
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index d51acc38ef..045a96ac1e 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
-#define TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FAKE_QUANT_OPS_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_FAKE_QUANT_OPS_FUNCTOR_H_
 
 #include <tuple>
 
@@ -277,4 +277,4 @@ struct FakeQuantWithMinMaxVarsPerChannelGradientFunctor {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FAKE_QUANT_OPS_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/fill_functor.h b/tensorflow/core/kernels/fill_functor.h
index 4c8b3f01a7..46bffa5173 100644
--- a/tensorflow/core/kernels/fill_functor.h
+++ b/tensorflow/core/kernels/fill_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
-#define TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FILL_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_FILL_FUNCTOR_H_
 
 #define EIGEN_USE_THREADS
 
@@ -89,4 +89,4 @@ struct SetOneFunctor<Eigen::ThreadPoolDevice, string> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_FILL_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FILL_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h
index 2d7a230fc0..55a959f3c3 100644
--- a/tensorflow/core/kernels/fractional_pool_common.h
+++ b/tensorflow/core/kernels/fractional_pool_common.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_KERNELS_FRACTIONAL_POOL_COMMON_H_
-#define TENSORFLOW_KERNELS_FRACTIONAL_POOL_COMMON_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FRACTIONAL_POOL_COMMON_H_
+#define TENSORFLOW_CORE_KERNELS_FRACTIONAL_POOL_COMMON_H_
 
 #include <algorithm>
 #include <vector>
@@ -75,4 +75,4 @@ std::vector<int64> GeneratePoolingSequence(int input_length, int output_length,
                                            bool pseudo_random);
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_FRACTIONAL_POOL_COMMON_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FRACTIONAL_POOL_COMMON_H_
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.h b/tensorflow/core/kernels/fused_batch_norm_op.h
index d6c68df986..c45b6f79e3 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.h
+++ b/tensorflow/core/kernels/fused_batch_norm_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_FUSED_BATCH_NORM_OP_H_
-#define TENSORFLOW_KERNELS_FUSED_BATCH_NORM_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FUSED_BATCH_NORM_OP_H_
+#define TENSORFLOW_CORE_KERNELS_FUSED_BATCH_NORM_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
@@ -128,4 +128,4 @@ struct FusedBatchNormFreezeGrad {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_FUSED_BATCH_NORM_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FUSED_BATCH_NORM_OP_H_
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 2c6e8bf3bc..cd2873bdca 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_GATHER_FUNCTOR_H_
-#define TENSORFLOW_KERNELS_GATHER_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -176,4 +176,4 @@ struct GatherFunctor<GPUDevice, Variant, Index> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_GATHER_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/gather_nd_op.h b/tensorflow/core/kernels/gather_nd_op.h
index 60780fb50c..003badb74d 100644
--- a/tensorflow/core/kernels/gather_nd_op.h
+++ b/tensorflow/core/kernels/gather_nd_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_GATHER_ND_OP_H_
-#define TENSORFLOW_KERNELS_GATHER_ND_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_H_
+#define TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_H_
 // Functor definition for GatherOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -47,4 +47,4 @@ Status DoGatherNd(OpKernelContext* c, const Tensor& params,
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_GATHER_ND_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_H_
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
index dc028c2f1e..ad0112e6cb 100644
--- a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
-#define TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
 
 // Specialization of GatherNdSlice to CPU
 
@@ -142,4 +142,4 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_ND_CPU);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_GATHER_ND_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/gemm_functors.h b/tensorflow/core/kernels/gemm_functors.h
index 4b30c1f17f..1c80844085 100644
--- a/tensorflow/core/kernels/gemm_functors.h
+++ b/tensorflow/core/kernels/gemm_functors.h
@@ -24,6 +24,9 @@ limitations under the License.
 #error "EIGEN_USE_THREADS must be enabled by all .cc files including this."
 #endif  // EIGEN_USE_THREADS
 
+#ifndef TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_
+#define TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_
+
 #include <string.h>
 #include <map>
 #include <vector>
@@ -116,3 +119,5 @@ class FastGemmFunctor<float, float, float> {
   }
 };
 #endif  // USE_CBLAS_GEMM
+
+#endif  // TENSORFLOW_CORE_KERNELS_GEMM_FUNCTORS_H_
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
index ada96ae4ea..d0d5c3e018 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_HEXAGON_GRAPH_TRANSFER_UTILS_H_
-#define TENSORFLOW_PLATFORM_HEXAGON_GRAPH_TRANSFER_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFER_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFER_UTILS_H_
 
 #include <queue>
 #include <utility>
@@ -56,4 +56,4 @@ class GraphTransferUtils {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_HEXAGON_GRAPH_TRANSFER_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFER_UTILS_H_
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index 86c1c5625f..4328d51916 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -228,4 +228,4 @@ class GraphTransferer {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index 132cfde2db..1b382996f8 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
-#define TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_
 
 #include <unordered_map>
 #include <vector>
@@ -88,4 +88,4 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
index b9328c8e0e..270d697e96 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
@@ -55,4 +55,4 @@ class HexagonOpsDefinitions final : public IRemoteFusedGraphOpsDefinitions {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
diff --git a/tensorflow/core/kernels/hexagon/soc_interface.h b/tensorflow/core/kernels/hexagon/soc_interface.h
index 062103ed98..d1a41d47c8 100644
--- a/tensorflow/core/kernels/hexagon/soc_interface.h
+++ b/tensorflow/core/kernels/hexagon/soc_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_
-#define TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_SOC_INTERFACE_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_SOC_INTERFACE_H_
 
 // Declaration of APIs provided by hexagon shared library. This header is shared
 // with both hexagon library built with qualcomm SDK and tensorflow.
@@ -111,4 +111,4 @@ void soc_interface_SetDebugFlag(uint64_t flag);
 }
 #endif  // __cplusplus
 
-#endif  // TENSORFLOW_PLATFORM_HEXAGON_SOC_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_SOC_INTERFACE_H_
diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h
index d303e9c877..b12910d27d 100644
--- a/tensorflow/core/kernels/hinge-loss.h
+++ b/tensorflow/core/kernels/hinge-loss.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_HINGE_LOSS_H_
-#define TENSORFLOW_KERNELS_HINGE_LOSS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HINGE_LOSS_H_
+#define TENSORFLOW_CORE_KERNELS_HINGE_LOSS_H_
 
 #include <algorithm>
 #include <limits>
@@ -123,4 +123,4 @@ class HingeLossUpdater : public DualLossUpdater {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_HINGE_LOSS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_HINGE_LOSS_H_
diff --git a/tensorflow/core/kernels/histogram_op.h b/tensorflow/core/kernels/histogram_op.h
index 1b253f7fed..b14fc2bee3 100644
--- a/tensorflow/core/kernels/histogram_op.h
+++ b/tensorflow/core/kernels/histogram_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_HISTOGRAM_OP_H_
-#define TENSORFLOW_HISTOGRAM_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HISTOGRAM_OP_H_
+#define TENSORFLOW_CORE_KERNELS_HISTOGRAM_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -35,4 +35,4 @@ struct HistogramFixedWidthFunctor {
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_HISTOGRAM_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_HISTOGRAM_OP_H_
diff --git a/tensorflow/core/kernels/host_constant_op.cc b/tensorflow/core/kernels/host_constant_op.cc
new file mode 100644
index 0000000000..d08a7c9bd2
--- /dev/null
+++ b/tensorflow/core/kernels/host_constant_op.cc
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/host_constant_op.h"
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+_HostConstantOp::_HostConstantOp(OpKernelConstruction* ctx)
+    : OpKernel(ctx), tensor_(ctx->output_type(0)) {
+  const TensorProto* proto = nullptr;
+  AllocatorAttributes alloc_attr;
+  alloc_attr.set_on_host(true);
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
+  OP_REQUIRES_OK(
+      ctx, ctx->device()->MakeTensorFromProto(*proto, alloc_attr, &tensor_));
+  OP_REQUIRES(
+      ctx, ctx->output_type(0) == tensor_.dtype(),
+      errors::InvalidArgument("Type mismatch between value (",
+                              DataTypeString(tensor_.dtype()), ") and dtype (",
+                              DataTypeString(ctx->output_type(0)), ")"));
+}
+
+void _HostConstantOp::Compute(OpKernelContext* ctx) {
+  ctx->set_output(0, tensor_);
+}
+
+#if GOOGLE_CUDA
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Const")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("dtype"),
+                        _HostConstantOp);
+#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("Const")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("dtype"),
+                        _HostConstantOp);
+#endif  // TENSORFLOW_USE_SYCL
+
+// HostConst: forced to generate output on the host.
+// Only used in tests; no op is registered for this kernel
+// externally (i.e., in array_ops.cc)
+REGISTER_KERNEL_BUILDER(Name("HostConst").Device(DEVICE_CPU), _HostConstantOp);
+REGISTER_KERNEL_BUILDER(
+    Name("HostConst").Device(DEVICE_GPU).HostMemory("output"), _HostConstantOp);
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"),
+    _HostConstantOp);
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // end namespace tensorflow
+
diff --git a/tensorflow/core/kernels/host_constant_op.h b/tensorflow/core/kernels/host_constant_op.h
new file mode 100644
index 0000000000..1b887ea1aa
--- /dev/null
+++ b/tensorflow/core/kernels/host_constant_op.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+// HostConstantOp differs from ConstantOp in that its output is always
+// in host memory.
+class _HostConstantOp : public OpKernel {
+ public:
+  explicit _HostConstantOp(OpKernelConstruction* ctx);
+  void Compute(OpKernelContext* ctx) override;
+  bool IsExpensive() override { return false; }
+  ~_HostConstantOp() override {}
+
+ private:
+  Tensor tensor_;
+  TF_DISALLOW_COPY_AND_ASSIGN(_HostConstantOp);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_HOST_CONSTANT_OP_H_
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
index 6072412689..b2329f4b61 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
-#define TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_EXECUTOR_H_
+#define TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_EXECUTOR_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
@@ -74,4 +74,4 @@ class IRemoteFusedGraphExecutor {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_EXECUTOR_H_
diff --git a/tensorflow/core/kernels/identity_n_op.h b/tensorflow/core/kernels/identity_n_op.h
index 490bbf456c..7339cbbe29 100644
--- a/tensorflow/core/kernels/identity_n_op.h
+++ b/tensorflow/core/kernels/identity_n_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_IDENTITY_N_OP_H_
-#define TENSORFLOW_KERNELS_IDENTITY_N_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_IDENTITY_N_OP_H_
+#define TENSORFLOW_CORE_KERNELS_IDENTITY_N_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -41,4 +41,4 @@ class IdentityNOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_IDENTITY_N_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_IDENTITY_N_OP_H_
diff --git a/tensorflow/core/kernels/identity_op.h b/tensorflow/core/kernels/identity_op.h
index f8856a1b9b..6b74868ad4 100644
--- a/tensorflow/core/kernels/identity_op.h
+++ b/tensorflow/core/kernels/identity_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_IDENTITY_OP_H_
-#define TENSORFLOW_KERNELS_IDENTITY_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_IDENTITY_OP_H_
+#define TENSORFLOW_CORE_KERNELS_IDENTITY_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -37,4 +37,4 @@ class IdentityOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_IDENTITY_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_IDENTITY_OP_H_
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index 8dcb5977c6..1d4fa1a7db 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -18,8 +18,8 @@ limitations under the License.
 // reduce code duplication and ensure consistency across the different
 // resizers, it performs the input validation.
 
-#ifndef TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
-#define TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
+#ifndef TENSORFLOW_CORE_KERNELS_IMAGE_RESIZER_STATE_H_
+#define TENSORFLOW_CORE_KERNELS_IMAGE_RESIZER_STATE_H_
 
 #define EIGEN_USE_THREADS
 
@@ -191,4 +191,4 @@ struct ImageResizerGradientState {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_IMAGE_RESIZER_STATE_H_
+#endif  // TENSORFLOW_CORE_KERNELS_IMAGE_RESIZER_STATE_H_
diff --git a/tensorflow/core/kernels/immutable_constant_op.h b/tensorflow/core/kernels/immutable_constant_op.h
index 795331b4b2..97af8c7dc5 100644
--- a/tensorflow/core/kernels/immutable_constant_op.h
+++ b/tensorflow/core/kernels/immutable_constant_op.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_KERNELS_IMMUTABLE_CONSTANT_OP_H_
-#define TENSORFLOW_KERNELS_IMMUTABLE_CONSTANT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_IMMUTABLE_CONSTANT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_IMMUTABLE_CONSTANT_OP_H_
 
 #include <memory>
 
@@ -46,4 +46,4 @@ class ImmutableConstantOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_IMMUTABLE_CONSTANT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_IMMUTABLE_CONSTANT_OP_H_
diff --git a/tensorflow/core/kernels/initializable_lookup_table.cc b/tensorflow/core/kernels/initializable_lookup_table.cc
index 06d53eba30..fcf468f5a8 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.cc
+++ b/tensorflow/core/kernels/initializable_lookup_table.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/initializable_lookup_table.h"
-
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -32,6 +31,13 @@ Status InitializableLookupTable::Find(OpKernelContext* ctx, const Tensor& keys,
   return DoFind(keys, values, default_value);
 }
 
+Status InitializableLookupTable::ImportValues(OpKernelContext* ctx,
+                                              const Tensor& keys,
+                                              const Tensor& values) {
+  lookup::KeyValueTensorIterator iter(&keys, &values);
+  return Initialize(iter);
+}
+
 Status InitializableLookupTable::Initialize(InitTableIterator& iter) {
   if (!iter.Valid()) {
     return iter.status();
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
index b4f81d9a70..424fe5df3c 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.h
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
-#define TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
+#ifndef TENSORFLOW_CORE_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
+#define TENSORFLOW_CORE_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
 
 #include "tensorflow/core/framework/lookup_interface.h"
 #include "tensorflow/core/platform/macros.h"
@@ -58,11 +58,7 @@ class InitializableLookupTable : public LookupInterface {
   }
 
   Status ImportValues(OpKernelContext* ctx, const Tensor& keys,
-                      const Tensor& values) final {
-    return errors::Unimplemented(
-        "ImportValues not supported by InitializableLookupTable "
-        "implementations");
-  }
+                      const Tensor& values) final;
 
   TensorShape key_shape() const final { return TensorShape(); }
 
@@ -155,7 +151,58 @@ class InitializableLookupTable : public LookupInterface {
   bool is_initialized_ = false;
 };
 
+// Iterator to initialize tables given 'keys' and 'values' tensors.
+//
+// The two tensors are returned in the first iteration. It doesn't loop
+// over each element of the tensor since insertions in the lookup table can
+// process batches.
+class KeyValueTensorIterator
+    : public InitializableLookupTable::InitTableIterator {
+ public:
+  // keys and values are not owned by the iterator.
+  explicit KeyValueTensorIterator(const Tensor* keys, const Tensor* values)
+      : keys_(keys), values_(values), valid_(true), status_(Status::OK()) {
+    TensorShape key_shape = keys_->shape();
+    if (!key_shape.IsSameSize(values_->shape())) {
+      valid_ = false;
+      status_ = errors::InvalidArgument(
+          "keys and values should have the same dimension.",
+          key_shape.DebugString(), " vs ", values_->shape().DebugString());
+    }
+    if (key_shape.num_elements() == 0) {
+      valid_ = false;
+      status_ =
+          errors::InvalidArgument("keys and values cannot be empty tensors.");
+    }
+  }
+
+  bool Valid() const override { return valid_; }
+
+  void Next() override {
+    valid_ = false;
+    status_ = errors::OutOfRange("No more data.");
+  }
+
+  const Tensor& keys() const override { return *keys_; }
+
+  const Tensor& values() const override { return *values_; }
+
+  Status status() const override { return status_; }
+
+  int64 total_size() const override {
+    return keys_ == nullptr ? -1 : keys_->NumElements();
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(KeyValueTensorIterator);
+
+  const Tensor* keys_;    // Doesn't own it.
+  const Tensor* values_;  // Doesn't own it.
+  bool valid_;            // true if the iterator points to an existing range.
+  Status status_;
+};
+
 }  // namespace lookup
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
+#endif  // TENSORFLOW_CORE_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_
diff --git a/tensorflow/core/kernels/inplace_ops_functor.h b/tensorflow/core/kernels/inplace_ops_functor.h
index b806787e91..2023869f49 100644
--- a/tensorflow/core/kernels/inplace_ops_functor.h
+++ b/tensorflow/core/kernels/inplace_ops_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_INPLACE_FUNCTOR_H_
-#define TENSORFLOW_KERNELS_INPLACE_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_INPLACE_OPS_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_INPLACE_OPS_FUNCTOR_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -46,4 +46,4 @@ Status DoCopy(const Device& device, const Tensor& x, Tensor* y);
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_INPLACE_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_INPLACE_OPS_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/l2loss_op.h b/tensorflow/core/kernels/l2loss_op.h
index 4953aa237c..465ef96a51 100644
--- a/tensorflow/core/kernels/l2loss_op.h
+++ b/tensorflow/core/kernels/l2loss_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_L2LOSS_OP_H_
-#define TENSORFLOW_KERNELS_L2LOSS_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_L2LOSS_OP_H_
+#define TENSORFLOW_CORE_KERNELS_L2LOSS_OP_H_
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -30,4 +30,4 @@ struct L2LossOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_L2LOSS_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_L2LOSS_OP_H_
diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h
index f7c3f1950b..692f916439 100644
--- a/tensorflow/core/kernels/linalg_ops_common.h
+++ b/tensorflow/core/kernels/linalg_ops_common.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
-#define TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
+#ifndef TENSORFLOW_CORE_KERNELS_LINALG_OPS_COMMON_H_
+#define TENSORFLOW_CORE_KERNELS_LINALG_OPS_COMMON_H_
 
 // Classes to support linear algebra functionality, similar to the numpy.linalg
 // module. Supports batch computation on several matrices at once, sharding the
@@ -194,4 +194,4 @@ extern template class LinearAlgebraOp<complex128>;
 #define REGISTER_LINALG_OP(OpName, OpClass, Scalar) \
   REGISTER_LINALG_OP_CPU(OpName, OpClass, Scalar)
 
-#endif  // TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
+#endif  // TENSORFLOW_CORE_KERNELS_LINALG_OPS_COMMON_H_
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 42871c6113..b3f74c060b 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -261,14 +261,15 @@ Status TensorListZerosLike(OpKernelContext* c, const TensorList& x,
         out_tensor.flat<dtype>().constant(dtype(0));             \
     break;
 
-      TF_CALL_NUMBER_TYPES(DTYPE_CASE)
+      TF_CALL_POD_TYPES(DTYPE_CASE)
 
 #undef DTYPE_CASE
       default:
         return errors::InvalidArgument(
-            "Trying to compute zeros_like for unsupported dtype",
-            out_tensor.dtype());
+            "Trying to compute zeros_like for unsupported dtype ",
+            DataTypeString(out_tensor.dtype()));
     }
+    y->tensors.emplace_back(out_tensor);
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h
index 6479e6f5dc..b43902e0b9 100644
--- a/tensorflow/core/kernels/logistic-loss.h
+++ b/tensorflow/core/kernels/logistic-loss.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_LOGISTIC_LOSS_H_
-#define TENSORFLOW_KERNELS_LOGISTIC_LOSS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_LOGISTIC_LOSS_H_
+#define TENSORFLOW_CORE_KERNELS_LOGISTIC_LOSS_H_
 
 #include <cmath>
 
@@ -131,4 +131,4 @@ class LogisticLossUpdater : public DualLossUpdater {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_LOGISTIC_LOSS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_LOGISTIC_LOSS_H_
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
index b352dd257c..6e77e1ee01 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.cc
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -74,13 +74,11 @@ class InitializeTableOp : public OpKernel {
                     "Keys and values must have the same size ",
                     keys.NumElements(), " vs ", values.NumElements()));
 
-    lookup::KeyValueTensorIterator iter(&keys, &values);
-
     int memory_used_before = 0;
     if (ctx->track_allocations()) {
       memory_used_before = table->MemoryUsed();
     }
-    OP_REQUIRES_OK(ctx, table->Initialize(iter));
+    OP_REQUIRES_OK(ctx, table->ImportValues(ctx, keys, values));
     if (ctx->track_allocations()) {
       ctx->record_persistent_memory_allocation(table->MemoryUsed() -
                                                memory_used_before);
diff --git a/tensorflow/core/kernels/lookup_table_init_op.h b/tensorflow/core/kernels/lookup_table_init_op.h
index 177a26daa8..101e528659 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.h
+++ b/tensorflow/core/kernels/lookup_table_init_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_LOOKUP_TABLE_INIT_OP_H_
-#define TENSORFLOW_KERNELS_LOOKUP_TABLE_INIT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_INIT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_INIT_OP_H_
 
 #include "tensorflow/core/kernels/initializable_lookup_table.h"
 
@@ -30,4 +30,4 @@ Status InitializeTableFromTextFile(const string& filename, int64 vocab_size,
 }  // namespace lookup
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_LOOKUP_TABLE_INIT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_INIT_OP_H_
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index cbe8560267..2e8d9c623c 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -813,17 +813,21 @@ REGISTER_KERNEL_BUILDER(Name("LookupTableImportV2").Device(DEVICE_CPU),
       LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \
                     value_dtype>)
 
+REGISTER_KERNEL(int32, double);
+REGISTER_KERNEL(int32, float);
+REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int32, string);
+REGISTER_KERNEL(int64, double);
+REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, int32);
+REGISTER_KERNEL(int64, int64);
+REGISTER_KERNEL(int64, string);
+REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(string, double);
 REGISTER_KERNEL(string, float);
 REGISTER_KERNEL(string, int32);
 REGISTER_KERNEL(string, int64);
-REGISTER_KERNEL(int64, string);
-REGISTER_KERNEL(int64, int64);
-REGISTER_KERNEL(int64, float);
 REGISTER_KERNEL(string, string);
-REGISTER_KERNEL(string, bool);
-REGISTER_KERNEL(int32, int32);
-REGISTER_KERNEL(int32, string);
 
 #undef REGISTER_KERNEL
 
@@ -844,12 +848,20 @@ REGISTER_KERNEL(int32, string);
       LookupTableOp<lookup::MutableHashTableOfScalars<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
-REGISTER_KERNEL(string, float);
-REGISTER_KERNEL(string, int64);
-REGISTER_KERNEL(int64, string);
-REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(int32, double);
+REGISTER_KERNEL(int32, float);
+REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int64, double);
 REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, int32);
+REGISTER_KERNEL(int64, int64);
+REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(int64, Variant);
+REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(string, double);
+REGISTER_KERNEL(string, float);
+REGISTER_KERNEL(string, int32);
+REGISTER_KERNEL(string, int64);
 
 #undef REGISTER_KERNEL
 
@@ -870,10 +882,19 @@ REGISTER_KERNEL(int64, Variant);
       LookupTableOp<lookup::MutableHashTableOfTensors<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
-REGISTER_KERNEL(string, float);
-REGISTER_KERNEL(string, int64);
+REGISTER_KERNEL(int32, double);
+REGISTER_KERNEL(int32, float);
+REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int64, double);
+REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, int32);
+REGISTER_KERNEL(int64, int64);
 REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(string, double);
+REGISTER_KERNEL(string, float);
+REGISTER_KERNEL(string, int32);
+REGISTER_KERNEL(string, int64);
 
 #undef REGISTER_KERNEL
 
@@ -894,13 +915,20 @@ REGISTER_KERNEL(string, bool);
       LookupTableOp<lookup::MutableDenseHashTable<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
-REGISTER_KERNEL(int64, int64);
-REGISTER_KERNEL(int64, float);
-REGISTER_KERNEL(int64, double);
-REGISTER_KERNEL(string, float);
-REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(int32, double);
+REGISTER_KERNEL(int32, float);
+REGISTER_KERNEL(int32, int32);
 REGISTER_KERNEL(int64, bool);
+REGISTER_KERNEL(int64, double);
+REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, int32);
+REGISTER_KERNEL(int64, int64);
 REGISTER_KERNEL(int64, Variant);
+REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(string, double);
+REGISTER_KERNEL(string, float);
+REGISTER_KERNEL(string, int32);
+REGISTER_KERNEL(string, int64);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 3977f16299..9451247f26 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
-#define TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
 
 #include "tensorflow/core/framework/lookup_interface.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -102,9 +102,12 @@ class LookupTableOp : public OpKernel {
   ~LookupTableOp() override {
     // If the table object was not shared, delete it.
     if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
-      TF_CHECK_OK(
-          cinfo_.resource_manager()->template Delete<lookup::LookupInterface>(
-              cinfo_.container(), cinfo_.name()));
+      if (!cinfo_.resource_manager()
+               ->template Delete<lookup::LookupInterface>(cinfo_.container(),
+                                                          cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource can have been deleted by session resets.
+      }
     }
   }
 
@@ -272,4 +275,4 @@ class HashTable : public InitializableLookupTable {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
diff --git a/tensorflow/core/kernels/lookup_util.h b/tensorflow/core/kernels/lookup_util.h
index 894769960a..ec28cf9fa7 100644
--- a/tensorflow/core/kernels/lookup_util.h
+++ b/tensorflow/core/kernels/lookup_util.h
@@ -46,57 +46,6 @@ Status InitializeTableFromTextFile(const string& filename, int64 vocab_size,
                                    int32 value_index, Env* env,
                                    InitializableLookupTable* table);
 
-// Iterator to initialize tables given 'keys' and 'values' tensors.
-//
-// The two tensors are returned in the first iteration. It doesn't loop
-// over each element of the tensor since insertions in the lookup table can
-// process batches.
-class KeyValueTensorIterator
-    : public InitializableLookupTable::InitTableIterator {
- public:
-  // keys and values are not owned by the iterator.
-  explicit KeyValueTensorIterator(const Tensor* keys, const Tensor* values)
-      : keys_(keys), values_(values), valid_(true), status_(Status::OK()) {
-    TensorShape key_shape = keys_->shape();
-    if (!key_shape.IsSameSize(values_->shape())) {
-      valid_ = false;
-      status_ = errors::InvalidArgument(
-          "keys and values should have the same dimension.",
-          key_shape.DebugString(), " vs ", values_->shape().DebugString());
-    }
-    if (key_shape.num_elements() == 0) {
-      valid_ = false;
-      status_ =
-          errors::InvalidArgument("keys and values cannot be empty tensors.");
-    }
-  }
-
-  bool Valid() const override { return valid_; }
-
-  void Next() override {
-    valid_ = false;
-    status_ = errors::OutOfRange("No more data.");
-  }
-
-  const Tensor& keys() const override { return *keys_; }
-
-  const Tensor& values() const override { return *values_; }
-
-  Status status() const override { return status_; }
-
-  int64 total_size() const override {
-    return keys_ == nullptr ? -1 : keys_->NumElements();
-  }
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(KeyValueTensorIterator);
-
-  const Tensor* keys_;    // Doesn't own it.
-  const Tensor* values_;  // Doesn't own it.
-  bool valid_;            // true if the iterator points to an existing range.
-  Status status_;
-};
-
 }  // namespace lookup
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/loss.h b/tensorflow/core/kernels/loss.h
index a77aa7587b..7db348800e 100644
--- a/tensorflow/core/kernels/loss.h
+++ b/tensorflow/core/kernels/loss.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_KERNELS_LOSS_H_
-#define TENSORFLOW_KERNELS_LOSS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_LOSS_H_
+#define TENSORFLOW_CORE_KERNELS_LOSS_H_
 
 #include "tensorflow/core/lib/core/status.h"
 
@@ -56,4 +56,4 @@ class DualLossUpdater {
 };
 
 }  // namespace tensorflow
-#endif  // TENSORFLOW_KERNELS_LOSS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_LOSS_H_
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
index 628895ca86..4b74a64025 100644
--- a/tensorflow/core/kernels/matmul_op.h
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_MATMUL_OP_H_
-#define TENSORFLOW_KERNELS_MATMUL_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MATMUL_OP_H_
+#define TENSORFLOW_CORE_KERNELS_MATMUL_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
@@ -117,4 +117,4 @@ typedef Eigen::GpuDevice GPUDevice;
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_MATMUL_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MATMUL_OP_H_
diff --git a/tensorflow/core/kernels/matrix_band_part_op.h b/tensorflow/core/kernels/matrix_band_part_op.h
index 97cc950793..b04e36db8e 100644
--- a/tensorflow/core/kernels/matrix_band_part_op.h
+++ b/tensorflow/core/kernels/matrix_band_part_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_MATRIX_DIAG_OP_H_
-#define TENSORFLOW_KERNELS_MATRIX_DIAG_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MATRIX_BAND_PART_OP_H_
+#define TENSORFLOW_CORE_KERNELS_MATRIX_BAND_PART_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -34,4 +34,4 @@ struct MatrixBandPartFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_MATRIX_DIAG_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MATRIX_BAND_PART_OP_H_
diff --git a/tensorflow/core/kernels/matrix_diag_op.h b/tensorflow/core/kernels/matrix_diag_op.h
index 14095845b8..108ba0f56b 100644
--- a/tensorflow/core/kernels/matrix_diag_op.h
+++ b/tensorflow/core/kernels/matrix_diag_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_MATRIX_DIAG_OP_H_
-#define TENSORFLOW_KERNELS_MATRIX_DIAG_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MATRIX_DIAG_OP_H_
+#define TENSORFLOW_CORE_KERNELS_MATRIX_DIAG_OP_H_
 
 // Generator definition for MatrixDiagOp, must be compilable by nvcc.
 
@@ -91,4 +91,4 @@ struct MatrixDiag {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_MATRIX_DIAG_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MATRIX_DIAG_OP_H_
diff --git a/tensorflow/core/kernels/matrix_exponential_op.cc b/tensorflow/core/kernels/matrix_exponential_op.cc
index 99db898301..01d4894438 100644
--- a/tensorflow/core/kernels/matrix_exponential_op.cc
+++ b/tensorflow/core/kernels/matrix_exponential_op.cc
@@ -49,6 +49,7 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
   TF_DISALLOW_COPY_AND_ASSIGN(MatrixExponentialOp);
 };
 
+// Deprecated kernels (2018/08/21).
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<float>), float);
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<double>), double);
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex64>),
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.h b/tensorflow/core/kernels/matrix_set_diag_op.h
index aeb144559f..341ef12e97 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.h
+++ b/tensorflow/core/kernels/matrix_set_diag_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_MATRIX_SET_DIAG_OP_H_
-#define TENSORFLOW_KERNELS_MATRIX_SET_DIAG_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MATRIX_SET_DIAG_OP_H_
+#define TENSORFLOW_CORE_KERNELS_MATRIX_SET_DIAG_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -34,4 +34,4 @@ struct MatrixSetDiag {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_MATRIX_SET_DIAG_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MATRIX_SET_DIAG_OP_H_
diff --git a/tensorflow/core/kernels/matrix_solve_ls_op_impl.h b/tensorflow/core/kernels/matrix_solve_ls_op_impl.h
index 0e09078365..00a05a87a3 100644
--- a/tensorflow/core/kernels/matrix_solve_ls_op_impl.h
+++ b/tensorflow/core/kernels/matrix_solve_ls_op_impl.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_MATRIX_SOLVE_LS_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_MATRIX_SOLVE_LS_OP_IMPL_H_
+
 // See docs in ../ops/linalg_ops.cc.
 
 #include "third_party/eigen3/Eigen/Cholesky"
@@ -159,3 +162,5 @@ class MatrixSolveLsOp : public LinearAlgebraOp<Scalar> {
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_MATRIX_SOLVE_LS_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/maxpooling_op.h b/tensorflow/core/kernels/maxpooling_op.h
index f82e57d44c..2adb8081ce 100644
--- a/tensorflow/core/kernels/maxpooling_op.h
+++ b/tensorflow/core/kernels/maxpooling_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
-#define TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_H_
+#define TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_H_
 // Functor definition for MaxPoolingOp, must be compilable by nvcc.
 
 #include "tensorflow/core/framework/numeric_types.h"
@@ -51,4 +51,4 @@ struct SpatialMaxPooling<Device, qint8> {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_MAXPOOLING_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_H_
diff --git a/tensorflow/core/kernels/mirror_pad_op.h b/tensorflow/core/kernels/mirror_pad_op.h
index 81150a9e79..cc4b6941b9 100644
--- a/tensorflow/core/kernels/mirror_pad_op.h
+++ b/tensorflow/core/kernels/mirror_pad_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_MIRROR_PAD_OP_H_
-#define TENSORFLOW_KERNELS_MIRROR_PAD_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MIRROR_PAD_OP_H_
+#define TENSORFLOW_CORE_KERNELS_MIRROR_PAD_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -437,4 +437,4 @@ struct MirrorPadGrad {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_MIRROR_PAD_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MIRROR_PAD_OP_H_
diff --git a/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h b/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
index f27ca139c9..98e3be082d 100644
--- a/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
+++ b/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
-#define TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MIRROR_PAD_OP_CPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_MIRROR_PAD_OP_CPU_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -42,4 +42,4 @@ TF_CALL_NUMBER_TYPES(DEFINE_CPU_SPECS);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MIRROR_PAD_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 50c25e1da7..afbfaa83f3 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -82,11 +82,11 @@ struct MklConvBwdFilterParams {
 };
 
 template <typename T>
-class MklConv2DBwdFilterPrimitive : public MklPrimitive {
+class MklConvBwdFilterPrimitive : public MklPrimitive {
  public:
-  explicit MklConv2DBwdFilterPrimitive(
-      const MklConvBwdFilterParams& convBwdFilterDims) :
-            cpu_engine_(engine::cpu, 0) {
+  explicit MklConvBwdFilterPrimitive(
+      const MklConvBwdFilterParams& convBwdFilterDims)
+      : cpu_engine_(engine::cpu, 0) {
     context_.bwd_filter_stream.reset(new stream(stream::kind::eager));
     // create conv primitive
     if (context_.conv_bwd_filter == nullptr) {
@@ -94,7 +94,7 @@ class MklConv2DBwdFilterPrimitive : public MklPrimitive {
     }
   }
 
-  ~MklConv2DBwdFilterPrimitive() {}
+  ~MklConvBwdFilterPrimitive() {}
 
   // Convolution backward weights with bias
   //   src_data:         input data buffer of src
@@ -297,38 +297,36 @@ class MklConv2DBwdFilterPrimitive : public MklPrimitive {
 };
 
 template <typename T>
-class MklConv2DBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
+class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
  public:
-  static MklConv2DBwdFilterPrimitive<T>* Get(
+  static MklConvBwdFilterPrimitive<T>* Get(
       const MklConvBwdFilterParams& convBwdFilterDims) {
-    MklConv2DBwdFilterPrimitive<T>* conv2d_bwd_filter = nullptr;
+    MklConvBwdFilterPrimitive<T>* conv_bwd_filter = nullptr;
 
     // look into the pool for reusable primitive
-    conv2d_bwd_filter = dynamic_cast<MklConv2DBwdFilterPrimitive<T>*> (
-      MklConv2DBwdFilterPrimitiveFactory<T>::GetInstance().GetConv2dBwdFilter(
-      convBwdFilterDims));
-
-    if (conv2d_bwd_filter == nullptr) {
-        conv2d_bwd_filter = new MklConv2DBwdFilterPrimitive<T>(
-            convBwdFilterDims);
-        MklConv2DBwdFilterPrimitiveFactory<T>::GetInstance().SetConv2dBwdFilter(
-            convBwdFilterDims, conv2d_bwd_filter);
+    conv_bwd_filter = dynamic_cast<MklConvBwdFilterPrimitive<T>*>(
+        MklConvBwdFilterPrimitiveFactory<T>::GetInstance().GetConvBwdFilter(
+            convBwdFilterDims));
+
+    if (conv_bwd_filter == nullptr) {
+      conv_bwd_filter = new MklConvBwdFilterPrimitive<T>(convBwdFilterDims);
+      MklConvBwdFilterPrimitiveFactory<T>::GetInstance().SetConvBwdFilter(
+          convBwdFilterDims, conv_bwd_filter);
     }
-    return conv2d_bwd_filter;
+    return conv_bwd_filter;
   }
 
-
  private:
-  MklConv2DBwdFilterPrimitiveFactory() {}
-  ~MklConv2DBwdFilterPrimitiveFactory() {}
+  MklConvBwdFilterPrimitiveFactory() {}
+  ~MklConvBwdFilterPrimitiveFactory() {}
 
-  static MklConv2DBwdFilterPrimitiveFactory& GetInstance() {
-    static MklConv2DBwdFilterPrimitiveFactory instance_;
+  static MklConvBwdFilterPrimitiveFactory& GetInstance() {
+    static MklConvBwdFilterPrimitiveFactory instance_;
     return instance_;
   }
 
   static string CreateKey(const MklConvBwdFilterParams& convBwdFilterDims) {
-    string prefix = "conv2d_bwd_filter";
+    string prefix = "conv_bwd_filter";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convBwdFilterDims.src_dims);
@@ -342,14 +340,14 @@ class MklConv2DBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
     return key_creator.GetKey();
   }
 
-  MklPrimitive* GetConv2dBwdFilter(
+  MklPrimitive* GetConvBwdFilter(
       const MklConvBwdFilterParams& convBwdFilterDims) {
     string key = CreateKey(convBwdFilterDims);
     return this->GetOp(key);
   }
 
-  void SetConv2dBwdFilter(
-      const MklConvBwdFilterParams& convBwdFilterDims, MklPrimitive* op) {
+  void SetConvBwdFilter(const MklConvBwdFilterParams& convBwdFilterDims,
+                        MklPrimitive* op) {
     string key = CreateKey(convBwdFilterDims);
     this->SetOp(key, op);
   }
@@ -738,14 +736,13 @@ TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #else
 
 template <typename Device, class T, bool biasEnabled>
-class MklConv2DCustomBackpropFilterOp
-    : public MklConv2DBackpropCommonOp<Device, T> {
+class MklConvCustomBackpropFilterOp
+    : public MklConvBackpropCommonOp<Device, T> {
  public:
-  explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) {
-  }
+  explicit MklConvCustomBackpropFilterOp(OpKernelConstruction* context)
+      : MklConvBackpropCommonOp<Device, T>(context) {}
 
-  ~MklConv2DCustomBackpropFilterOp() {}
+  ~MklConvCustomBackpropFilterOp() {}
 
   void Compute(OpKernelContext* context) {
     try {
@@ -753,6 +750,9 @@ class MklConv2DCustomBackpropFilterOp
       MklDnnData<T> diff_dst(&cpu_engine_);
       MklDnnData<T> diff_filter(&cpu_engine_);  // output
 
+      // This flag indicates Conv2D or Conv3D
+      bool isConv2D = (this->strides_.size() == 4);
+
       // Input tensors
       const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
       const Tensor& src_tensor = MklGetInput(context, kInputIdx);
@@ -813,7 +813,10 @@ class MklConv2DCustomBackpropFilterOp
           &fwd_dst_dims, &padding_left, &padding_right);
       if (!context->status().ok()) return;
 
-      auto tf_fmt = TFDataFormatToMklDnnDataFormat(this->data_format_);
+      auto tf_fmt = isConv2D
+                        ? TFDataFormatToMklDnnDataFormat(this->data_format_)
+                        : TFDataFormatToMklDnn3DDataFormat(this->data_format_);
+
       auto fwd_src_md =
           src_mkl_shape.IsMklTensor()
               ? src_mkl_shape.GetMklLayout()
@@ -832,21 +835,19 @@ class MklConv2DCustomBackpropFilterOp
       if (biasEnabled) {
         TensorShape obp_tf_shape = GetTfShape(context, 2);
         depth = (this->data_format_ == FORMAT_NCHW)
-              ? obp_tf_shape.dim_size(1)
-              : obp_tf_shape.dim_size(3);
+                    ? obp_tf_shape.dim_size(1)
+                    : obp_tf_shape.dim_size(isConv2D ? 3 : 4);
         diff_bias_dims = {static_cast<int>(depth)};
       }
+      for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
-      dilations[kDilationH] -= 1;
-      dilations[kDilationW] -= 1;
-
-      MklConv2DBwdFilterPrimitive<T> *conv2d_bwd_filter = nullptr;
+      MklConvBwdFilterPrimitive<T>* conv_bwd_filter = nullptr;
       MklConvBwdFilterParams convBwdFilterDims(fwd_src_dims, fwd_filter_dims,
           diff_bias_dims, diff_dst_dims, strides, dilations, padding_left,
           padding_right, TFPaddingToMklDnnPadding(this->padding_));
-      conv2d_bwd_filter = MklConv2DBwdFilterPrimitiveFactory<T>::Get(
-          convBwdFilterDims);
-      auto bwd_filter_pd = conv2d_bwd_filter->GetPrimitiveDesc();
+      conv_bwd_filter =
+          MklConvBwdFilterPrimitiveFactory<T>::Get(convBwdFilterDims);
+      auto bwd_filter_pd = conv_bwd_filter->GetPrimitiveDesc();
 
       // allocate output tensors: diff_fitler and diff_bias (w bias)
       auto bwd_output_dims = GetOutputDims(fwd_src_dims, fwd_filter_dims);
@@ -854,14 +855,26 @@ class MklConv2DCustomBackpropFilterOp
       // diff_filter
       MklDnnShape diff_filter_mkl_shape;
       diff_filter_mkl_shape.SetMklTensor(false);
-      // output_dims_mkl_order is in OIHW format.
-      TensorShape diff_filter_tf_shape(
-                    {bwd_output_dims[MklDnnDims::Dim_H],
-                     bwd_output_dims[MklDnnDims::Dim_W],
-                     bwd_output_dims[MklDnnDims::Dim_I],
-                     bwd_output_dims[MklDnnDims::Dim_O]});
-      AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
-                diff_filter_tf_shape, diff_filter_mkl_shape);
+
+      if (isConv2D) {
+        // Conv2D: output_dims_mkl_order is in OIHW format.
+        TensorShape diff_filter_tf_shape({bwd_output_dims[MklDnnDims::Dim_H],
+                                          bwd_output_dims[MklDnnDims::Dim_W],
+                                          bwd_output_dims[MklDnnDims::Dim_I],
+                                          bwd_output_dims[MklDnnDims::Dim_O]});
+        AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                  diff_filter_tf_shape, diff_filter_mkl_shape);
+      } else {
+        // Conv3D: output_dims_mkl_order is in OIDHW format.
+        TensorShape diff_filter_tf_shape(
+            {bwd_output_dims[MklDnnDims3D::Dim3d_D],
+             bwd_output_dims[MklDnnDims3D::Dim3d_H],
+             bwd_output_dims[MklDnnDims3D::Dim3d_W],
+             bwd_output_dims[MklDnnDims3D::Dim3d_I],
+             bwd_output_dims[MklDnnDims3D::Dim3d_O]});
+        AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                  diff_filter_tf_shape, diff_filter_mkl_shape);
+      }
 
       Tensor* diff_bias_tensor = nullptr;
       if (biasEnabled) {
@@ -871,7 +884,7 @@ class MklConv2DCustomBackpropFilterOp
 
       // check if src and diff_dst need reorder
       T *src_data = nullptr;
-      if (fwd_src_md.data.format != conv2d_bwd_filter->GetSrcMemoryFormat()) {
+      if (fwd_src_md.data.format != conv_bwd_filter->GetSrcMemoryFormat()) {
         src.SetUsrMem(fwd_src_md, &src_tensor);
         src.CheckReorderToOpMem(bwd_filter_pd->src_primitive_desc());
         src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
@@ -882,7 +895,7 @@ class MklConv2DCustomBackpropFilterOp
 
       T *diff_dst_data = nullptr;
       if (diff_dst_md.data.format !=
-          conv2d_bwd_filter->GetDiffDstMemoryFormat()) {
+          conv_bwd_filter->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
         diff_dst.CheckReorderToOpMem(bwd_filter_pd->diff_dst_primitive_desc());
         diff_dst_data = static_cast<T*>(
@@ -897,7 +910,7 @@ class MklConv2DCustomBackpropFilterOp
       bool diff_filter_reorder_required = false;
       T *diff_filter_data = nullptr;
       if (GetOutputFormat(tf_fmt) !=
-          conv2d_bwd_filter->GetDiffFilterMemoryFormat()) {
+          conv_bwd_filter->GetDiffFilterMemoryFormat()) {
         // Allocate diff filter tensor as Tensorflow layout
         diff_filter.SetUsrMem(bwd_output_dims, GetOutputFormat(tf_fmt),
                               diff_filter_tensor);
@@ -915,10 +928,10 @@ class MklConv2DCustomBackpropFilterOp
       if (biasEnabled) {
         T* diff_bias_data = static_cast<T*>(const_cast<T*>(
                          diff_bias_tensor->flat<T>().data()));
-        conv2d_bwd_filter->Execute(src_data, diff_filter_data,
-                               diff_bias_data, diff_dst_data);
+        conv_bwd_filter->Execute(src_data, diff_filter_data, diff_bias_data,
+                                 diff_dst_data);
       } else {
-        conv2d_bwd_filter->Execute(src_data, diff_filter_data, diff_dst_data);
+        conv_bwd_filter->Execute(src_data, diff_filter_data, diff_dst_data);
       }
 
       // Reorder diff_filter back to Tensorflow layout if necessary
@@ -947,7 +960,7 @@ class MklConv2DCustomBackpropFilterOp
                          const MklDnnShape& filter_mkl_shape,
                          const MklDnnShape& obp_mkl_shape) {
     CHECK(!filter_mkl_shape.IsMklTensor())
-        << "Conv2DBackpropFilter: filter should not be in MKL Layout";
+        << "ConvBackpropFilter: filter should not be in MKL Layout";
   }
 
   // Get TensorFlow shape of input tensor.
@@ -983,9 +996,11 @@ class MklConv2DCustomBackpropFilterOp
     return fwd_filter_dims;
   }
 
-  // Output layout is Tensorflow's filter layout (HWIO).
+  // Output layout is Tensorflow's filter layout
+  //   Conv2D: HWIO;  Conv3D: DHWIO
   memory::format GetOutputFormat(const memory::format data_format) {
-    return memory::format::hwio;
+    return (this->strides_.size() == 4) ? memory::format::hwio
+                                        : memory::format::dhwio;
   }
 
   // Allocate output tensor.
@@ -1027,24 +1042,27 @@ class MklConv2DCustomBackpropFilterOp
   }
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("_MklConv2DBackpropFilter")                                   \
-          .Device(DEVICE_CPU)                                            \
-          .TypeConstraint<T>("T")                                        \
-          .Label(mkl_op_registry::kMklOpLabel),                          \
-      MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);             \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("_MklConv2DBackpropFilterWithBias")                           \
-          .Device(DEVICE_CPU)                                            \
-          .TypeConstraint<T>("T")                                        \
-          .Label(mkl_op_registry::kMklOpLabel),                          \
-      MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>);              \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
-                              .Device(DEVICE_CPU)                        \
-                              .TypeConstraint<T>("T")                    \
-                              .Label(mkl_op_registry::kMklOpLabel),      \
-                          MklDummyOp<CPUDevice, T>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")                     \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T")                          \
+                              .Label(mkl_op_registry::kMklOpLabel),            \
+                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")             \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T")                          \
+                              .Label(mkl_op_registry::kMklOpLabel),            \
+                          MklConvCustomBackpropFilterOp<CPUDevice, T, true>);  \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")       \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T")                          \
+                              .Label(mkl_op_registry::kMklOpLabel),            \
+                          MklDummyOp<CPUDevice, T>);                           \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropFilterV2")                   \
+                              .Device(DEVICE_CPU)                              \
+                              .TypeConstraint<T>("T")                          \
+                              .Label(mkl_op_registry::kMklOpLabel),            \
+                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 38e014d68e..b5a98301e2 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -59,7 +59,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 #ifndef INTEL_MKL_ML_ONLY
 
-/// utility classes enabling primitive reuse for backward conv2d ops.
+/// utility classes enabling primitive reuse for backward conv ops.
 struct MklConvBwdInputParams {
   memory::dims diff_src_dims;
   memory::dims filter_dims;
@@ -83,11 +83,11 @@ struct MklConvBwdInputParams {
 };
 
 template <typename T>
-class MklConv2DBwdInputPrimitive : public MklPrimitive {
+class MklConvBwdInputPrimitive : public MklPrimitive {
  public:
-  explicit MklConv2DBwdInputPrimitive(
-      const MklConvBwdInputParams& convBwdInputDims) :
-           cpu_engine_(engine::cpu, 0) {
+  explicit MklConvBwdInputPrimitive(
+      const MklConvBwdInputParams& convBwdInputDims)
+      : cpu_engine_(engine::cpu, 0) {
     context_.bwd_input_stream.reset(new stream(stream::kind::eager));
 
     // create conv primitive
@@ -95,7 +95,7 @@ class MklConv2DBwdInputPrimitive : public MklPrimitive {
       Setup(convBwdInputDims);
     }
   }
-  ~MklConv2DBwdInputPrimitive() {}
+  ~MklConvBwdInputPrimitive() {}
 
   // Convolution backward filter (weights)
   //   diff_src_data: output data buffer of diff_src
@@ -134,7 +134,7 @@ class MklConv2DBwdInputPrimitive : public MklPrimitive {
   }
 
  private:
-  // Primitive reuse context for Conv2D Bwd Input op
+  // Primitive reuse context for Conv Bwd Input op
   struct ConvBwdInputContext {
     // expected memory format for this primitive instance
     memory::format filter_fmt;
@@ -235,38 +235,37 @@ class MklConv2DBwdInputPrimitive : public MklPrimitive {
 };
 
 template <typename T>
-class MklConv2DBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
+class MklConvBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
  private:
-  MklConv2DBwdInputPrimitiveFactory() {}
-  ~MklConv2DBwdInputPrimitiveFactory() {}
+  MklConvBwdInputPrimitiveFactory() {}
+  ~MklConvBwdInputPrimitiveFactory() {}
 
  public:
-  static MklConv2DBwdInputPrimitive<T>* Get(
+  static MklConvBwdInputPrimitive<T>* Get(
       const MklConvBwdInputParams& convBwdInputDims) {
-    MklConv2DBwdInputPrimitive<T>* conv2d_bwd_input = nullptr;
+    MklConvBwdInputPrimitive<T>* conv_bwd_input = nullptr;
 
     // look into the pool for reusable primitive
-    conv2d_bwd_input = dynamic_cast<MklConv2DBwdInputPrimitive<T>*> (
-        MklConv2DBwdInputPrimitiveFactory<T>::GetInstance().GetConv2dBwdInput(
+    conv_bwd_input = dynamic_cast<MklConvBwdInputPrimitive<T>*>(
+        MklConvBwdInputPrimitiveFactory<T>::GetInstance().GetConvBwdInput(
             convBwdInputDims));
 
-    if (conv2d_bwd_input == nullptr) {
-      conv2d_bwd_input = new MklConv2DBwdInputPrimitive<T>(
-          convBwdInputDims);
-      MklConv2DBwdInputPrimitiveFactory<T>::GetInstance().SetConv2dBwdInput(
-          convBwdInputDims, conv2d_bwd_input);
+    if (conv_bwd_input == nullptr) {
+      conv_bwd_input = new MklConvBwdInputPrimitive<T>(convBwdInputDims);
+      MklConvBwdInputPrimitiveFactory<T>::GetInstance().SetConvBwdInput(
+          convBwdInputDims, conv_bwd_input);
     }
-    return conv2d_bwd_input;
+    return conv_bwd_input;
   }
 
  private:
-  static MklConv2DBwdInputPrimitiveFactory& GetInstance() {
-    static MklConv2DBwdInputPrimitiveFactory instance_;
+  static MklConvBwdInputPrimitiveFactory& GetInstance() {
+    static MklConvBwdInputPrimitiveFactory instance_;
     return instance_;
   }
 
   static string CreateKey(const MklConvBwdInputParams& convBwdInputDims) {
-    string prefix = "conv2d_bwd_input";
+    string prefix = "conv_bwd_input";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convBwdInputDims.diff_src_dims);
@@ -279,14 +278,13 @@ class MklConv2DBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
     return key_creator.GetKey();
   }
 
-  MklPrimitive* GetConv2dBwdInput(
-      const MklConvBwdInputParams& convBwdInputDims) {
+  MklPrimitive* GetConvBwdInput(const MklConvBwdInputParams& convBwdInputDims) {
     string key = CreateKey(convBwdInputDims);
     return this->GetOp(key);
   }
 
-  void SetConv2dBwdInput(
-      const MklConvBwdInputParams& convBwdInputDims, MklPrimitive *op) {
+  void SetConvBwdInput(const MklConvBwdInputParams& convBwdInputDims,
+                       MklPrimitive* op) {
     string key = CreateKey(convBwdInputDims);
     this->SetOp(key, op);
   }
@@ -594,23 +592,34 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
   TensorFormat data_format;
 };
 
+#define REGISTER_MKL_CPU_KERNELS(T)                                 \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklConv2DCustomBackpropInputOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
+#undef REGISTER_MKL_CPU_KERNELS
+
 #else
 
 template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp
-    : public MklConv2DBackpropCommonOp<Device, T> {
+class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
  public:
-  explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) {
-  }
+  explicit MklConvCustomBackpropInputOp(OpKernelConstruction* context)
+      : MklConvBackpropCommonOp<Device, T>(context) {}
 
-  ~MklConv2DCustomBackpropInputOp() {}
+  ~MklConvCustomBackpropInputOp() {}
 
   void Compute(OpKernelContext* context) {
     try {
       MklDnnData<T> filter(&cpu_engine);
       MklDnnData<T> diff_dst(&cpu_engine);
 
+      // This flag indicate Conv2D or Conv3D
+      bool isConv2D = (this->strides_.size() == 4);
+
       // Input tensors
       const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
       const Tensor& src_tensor = MklGetInput(context, kInputIdx);
@@ -626,7 +635,7 @@ class MklConv2DCustomBackpropInputOp
                         diff_dst_mkl_shape);
 
       // Allow operator-specific generation of shapes.
-      // E.g., Conv2DBackpropFilter gets filter as filter_sizes. It is a
+      // E.g., ConvBackpropFilter gets filter as filter_sizes. It is a
       // tensor containing shape of filter. So filter.shape() is not
       // a correct way to get filter shape. These operator-specific calls
       // allow this class to handle this case.
@@ -655,6 +664,7 @@ class MklConv2DCustomBackpropInputOp
         }
         return;
       }
+
       // By default, all dims are in MKL order. Only dims in TF order
       // are those with postfix tf_order.
       memory::dims diff_dst_dims, fwd_src_dims, fwd_filter_dims;
@@ -673,15 +683,18 @@ class MklConv2DCustomBackpropInputOp
       // Create Convolution forward descriptor since Convolution backward
       // API needs it. For that, we first need to create input, filter
       // and output memory descriptors.
-      auto tf_fmt = TFDataFormatToMklDnnDataFormat(this->data_format_);
+      auto tf_fmt = isConv2D
+                        ? TFDataFormatToMklDnnDataFormat(this->data_format_)
+                        : TFDataFormatToMklDnn3DDataFormat(this->data_format_);
 
       // If filter is in MKL layout, then simply grab filter layout;
       // otherwise, construct filter in TF layout.
       // For TF layout, filter is in HWIO format.
       auto fwd_filter_md = filter_mkl_shape.IsMklTensor()
-                         ? filter_mkl_shape.GetMklLayout()
-                         : memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                        memory::format::hwio);
+                               ? filter_mkl_shape.GetMklLayout()
+                               : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                                              isConv2D ? memory::format::hwio
+                                                       : memory::format::dhwio);
 
       conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
       if (!context->status().ok()) return;
@@ -689,18 +702,15 @@ class MklConv2DCustomBackpropInputOp
                        ? diff_dst_mkl_shape.GetMklLayout()
                        : memory::desc(diff_dst_dims,
                            MklDnnType<T>(), tf_fmt);
+      for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
-      dilations[kDilationH] -= 1;
-      dilations[kDilationW] -= 1;
-
-      MklConv2DBwdInputPrimitive<T> *conv2d_bwd_input = nullptr;
-      conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
+      MklConvBwdInputPrimitive<T>* conv_bwd_input = nullptr;
       MklConvBwdInputParams convBwdInputDims(fwd_src_dims, fwd_filter_dims,
           diff_dst_dims, strides, dilations, padding_left, padding_right,
           TFPaddingToMklDnnPadding(this->padding_));
-      conv2d_bwd_input = MklConv2DBwdInputPrimitiveFactory<T>::Get(
-          convBwdInputDims);
-      auto bwd_input_pd = conv2d_bwd_input->GetPrimitiveDesc();
+      conv_bwd_input =
+          MklConvBwdInputPrimitiveFactory<T>::Get(convBwdInputDims);
+      auto bwd_input_pd = conv_bwd_input->GetPrimitiveDesc();
 
       // allocate output tensor
       auto diff_src_pd = bwd_input_pd->diff_src_primitive_desc();
@@ -723,7 +733,7 @@ class MklConv2DCustomBackpropInputOp
       // check if filter and diff_dst need reorder
       T* filter_data = nullptr;
       if (fwd_filter_md.data.format !=
-          conv2d_bwd_input->GetFilterMemoryFormat()) {
+          conv_bwd_input->GetFilterMemoryFormat()) {
         filter.SetUsrMem(fwd_filter_md, &filter_tensor);
         filter.CheckReorderToOpMem(bwd_input_pd->weights_primitive_desc());
         filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
@@ -733,8 +743,7 @@ class MklConv2DCustomBackpropInputOp
       }
 
       T* diff_dst_data = nullptr;
-      if (diff_dst_md.data.format !=
-          conv2d_bwd_input->GetDiffDstMemoryFormat()) {
+      if (diff_dst_md.data.format != conv_bwd_input->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
         diff_dst.CheckReorderToOpMem(bwd_input_pd->diff_dst_primitive_desc());
         diff_dst_data = static_cast<T*>(
@@ -745,7 +754,7 @@ class MklConv2DCustomBackpropInputOp
       }
 
       // execute convolution input bwd
-      conv2d_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data);
+      conv_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data);
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -770,7 +779,7 @@ class MklConv2DCustomBackpropInputOp
     // of the Tensor and never an actual tensor. So it will never be in MKL
     // layout.
     CHECK(!input_mkl_shape.IsMklTensor())
-        << "Conv2DBackpropInput: input should not be in MKL Layout";
+        << "ConvBackpropInput: input should not be in MKL Layout";
   }
 
   // Get TensorFlow shape of input tensor.
@@ -778,10 +787,10 @@ class MklConv2DCustomBackpropInputOp
                                const Tensor& input_tensor) {
     TensorShape input_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
-    CHECK_EQ(
-        TensorShapeUtils::MakeShape(input_tensor.vec<int32>(), &input_tf_shape)
-            .ok(),
-        true);
+    // Conv[2D|3D]BackpropInputV2 supports both DT_INT32 and DT_INT64
+    // output_shape MakeShape is able to handle both DT_INT32 and DT_INT64 for
+    // input_tensor.
+    CHECK_EQ(this->MakeShape(input_tensor, &input_tf_shape).ok(), true);
     return input_tf_shape;
   }
 
@@ -792,7 +801,7 @@ class MklConv2DCustomBackpropInputOp
   }
 
   // Get the Tensorflow shape of Output (diff_src),
-  // which is same as shape of Conv2D 'input'.
+  // which is same as shape of Conv 'input'.
   TensorShape GetOutputTfShape(const TensorShape& input_shape,
                                const TensorShape& filter_shape,
                                const TensorShape& outbprop_shape) {
@@ -800,7 +809,7 @@ class MklConv2DCustomBackpropInputOp
   }
 
   // Get the Tensorflow shape of Output (diff_src),
-  // which is same as shape of Conv2D 'input'.
+  // which is same as shape of Conv 'input'.
   const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
                                     const memory::dims& fwd_filter_dims) {
     return fwd_input_dims;
@@ -839,17 +848,22 @@ class MklConv2DCustomBackpropInputOp
   }
 };
 
-#endif  // INTEL_MKL_ML_ONLY
-
-#define REGISTER_MKL_CPU_KERNELS(T)                                 \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DCustomBackpropInputOp<CPUDevice, T>);
+#define REGISTER_MKL_CPU_KERNELS(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")              \
+                              .Device(DEVICE_CPU)                      \
+                              .TypeConstraint<T>("T")                  \
+                              .Label(mkl_op_registry::kMklOpLabel),    \
+                          MklConvCustomBackpropInputOp<CPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropInputV2")            \
+                              .Device(DEVICE_CPU)                      \
+                              .TypeConstraint<T>("T")                  \
+                              .Label(mkl_op_registry::kMklOpLabel),    \
+                          MklConvCustomBackpropInputOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
 #undef REGISTER_MKL_CPU_KERNELS
 
+#endif  // INTEL_MKL_ML_ONLY
+
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index bca1aa21a8..c6295c7280 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -85,9 +85,9 @@ struct MklConvFwdParams {
 };
 
 template <typename T>
-class MklConv2DFwdPrimitive : public MklPrimitive {
+class MklConvFwdPrimitive : public MklPrimitive {
  public:
-  explicit MklConv2DFwdPrimitive(const MklConvFwdParams& convFwdDims)
+  explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims)
       : cpu_engine_(engine::cpu, 0) {
     context_.fwd_stream.reset(new stream(stream::kind::eager));
     // create conv primitive
@@ -96,7 +96,7 @@ class MklConv2DFwdPrimitive : public MklPrimitive {
     }
   }
 
-  ~MklConv2DFwdPrimitive() {}
+  ~MklConvFwdPrimitive() {}
 
   // Convolution forward execute with bias
   //   src_data:    input data buffer of src
@@ -269,37 +269,36 @@ class MklConv2DFwdPrimitive : public MklPrimitive {
 };
 
 template <typename T>
-class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
+class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
  public:
-  static MklConv2DFwdPrimitive<T>* Get(const MklConvFwdParams& convFwdDims) {
-    MklConv2DFwdPrimitive<T>* conv2d_fwd = nullptr;
+  static MklConvFwdPrimitive<T>* Get(const MklConvFwdParams& convFwdDims) {
+    MklConvFwdPrimitive<T>* conv_fwd = nullptr;
 
     // try to find a suitable one in pool
-    conv2d_fwd = dynamic_cast<MklConv2DFwdPrimitive<T>*>(
-        MklConv2DFwdPrimitiveFactory<T>::GetInstance().GetConv2DFwd(
-            convFwdDims));
-
-    if (conv2d_fwd == nullptr) {
-      conv2d_fwd = new MklConv2DFwdPrimitive<T>(convFwdDims);
-      MklConv2DFwdPrimitiveFactory<T>::GetInstance().SetConv2DFwd(convFwdDims,
-                                                                  conv2d_fwd);
+    conv_fwd = dynamic_cast<MklConvFwdPrimitive<T>*>(
+        MklConvFwdPrimitiveFactory<T>::GetInstance().GetConvFwd(convFwdDims));
+
+    if (conv_fwd == nullptr) {
+      conv_fwd = new MklConvFwdPrimitive<T>(convFwdDims);
+      MklConvFwdPrimitiveFactory<T>::GetInstance().SetConvFwd(convFwdDims,
+                                                              conv_fwd);
     }
-    return conv2d_fwd;
+    return conv_fwd;
   }
 
  private:
-  MklConv2DFwdPrimitiveFactory() {}
-  ~MklConv2DFwdPrimitiveFactory() {}
+  MklConvFwdPrimitiveFactory() {}
+  ~MklConvFwdPrimitiveFactory() {}
 
   static const int kDilationH = 0, kDilationW = 1;
 
-  static MklConv2DFwdPrimitiveFactory& GetInstance() {
-    static MklConv2DFwdPrimitiveFactory instance_;
+  static MklConvFwdPrimitiveFactory& GetInstance() {
+    static MklConvFwdPrimitiveFactory instance_;
     return instance_;
   }
 
   static string CreateKey(const MklConvFwdParams& convFwdDims) {
-    string prefix = "conv2d_fwd_";
+    string prefix = "conv_fwd_";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convFwdDims.src_dims);
@@ -313,12 +312,12 @@ class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
     return key_creator.GetKey();
   }
 
-  MklPrimitive* GetConv2DFwd(const MklConvFwdParams& convFwdDims) {
+  MklPrimitive* GetConvFwd(const MklConvFwdParams& convFwdDims) {
     string key = CreateKey(convFwdDims);
     return this->GetOp(key);
   }
 
-  void SetConv2DFwd(const MklConvFwdParams& convFwdDims, MklPrimitive* op) {
+  void SetConvFwd(const MklConvFwdParams& convFwdDims, MklPrimitive* op) {
     string key = CreateKey(convFwdDims);
     this->SetOp(key, op);
   }
@@ -331,11 +330,11 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 // For now, MKL-ML is default. So making MKL-DNN not a default choice.
 #ifdef INTEL_MKL_ML_ONLY
 template <typename Device, typename T, bool biasEnabled>
-class MklConv2DOp : public OpKernel {
+class MklConvOp : public OpKernel {
  public:
-  ~MklConv2DOp() {}
+  ~MklConvOp() {}
 
-  explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit MklConvOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
@@ -755,21 +754,22 @@ class MklConv2DOp : public OpKernel {
 
 #else
 
+// Base class for convolution forward operations
 template <typename Device, typename T, bool biasEnabled>
-class MklConv2DOp : public OpKernel {
+class MklConvOp : public OpKernel {
  public:
-  ~MklConv2DOp() {}
+  ~MklConvOp() {}
 
-  explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit MklConvOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES(context, strides_.size() == 4,
+    OP_REQUIRES(context, (strides_.size() == 4 || strides_.size() == 5),
                 errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
+                                        "specify 4 or 5 dimensions"));
 
     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
@@ -778,20 +778,39 @@ class MklConv2DOp : public OpKernel {
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES(context, dilations_.size() == 4,
-                errors::InvalidArgument("Sliding window dilations field must "
-                                        "specify 4 dimensions"));
-    const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N');
-    const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C');
-    const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H');
-    const int64 dilation_w = GetTensorDim(dilations_, data_format_, 'W');
-    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
-                errors::InvalidArgument(
-                    "Current implementation does not yet support "
-                    "dilations in the batch and depth dimensions."));
-    OP_REQUIRES(
-        context, dilation_h > 0 && dilation_w > 0,
-        errors::InvalidArgument("Dilated rates should be larger than 0."));
+
+    if (strides_.size() == 4) {
+      OP_REQUIRES(context, dilations_.size() == 4,
+                  errors::InvalidArgument("Sliding window dilations field must "
+                                          "specify 4 dimensions"));
+      const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+      const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+      const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+      const int64 dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+      OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                  errors::InvalidArgument(
+                      "Current implementation does not yet support "
+                      "dilations in the batch and depth dimensions."));
+      OP_REQUIRES(
+          context, dilation_h > 0 && dilation_w > 0,
+          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    } else if (strides_.size() == 5) {
+      OP_REQUIRES(context, dilations_.size() == 5,
+                  errors::InvalidArgument("Dilation rates field must "
+                                          "specify 5 dimensions"));
+      OP_REQUIRES(context,
+                  (GetTensorDim(dilations_, data_format_, 'N') == 1 &&
+                   GetTensorDim(dilations_, data_format_, 'C') == 1),
+                  errors::InvalidArgument(
+                      "Current implementation does not yet support "
+                      "dilations rates in the batch and depth dimensions."));
+      OP_REQUIRES(
+          context,
+          (GetTensorDim(dilations_, data_format_, '0') > 0 &&
+           GetTensorDim(dilations_, data_format_, '1') > 0 &&
+           GetTensorDim(dilations_, data_format_, '2') > 0),
+          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    }
   }
 
   void Compute(OpKernelContext* context) override {
@@ -837,7 +856,8 @@ class MklConv2DOp : public OpKernel {
         AllocateOutputSetMklShape(context, kOutputIndex_Dst,
                     &dst_tensor, src_tf_shape, dst_mkl_shape);
 
-        // MklConv2D also outputs converted filter as 2nd output of Conv2D.
+        // MklConv2D/3D also outputs converted filter
+        // as 2nd output of Conv2D/3D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
@@ -846,15 +866,20 @@ class MklConv2DOp : public OpKernel {
         return;
       }
 
+      bool isConv2D = (strides_.size() == 4);
+
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
-      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+      auto tf_fmt = isConv2D ? TFDataFormatToMklDnnDataFormat(data_format_)
+                             : TFDataFormatToMklDnn3DDataFormat(data_format_);
 
       // If input is in MKL layout, then simply grab input layout; otherwise,
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
-      // layout (NHWC or NCHW depending on data format).
+      // layout depending on data format:
+      //     Conv2D: NHWC or NCHW
+      //     Conv3D: NDHWC or NCDHW
       auto src_md = src_mkl_shape.IsMklTensor()
                         ? src_mkl_shape.GetMklLayout()
                         : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
@@ -864,31 +889,30 @@ class MklConv2DOp : public OpKernel {
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
                            ? filter_mkl_shape.GetMklLayout()
                            : memory::desc(filter_dims, MklDnnType<T>(),
-                                          memory::format::hwio);
-
+                                          isConv2D ? memory::format::hwio
+                                                   : memory::format::dhwio);
       // MKLDNN dilation starts from 0.
-      dilations[kDilationH] -= 1;
-      dilations[kDilationW] -= 1;
+      for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
       // get a conv2d fwd from primitive pool
-      MklConv2DFwdPrimitive<T>* conv2d_fwd = nullptr;
+      MklConvFwdPrimitive<T>* conv_fwd = nullptr;
       if (biasEnabled) {
         memory::dims bias_dims = {};
         conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
         MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims,
                                      dst_dims_mkl_order, strides, dilations,
                                      padding_left, padding_right);
-        conv2d_fwd = MklConv2DFwdPrimitiveFactory<T>::Get(convFwdDims);
+        conv_fwd = MklConvFwdPrimitiveFactory<T>::Get(convFwdDims);
       } else {
         MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS,
                                      dst_dims_mkl_order, strides, dilations,
                                      padding_left, padding_right);
-        conv2d_fwd = MklConv2DFwdPrimitiveFactory<T>::Get(convFwdDims);
+        conv_fwd = MklConvFwdPrimitiveFactory<T>::Get(convFwdDims);
       }
 
       // allocate output tensors output_tensor and filter_out_tensor
       std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd =
-          conv2d_fwd->GetPrimitiveDesc();
+          conv_fwd->GetPrimitiveDesc();
       AllocateOutputTensor(context, *conv_fwd_pd,
                        dst_dims_mkl_order, tf_fmt, &dst_tensor);
       Tensor* filter_out_tensor = nullptr;
@@ -900,7 +924,7 @@ class MklConv2DOp : public OpKernel {
 
       // check whether src/filter need reorder
       T *src_data = nullptr;
-      if (src_md.data.format != conv2d_fwd->GetSrcMemoryFormat()) {
+      if (src_md.data.format != conv_fwd->GetSrcMemoryFormat()) {
         src.SetUsrMem(src_md, &src_tensor);
         src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc());
         src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
@@ -908,7 +932,7 @@ class MklConv2DOp : public OpKernel {
         src_data = static_cast<T*>(const_cast<T*>(src_tensor.flat<T>().data()));
       }
       T* filter_data = nullptr;
-      if (filter_md.data.format != conv2d_fwd->GetFilterMemoryFormat()) {
+      if (filter_md.data.format != conv_fwd->GetFilterMemoryFormat()) {
         filter.SetUsrMem(filter_md, &filter_tensor);
         filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(),
                                    filter.GetTensorBuffer(filter_out_tensor));
@@ -918,16 +942,15 @@ class MklConv2DOp : public OpKernel {
             static_cast<T*>(const_cast<T*>(filter_tensor.flat<T>().data()));
       }
 
-
       // execute convolution
       if (biasEnabled) {
         const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
         T* bias_data = static_cast<T*>(const_cast<T*>(
             bias_tensor.flat<T>().data()));
 
-        conv2d_fwd->Execute(src_data, filter_data, bias_data, dst_data);
+        conv_fwd->Execute(src_data, filter_data, bias_data, dst_data);
       } else {
-        conv2d_fwd->Execute(src_data, filter_data, dst_data);
+        conv_fwd->Execute(src_data, filter_data, dst_data);
       }
     } catch (mkldnn::error &e) {
       string error_msg = tensorflow::strings::StrCat(
@@ -1038,17 +1061,18 @@ class MklConv2DOp : public OpKernel {
 
 #endif
 
+// Register 2D operations
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                        \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DOp<CPUDevice, T, false>);        \
+                          MklConvOp<CPUDevice, T, false>);          \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DOp<CPUDevice, T, true>);         \
+                          MklConvOp<CPUDevice, T, true>);           \
   REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")          \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
@@ -1057,5 +1081,14 @@ class MklConv2DOp : public OpKernel {
 
 TF_CALL_float(REGISTER_MKL_CPU);
 
+// Register 3D operations
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3D")                        \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklConvOp<CPUDevice, T, false>);
+TF_CALL_float(REGISTER_MKL_CPU);
+
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 838c06f49d..01cc606f41 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -79,9 +79,16 @@ class MklDnnConvUtil {
     // For now we take the stride from the second and third dimensions only
     // (we do not support striding on the batch or depth dimension).
     CHECK_NOTNULL(strides);
-    int stride_rows = GetTensorDim(strides_, data_format_, 'H');
-    int stride_cols = GetTensorDim(strides_, data_format_, 'W');
-    *strides = {stride_rows, stride_cols};
+    if (strides_.size() == 4) {
+      int stride_rows = GetTensorDim(strides_, data_format_, 'H');
+      int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+      *strides = {stride_rows, stride_cols};
+    } else if (strides_.size() == 5) {
+      int stride_planes = GetTensorDim(strides_, data_format_, '0');
+      int stride_rows = GetTensorDim(strides_, data_format_, '1');
+      int stride_cols = GetTensorDim(strides_, data_format_, '2');
+      *strides = {stride_planes, stride_rows, stride_cols};
+    }
   }
 
   // Calculate Convolution dilations
@@ -89,13 +96,20 @@ class MklDnnConvUtil {
     // For now we take the dilation from the second and third dimensions only
     // (we do not support dilation on the batch or depth dimension).
     CHECK_NOTNULL(dilations);
-    int dilations_rows = GetTensorDim(dilations_, data_format_, 'H');
-    int dilations_cols = GetTensorDim(dilations_, data_format_, 'W');
-    *dilations = {dilations_rows, dilations_cols};
+    if (dilations_.size() == 4) {
+      int dilations_rows = GetTensorDim(dilations_, data_format_, 'H');
+      int dilations_cols = GetTensorDim(dilations_, data_format_, 'W');
+      *dilations = {dilations_rows, dilations_cols};
+    } else if (dilations_.size() == 5) {
+      int dilations_planes = GetTensorDim(dilations_, data_format_, '0');
+      int dilations_rows = GetTensorDim(dilations_, data_format_, '1');
+      int dilations_cols = GetTensorDim(dilations_, data_format_, '2');
+      *dilations = {dilations_planes, dilations_rows, dilations_cols};
+    }
   }
 
   // Calculate Convolution input size in MKL-DNN order. MKL-DNN
-  // requires input in NCHW format. Function does not return anything.
+  // requires input in NCHW/NCDHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
   virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape,
@@ -113,40 +127,62 @@ class MklDnnConvUtil {
     int64 input_depth_raw = GetTensorDim(input_shape, data_format_, 'C');
     int input_depth = static_cast<int>(input_depth_raw);
 
-    // Input rows/height
-    int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');
-    CHECK_BOUNDS(input_rows_raw, "Input rows too large");
-    int input_rows = static_cast<int>(input_rows_raw);
-
-    // Input columns/width
-    int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');
-    CHECK_BOUNDS(input_cols_raw, "Input cols too large");
-    int input_cols = static_cast<int>(input_cols_raw);
-
     // Input batch
     int64 input_batch_raw = GetTensorDim(input_shape, data_format_, 'N');
     CHECK_BOUNDS(input_batch_raw, "Input batch too large");
     int input_batch = static_cast<int>(input_batch_raw);
 
+    if (strides_.size() == 4) {  // NCHW format for Conv2D
+      // Input rows/height
+      int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');
+      CHECK_BOUNDS(input_rows_raw, "Input rows too large");
+      int input_rows = static_cast<int>(input_rows_raw);
+
+      // Input columns/width
+      int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');
+      CHECK_BOUNDS(input_cols_raw, "Input cols too large");
+      int input_cols = static_cast<int>(input_cols_raw);
+
+      // MKL-DNN always requires input in NCHW format Conv2D.
+      std::vector<int> mkldnn_sizes(4, -1);
+      mkldnn_sizes[MklDnnDims::Dim_N] = input_batch;
+      mkldnn_sizes[MklDnnDims::Dim_C] = input_depth;
+      mkldnn_sizes[MklDnnDims::Dim_H] = input_rows;
+      mkldnn_sizes[MklDnnDims::Dim_W] = input_cols;
+
+      *input_dims = mkldnn_sizes;
+    } else if (strides_.size() == 5) {  // NCDHW format for Conv3D
+      // Input planes/third-dimension
+      int64 input_planes_raw = GetTensorDim(input_shape, data_format_, '0');
+      CHECK_BOUNDS(input_planes_raw, "Input depth too large");
+      int input_planes = static_cast<int>(input_planes_raw);
+
+      // Input rows/height
+      int64 input_rows_raw = GetTensorDim(input_shape, data_format_, '1');
+      CHECK_BOUNDS(input_rows_raw, "Input rows too large");
+      int input_rows = static_cast<int>(input_rows_raw);
+
+      // Input columns/width
+      int64 input_cols_raw = GetTensorDim(input_shape, data_format_, '2');
+      CHECK_BOUNDS(input_cols_raw, "Input cols too large");
+      int input_cols = static_cast<int>(input_cols_raw);
+
+      // MKL-DNN always requires input in NCDHW format for Conv3D.
+      std::vector<int> mkldnn_sizes(5, -1);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_N] = input_batch;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_C] = input_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_D] = input_planes;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_H] = input_rows;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_W] = input_cols;
+
+      *input_dims = mkldnn_sizes;
+    }
 #undef CHECK_BOUNDS
-
-    // MKL-DNN always requires input in NCHW format.
-    std::vector<int> mkldnn_sizes(4, -1);
-    mkldnn_sizes[MklDnnDims::Dim_N] = input_batch;
-    mkldnn_sizes[MklDnnDims::Dim_C] = input_depth;
-    mkldnn_sizes[MklDnnDims::Dim_H] = input_rows;
-    mkldnn_sizes[MklDnnDims::Dim_W] = input_cols;
-
-    *input_dims = mkldnn_sizes;
   }
 
-  // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
-  // requires filter in OIHW format. Function does not return anything.
-  // But errors arising from sanity checks are returned in context's
-  // status.
-  //
-  // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
-  // requires filter in OIHW format. Function does not return anything.
+  // Calculate Convolution filter size in MKL-DNN order.
+  // MKL-DNN requires filter in OIHW (Conv2D) or OIDHW (Conv3D) format.
+  // Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status. This function differs from GetConvFilterSizeInMklOrder in
   // parameter for input - it accepts src_shape since Convolution Backward
@@ -159,11 +195,13 @@ class MklDnnConvUtil {
                                               memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
 
-    OP_REQUIRES(context_, filter_shape.dims() == 4,
-                errors::InvalidArgument("filter must be 4-dimensional: ",
+    OP_REQUIRES(context_, filter_shape.dims() == strides_.size(),
+                errors::InvalidArgument((strides_.size() == 4)
+                                            ? "filter must be 4-dimensional: "
+                                            : "filter must be 5-dimensional: ",
                                         filter_shape.DebugString()));
 
-    for (int i = 0; i < 3; i++) {
+    for (int i = 0; i < ((strides_.size() == 4) ? 3 : 5); i++) {
       OP_REQUIRES(context_,
                   FastBoundsCheck(filter_shape.dim_size(i),
                                   std::numeric_limits<int>::max()),
@@ -172,32 +210,57 @@ class MklDnnConvUtil {
 
     int input_depth = GetTensorDim(input_shape, data_format_, 'C');
 
-    OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
-                errors::InvalidArgument(
-                    "input and filter must have the same depth: ", input_depth,
-                    " vs ", filter_shape.dim_size(2)));
-
-    // TF filter is always in (rows, cols, in_depth, out_depth) order.
-    int filter_rows = static_cast<int>(filter_shape.dim_size(0));
-    int filter_cols = static_cast<int>(filter_shape.dim_size(1));
-    int in_depth = static_cast<int>(filter_shape.dim_size(2));
-    int out_depth = static_cast<int>(filter_shape.dim_size(3));
-
-    // MKL-DNN always needs filter in OIHW format.
-    // OIHW = (out_depth, in_depth, rows, cols)
-    std::vector<int> mkldnn_sizes(4, -1);
-    mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
-    mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
-    mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
-    mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
-
-    *filter_dims = mkldnn_sizes;
+    if (strides_.size() == 4) {  // Conv2D
+      OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
+                  errors::InvalidArgument(
+                      "input and filter must have the same depth: ",
+                      input_depth, " vs ", filter_shape.dim_size(2)));
+
+      // TF filter is always in (rows, cols, in_depth, out_depth) order.
+      int filter_rows = static_cast<int>(filter_shape.dim_size(0));
+      int filter_cols = static_cast<int>(filter_shape.dim_size(1));
+      int in_depth = static_cast<int>(filter_shape.dim_size(2));
+      int out_depth = static_cast<int>(filter_shape.dim_size(3));
+
+      // MKL-DNN always needs filter in OIHW format.
+      // OIHW = (out_depth, in_depth, rows, cols)
+      std::vector<int> mkldnn_sizes(4, -1);
+      mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
+      mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
+      mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
+      mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
+
+      *filter_dims = mkldnn_sizes;
+    } else {  // Conv3D
+      OP_REQUIRES(context_, input_depth == filter_shape.dim_size(3),
+                  errors::InvalidArgument(
+                      "input and filter must have the same depth: ",
+                      input_depth, " vs ", filter_shape.dim_size(3)));
+
+      // TF filter is always in (planes, rows, cols, in_depth, out_depth) order.
+      int filter_planes = static_cast<int>(filter_shape.dim_size(0));
+      int filter_rows = static_cast<int>(filter_shape.dim_size(1));
+      int filter_cols = static_cast<int>(filter_shape.dim_size(2));
+      int in_depth = static_cast<int>(filter_shape.dim_size(3));
+      int out_depth = static_cast<int>(filter_shape.dim_size(4));
+
+      // MKL-DNN always needs filter in OIDHW format.
+      // OIDHW = (out_depth, in_depth, planes, rows, cols)
+      std::vector<int> mkldnn_sizes(5, -1);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_O] = out_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_I] = in_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_D] = filter_planes;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_H] = filter_rows;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_W] = filter_cols;
+
+      *filter_dims = mkldnn_sizes;
+    }
   }
 
-  // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
-  // requires filter in OIHW format. Function does not return anything.
-  // But errors arising from sanity checks are returned in context's
-  // status.
+  // Calculate Convolution filter size in MKL-DNN order.
+  // MKL-DNN requires filter in OIHW (Conv2D) or OIDHW(Conv3D format.
+  // Function does not return anything. But errors arising from sanity
+  // checks are returned in context's status.
   virtual inline void GetFilterSizeInMklOrder(size_t src_index,
                                               size_t filter_index,
                                               memory::dims* filter_dims) {
@@ -206,8 +269,8 @@ class MklDnnConvUtil {
                             GetTfShape(context_, filter_index), filter_dims);
   }
 
-  // Calculate Bias size for 2D Convolution. Function does not return
-  // anything, but sets error in context status.
+  // Calculate Bias size for 2D or 3D Convolution. Function does not
+  // return anything, but may set an error in context status.
   virtual inline void GetBiasSizeInMklOrder(size_t bias_index,
                                             memory::dims* bias_dims) {
     const Tensor& bias = MklGetInput(context_, bias_index);
@@ -218,73 +281,142 @@ class MklDnnConvUtil {
     *bias_dims = {static_cast<int>(bias.dim_size(0))};
   }
 
-  // Function to calculate output and padding size for 2D convolution.
+  // Function to calculate output and padding size for 2D/3D convolution.
   //
   // Calculate output shape of Convolution in MKL-DNN and TensorFlow order.
-  // MKL-DNN uses NCHW for output order. But TensorFlow output will be in
-  // NHWC or NCHW format depending on data format. Function also calculates
-  // left, right, top and bottom pads. Function does not return any status -
-  // status is returned via context status.
+  // MKL-DNN uses NCHW(Conv2D) or NCDHW(Conv3D) for output order.
+  // But TensorFlow output will be in NHWC||NCHW(Conv2D) or
+  // NDHWC||NCDHW(Conv3D) format depending on data format.
+  // Function also calculates left, right, top and bottom pads.
+  // Function does not return any status which is set with context status.
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
   virtual inline void GetOutputAndPadSizeInMklOrder(
       const TensorShape& input_shape, const TensorShape& filter_shape,
       const memory::dims& strides, const memory::dims& dilations,
-      memory::dims* output_dims_tf_order,
-      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
-      memory::dims* pad_r) {
+      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
+      memory::dims* pad_l, memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
     CHECK_NOTNULL(pad_r);
 
-    int input_rows = GetTensorDim(input_shape, data_format_, 'H');
-    int input_cols = GetTensorDim(input_shape, data_format_, 'W');
+    bool isConv2D = (strides_.size() == 4);
+    int input_planes, input_rows, input_cols;
+    if (isConv2D) {
+      input_rows = GetTensorDim(input_shape, data_format_, 'H');
+      input_cols = GetTensorDim(input_shape, data_format_, 'W');
+    } else {
+      input_planes = GetTensorDim(input_shape, data_format_, '0');
+      input_rows = GetTensorDim(input_shape, data_format_, '1');
+      input_cols = GetTensorDim(input_shape, data_format_, '2');
+    }
 
-    // The first dimension for filter is rows/height.
-    int filter_rows = filter_shape.dim_size(0);
-    // The second dimension for filter is cols/width.
-    int filter_cols = filter_shape.dim_size(1);
+    // Filter dimension
+    // Conv2D:
+    //    First dimension: rows/height.
+    //    Second dimension: cols/width.
+    // Conv3D:
+    //    First dimension: planes/depth.
+    //    Second dimension: rows/height.
+    //    Third dimension: cols/width.
+
+    int filter_planes, filter_rows, filter_cols;
+    if (isConv2D) {
+      filter_rows = filter_shape.dim_size(0);
+      filter_cols = filter_shape.dim_size(1);
+    } else {
+      filter_planes = filter_shape.dim_size(0);
+      filter_rows = filter_shape.dim_size(1);
+      filter_cols = filter_shape.dim_size(2);
+    }
 
-    // Stride is vector of 2 elements: {s_r, s_c}
-    int stride_rows = strides[0];
-    int stride_cols = strides[1];
-    int dilation_rows = dilations[0];
-    int dilation_cols = dilations[1];
+    int stride_planes, stride_rows, stride_cols;
+    int dilation_planes, dilation_rows, dilation_cols;
+    if (isConv2D) {
+      // Conv2D stride is a vector of 2 elements: {s_r, s_c}
+      stride_rows = strides[0];
+      stride_cols = strides[1];
+      dilation_rows = dilations[0];
+      dilation_cols = dilations[1];
+    } else {
+      // Conv3D stride is a vector of 3 elements: {s_d, s_r, s_c}
+      stride_planes = strides[0];
+      stride_rows = strides[1];
+      stride_cols = strides[2];
+      dilation_planes = dilations[0];
+      dilation_rows = dilations[1];
+      dilation_cols = dilations[2];
+    }
 
     // Output batch is same as input batch.
     int out_batch = GetTensorDim(input_shape, data_format_, 'N');
+
     // Output depth is same as last dimension for filter.
-    int out_depth = filter_shape.dim_size(3);
+    int out_depth = filter_shape.dim_size(isConv2D ? 3 : 4);
 
-    int64 out_rows = 0, out_cols = 0;
+    int64 out_rows = 0, out_cols = 0, out_planes = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
+    int64 pad_D1, pad_D2;
+
+    if (isConv2D) {
+      OP_REQUIRES_OK(context_,
+                     GetWindowedOutputSizeVerboseV2(
+                         input_rows, filter_rows, dilation_rows, stride_rows,
+                         padding_, &out_rows, &pad_top, &pad_bottom));
+      OP_REQUIRES_OK(context_,
+                     GetWindowedOutputSizeVerboseV2(
+                         input_cols, filter_cols, dilation_cols, stride_cols,
+                         padding_, &out_cols, &pad_left, &pad_right));
+    } else {
+      OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                   input_planes, filter_planes, stride_planes,
+                                   padding_, &out_planes, &pad_D1, &pad_D2));
+      OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                   input_rows, filter_rows, stride_rows,
+                                   padding_, &out_rows, &pad_top, &pad_bottom));
+      OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                   input_cols, filter_cols, stride_cols,
+                                   padding_, &out_cols, &pad_left, &pad_right));
+    }
 
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerboseV2(input_rows, filter_rows,
-                                 dilation_rows, stride_rows, padding_,
-                                 &out_rows, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerboseV2(input_cols, filter_cols,
-                                 dilation_cols, stride_cols, padding_,
-                                 &out_cols, &pad_left, &pad_right));
-
-    // Tensorflow output is in data_format order. (NHWC or NCHW)
+    // Tensorflow output is in data_format order.
+    //     Conv2D: NHWC or NCHW
+    //     Conv3D: NDHWC or NCDHW
+    // MKL-DNN uses asymetric padding.
     TensorShape out_shape =
-        ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth);
+        isConv2D
+            ? ShapeFromFormat(data_format_, out_batch, out_rows, out_cols,
+                              out_depth)
+            : ShapeFromFormat(data_format_, out_batch,
+                              {{out_planes, out_rows, out_cols}}, out_depth);
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
-    // MKL-DNN always needs output in NCHW format.
-    std::vector<int> mkldnn_sizes(4, -1);
-    mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
-    mkldnn_sizes[MklDnnDims::Dim_C] = out_depth;
-    mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
-    mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
-    *output_dims_mkl_order = mkldnn_sizes;
-
-    // Now handle padding. MKL-DNN uses asymetric padding.
-    *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
-    *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+    if (isConv2D) {
+      // For Conv2D, MKL-DNN always needs output in NCHW format.
+      std::vector<int> mkldnn_sizes(4, -1);
+      mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
+      mkldnn_sizes[MklDnnDims::Dim_C] = out_depth;
+      mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
+      mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
+      *output_dims_mkl_order = mkldnn_sizes;
+
+      *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
+      *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+    } else {
+      std::vector<int> mkldnn_sizes(5, -1);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_N] = out_batch;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_C] = out_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_D] = static_cast<int>(out_planes);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_H] = static_cast<int>(out_rows);
+      mkldnn_sizes[MklDnnDims3D::Dim3d_W] = static_cast<int>(out_cols);
+      *output_dims_mkl_order = mkldnn_sizes;
+
+      *pad_l = {static_cast<int>(pad_D1), static_cast<int>(pad_top),
+                static_cast<int>(pad_left)};
+      *pad_r = {static_cast<int>(pad_D2), static_cast<int>(pad_bottom),
+                static_cast<int>(pad_right)};
+    }
   }
 
   // Calculate output and pad size of forward Convolution operator.
@@ -292,10 +424,10 @@ class MklDnnConvUtil {
   //
   // Function does not return anything, but sets error in context status.
   inline void GetOutputAndPadSizeInMklOrder(
-      size_t src_index, size_t filter_index,
-      const memory::dims& strides, const memory::dims& dilations,
-      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
-      memory::dims* pad_l, memory::dims* pad_r) {
+      size_t src_index, size_t filter_index, const memory::dims& strides,
+      const memory::dims& dilations, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -304,9 +436,17 @@ class MklDnnConvUtil {
     auto input_tf_shape = GetTfShape(context_, src_index);
     auto filter_tf_shape = GetTfShape(context_, filter_index);
 
-    OP_REQUIRES(context_, input_tf_shape.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        input_tf_shape.DebugString()));
+    if (strides_.size() == 4) {
+      // Conv2D
+      OP_REQUIRES(context_, input_tf_shape.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional",
+                                          input_tf_shape.DebugString()));
+    } else {
+      // Conv3D
+      OP_REQUIRES(context_, input_tf_shape.dims() == 5,
+                  errors::InvalidArgument("input must be 5-dimensional",
+                                          input_tf_shape.DebugString()));
+    }
 
     GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
                                   strides, dilations, output_dims_tf_order,
@@ -314,9 +454,11 @@ class MklDnnConvUtil {
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
-  // 2D Convolution in MKL order (NCHW for input and output; OIHW for filter.)
-  // Function also calculates output shape in Tensorflow order. Additionally, it
-  // also calculates strides and paddings for 2D Convolution.
+  // Conv2D/Conv3D in MKL order:
+  //     Conv2D: NCHW for input and output; OIHW for filter.
+  //     Conv3D: NCDHW for input and output; OIDHW for filter.
+  // Function also calculates output shape in Tensorflow order.
+  // Additionally, it also calculates strides and paddings.
   //
   // Function does not return anything, but sets error in context status.
   inline void GetConvFwdSizesInMklOrder(
@@ -349,16 +491,15 @@ class MklDnnConvUtil {
   }
 };
 
-
 /////////////////////////////////////////////////////////////////////
-///  Common class that implements Conv2DBackpropFilter and Input
+///  Common class that implements ConvBackpropFilter and Input
 /////////////////////////////////////////////////////////////////////
 
 template <typename Device, class T>
-class MklConv2DBackpropCommonOp : public OpKernel {
+class MklConvBackpropCommonOp : public OpKernel {
  public:
-  ~MklConv2DBackpropCommonOp() {}
-  explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
+  ~MklConvBackpropCommonOp() {}
+  explicit MklConvBackpropCommonOp(OpKernelConstruction* context)
       : OpKernel(context) {
     string data_format_str;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
@@ -372,20 +513,25 @@ class MklConv2DBackpropCommonOp : public OpKernel {
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
-    OP_REQUIRES(context, dilations_.size() == 4,
-                errors::InvalidArgument("Sliding window dilations field must "
-                                        "specify 4 dimensions"));
-    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
-    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
-    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
-    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
-    OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
-                errors::InvalidArgument(
-                    "Current implementation does not yet support "
-                    "dilations in the batch and depth dimensions."));
-    OP_REQUIRES(
-        context, dilation_h > 0 && dilation_w > 0,
-        errors::InvalidArgument("Dilated rates should be larger than 0."));
+
+    if (strides_.size() == 4) {
+      // Check Conv2D dilations
+      OP_REQUIRES(context, dilations_.size() == 4,
+                  errors::InvalidArgument("Sliding window dilations field must "
+                                          "specify 4 dimensions"));
+      int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+      int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+      int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+      int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+      OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                  errors::InvalidArgument(
+                      "Current implementation does not yet support "
+                      "dilations in the batch and depth dimensions."));
+      OP_REQUIRES(
+          context, dilation_h > 0 && dilation_w > 0,
+          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    }
+
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
diff --git a/tensorflow/core/kernels/multinomial_op.h b/tensorflow/core/kernels/multinomial_op.h
index 6e41060aa4..34e2123613 100644
--- a/tensorflow/core/kernels/multinomial_op.h
+++ b/tensorflow/core/kernels/multinomial_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_MULTINOMIAL_OP_H_
-#define TENSORFLOW_KERNELS_MULTINOMIAL_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MULTINOMIAL_OP_H_
+#define TENSORFLOW_CORE_KERNELS_MULTINOMIAL_OP_H_
 
 namespace tensorflow {
 
@@ -27,4 +27,4 @@ struct MultinomialFunctor;
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_MULTINOMIAL_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MULTINOMIAL_OP_H_
diff --git a/tensorflow/core/kernels/neon/depthwiseconv_float.h b/tensorflow/core/kernels/neon/depthwiseconv_float.h
index 11f5be7c03..0d5a42bf10 100644
--- a/tensorflow/core/kernels/neon/depthwiseconv_float.h
+++ b/tensorflow/core/kernels/neon/depthwiseconv_float.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
-#define TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_FLOAT_H_
+#define TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_FLOAT_H_
 
 #include "public/gemmlowp.h"
 #include "tensorflow/core/kernels/neon/types.h"
@@ -722,4 +722,4 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 }  // end namespace neon
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/core/kernels/no_op.h b/tensorflow/core/kernels/no_op.h
index 29ea46aed6..9e16d06978 100644
--- a/tensorflow/core/kernels/no_op.h
+++ b/tensorflow/core/kernels/no_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_NO_OP_H_
-#define TENSORFLOW_KERNELS_NO_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NO_OP_H_
+#define TENSORFLOW_CORE_KERNELS_NO_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -29,4 +29,4 @@ class NoOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_NO_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NO_OP_H_
diff --git a/tensorflow/core/kernels/nth_element_op.h b/tensorflow/core/kernels/nth_element_op.h
index e7d25daecc..7a5ec3d0b5 100644
--- a/tensorflow/core/kernels/nth_element_op.h
+++ b/tensorflow/core/kernels/nth_element_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_NTH_ELEMENT_OP_H_
-#define TENSORFLOW_NTH_ELEMENT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NTH_ELEMENT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_NTH_ELEMENT_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -34,4 +34,4 @@ struct NthElementFunctor {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_NTH_ELEMENT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NTH_ELEMENT_OP_H_
diff --git a/tensorflow/core/kernels/one_hot_op.h b/tensorflow/core/kernels/one_hot_op.h
index db59f0f0d4..879df2b59b 100644
--- a/tensorflow/core/kernels/one_hot_op.h
+++ b/tensorflow/core/kernels/one_hot_op.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/array_ops.cc
 
-#ifndef TENSORFLOW_KERNELS_ONE_HOT_OP_H_
-#define TENSORFLOW_KERNELS_ONE_HOT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_
 // Generator definition for OneHotOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -69,4 +69,4 @@ struct OneHot {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_ONE_HOT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_
diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h
index 2c195beb7f..5d607b9044 100644
--- a/tensorflow/core/kernels/ops_testutil.h
+++ b/tensorflow/core/kernels/ops_testutil.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
-#define TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_
+#define TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_
 
 #include <memory>
 #include <vector>
@@ -252,4 +252,4 @@ class OpsTestBase : public ::testing::Test {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_OPS_TESTUTIL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
index 93ef512778..a496487d1b 100644
--- a/tensorflow/core/kernels/ops_util.h
+++ b/tensorflow/core/kernels/ops_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_OPS_UTIL_H_
-#define TENSORFLOW_KERNELS_OPS_UTIL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_
+#define TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_
 
 // This file contains utilities for various operations.
 
@@ -113,4 +113,4 @@ gtl::InlinedVector<T, 8> ComputeEigenStrides(const EigenDimensions& shape) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_OPS_UTIL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_
diff --git a/tensorflow/core/kernels/pad_op.h b/tensorflow/core/kernels/pad_op.h
index ee9e0f0330..ae79f515d9 100644
--- a/tensorflow/core/kernels/pad_op.h
+++ b/tensorflow/core/kernels/pad_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_PAD_OP_H_
-#define TENSORFLOW_KERNELS_PAD_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_PAD_OP_H_
+#define TENSORFLOW_CORE_KERNELS_PAD_OP_H_
 // Functor definition for PadOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -54,4 +54,4 @@ struct Pad<Device, T, Tpadding, 0> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_PAD_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_PAD_OP_H_
diff --git a/tensorflow/core/kernels/padding_fifo_queue.h b/tensorflow/core/kernels/padding_fifo_queue.h
index 9d7c935068..b86b03c8f0 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.h
+++ b/tensorflow/core/kernels/padding_fifo_queue.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_PADDING_FIFO_QUEUE_H_
-#define TENSORFLOW_KERNELS_PADDING_FIFO_QUEUE_H_
+#ifndef TENSORFLOW_CORE_KERNELS_PADDING_FIFO_QUEUE_H_
+#define TENSORFLOW_CORE_KERNELS_PADDING_FIFO_QUEUE_H_
 
 #include <deque>
 #include <vector>
@@ -86,4 +86,4 @@ class PaddingFIFOQueue : public FIFOQueue {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_PADDING_FIFO_QUEUE_H_
+#endif  // TENSORFLOW_CORE_KERNELS_PADDING_FIFO_QUEUE_H_
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index 0ab9ff9f65..aa70ee06f5 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -47,7 +47,7 @@ using random::PhiloxRandom;
 
 template <typename T>
 struct TruncatedNormalFunctor<CPUDevice, T> {
-  static const int kMaxIterations = 100;
+  static const int kMaxIterations = 1000;
 
   void operator()(OpKernelContext* ctx, const CPUDevice& d, int64 num_batches,
                   int64 samples_per_batch, int64 num_elements,
@@ -124,6 +124,7 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
                                (normMin * (normMin - sqrtFactor)) / T(4)) /
             (normMin + sqrtFactor);
         const T diff = normMax - normMin;
+
         if (diff < cutoff) {
           // Sample from a uniform distribution on [normMin, normMax].
 
@@ -143,15 +144,20 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
 
             const auto u = dist(&gen_copy);
             for (int i = 0; i < size; i++) {
-              if (u[i] <= Eigen::numext::exp(g[i]) ||
-                  numIterations + 1 >= kMaxIterations) {
+              auto accept = u[i] <= Eigen::numext::exp(g[i]);
+              if (accept || numIterations + 1 >= kMaxIterations) {
                 // Accept the sample z.
                 // If we run out of iterations, just use the current uniform
-                // sample. Emperically, the probability of accepting each sample
-                // is at least 50% for typical inputs, so we will always accept
-                // by 100 iterations.
-                // This introduces a slight inaccuracy when at least one bound
-                // is large, minval is negative and maxval is positive.
+                // sample, but emit a warning.
+                // TODO(jjhunt) For small entropies (relative to the bounds),
+                // this sampler is poor and may take many iterations since
+                // the proposal distribution is the uniform distribution
+                // U(lower_bound, upper_bound).
+                if (!accept) {
+                  LOG(WARNING) << "TruncatedNormal uniform rejection sampler "
+                               << "exceeded max iterations. Sample may contain "
+                               << "outliers.";
+                }
                 output(sample) = z[i] * stddev + mean;
                 sample++;
                 if (sample >= limit_sample) {
@@ -181,8 +187,13 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
               const T g = Eigen::numext::exp(-x * x / T(2.0));
               const T u = rand[i];
               i++;
-              if ((u <= g && z < normMax) ||
-                  numIterations + 1 >= kMaxIterations) {
+              auto accept = (u <= g && z < normMax);
+              if (accept || numIterations + 1 >= kMaxIterations) {
+                if (!accept) {
+                  LOG(WARNING) << "TruncatedNormal exponential distribution "
+                               << "rejection sampler exceeds max iterations. "
+                               << "Sample may contain outliers.";
+                }
                 output(sample) = z * stddev + mean;
                 sample++;
                 if (sample >= limit_sample) {
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.h b/tensorflow/core/kernels/parameterized_truncated_normal_op.h
index cc801eb810..2e54db31fe 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.h
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_
-#define TENSORFLOW_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_
+#define TENSORFLOW_CORE_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_
 
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
@@ -49,4 +49,4 @@ struct TruncatedNormalFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index 661d47d925..5b80a962bc 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -190,7 +190,7 @@ __global__ void __launch_bounds__(1024)
 // Partial specialization for GPU
 template <typename T>
 struct TruncatedNormalFunctor<GPUDevice, T> {
-  static const int kMaxIterations = 100;
+  static const int kMaxIterations = 1000;
 
   void operator()(OpKernelContext* ctx, const GPUDevice& d, int64 num_batches,
                   int64 samples_per_batch, int64 num_elements,
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 8db78f9784..876a1704c7 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -98,8 +98,7 @@ class PartitionedCallOp : public AsyncOpKernel {
                           done);
         auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
         CopyGraph(*fbody->graph, graph.get());
-        OP_REQUIRES_OK_ASYNC(ctx, PropagateInheritedDevices(graph.get(), args),
-                             done);
+        OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
 
         DeviceSet device_set;
         for (auto d : lib->device_mgr()->ListDevices()) {
@@ -163,15 +162,10 @@ class PartitionedCallOp : public AsyncOpKernel {
                     std::vector<AllocatorAttributes>>
       ArgAndRetAllocAttrs;
 
-  // Propagates device annotations from the outer graph to the function body.
-  //
   // Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the
   // corresponding resource lives. This ensures that the Placer assigns ops that
-  // access these resources to the appropriate devices. Additionally, places
-  // nodes that are unadorned with device annotations onto PartitiondCallOp's
-  // device. This lets call-site device annotations influence the execution
-  // of the function.
-  Status PropagateInheritedDevices(Graph* graph, const OpInputList& args) {
+  // access these resources to the appropriate devices.
+  Status PinResourceArgs(Graph* graph, const OpInputList& args) {
     for (Node* node : graph->op_nodes()) {
       string node_type = node->type_string();
       if (node_type == FunctionLibraryDefinition::kArgOp) {
@@ -184,18 +178,6 @@ class PartitionedCallOp : public AsyncOpKernel {
           ResourceHandle handle = args[index].flat<ResourceHandle>()(0);
           node->set_assigned_device_name(handle.device());
         }
-      } else if (node_type != FunctionLibraryDefinition::kRetOp) {
-        // All non-RetVal nodes that weren't explicitly placed by the user
-        // inherit PartitionedCallOp's device. RetVal placement is inferred by
-        // the placer, to avoid forcing the function's outputs through a single
-        // device.
-        //
-        // TODO(b/112166045): Plumb the original requested device into this
-        // OpKernel (this->requested_device() isn't reliable), and merge it
-        // with node->requested_device() if possible.
-        if (node->requested_device().empty()) {
-          node->set_requested_device(local_device_name_);
-        }
       }
     }
     return Status::OK();
diff --git a/tensorflow/core/kernels/pooling_ops_3d.h b/tensorflow/core/kernels/pooling_ops_3d.h
index d1be3ba407..319b17397e 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.h
+++ b/tensorflow/core/kernels/pooling_ops_3d.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
-#define TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
+#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_H_
+#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/padding.h"
@@ -77,4 +77,4 @@ struct Pool3dParameters {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_POOLING_OPS_3D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_H_
diff --git a/tensorflow/core/kernels/pooling_ops_3d_gpu.h b/tensorflow/core/kernels/pooling_ops_3d_gpu.h
index 350b1b6732..2c3681455e 100644
--- a/tensorflow/core/kernels/pooling_ops_3d_gpu.h
+++ b/tensorflow/core/kernels/pooling_ops_3d_gpu.h
@@ -17,8 +17,8 @@ limitations under the License.
 #error This file must only be included when building with Cuda support
 #endif
 
-#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
-#define TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_GPU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_
 
 #define EIGEN_USE_GPU
 
@@ -45,4 +45,4 @@ struct MaxPool3dGradBackward {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OP_3D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index e9265551e3..dda2c80c49 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
-#define TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
+#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
+#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
 
 #include <vector>
 
@@ -605,4 +605,4 @@ void SpatialAvgPool(OpKernelContext* context, Tensor* output,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_
+#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
diff --git a/tensorflow/core/kernels/priority_queue.h b/tensorflow/core/kernels/priority_queue.h
index ff168df449..8e69b5b699 100644
--- a/tensorflow/core/kernels/priority_queue.h
+++ b/tensorflow/core/kernels/priority_queue.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_PRIORITY_QUEUE_H_
-#define TENSORFLOW_KERNELS_PRIORITY_QUEUE_H_
+#ifndef TENSORFLOW_CORE_KERNELS_PRIORITY_QUEUE_H_
+#define TENSORFLOW_CORE_KERNELS_PRIORITY_QUEUE_H_
 
 #include <deque>
 #include <queue>
@@ -90,4 +90,4 @@ class PriorityQueue
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_PRIORITY_QUEUE_H_
+#endif  // TENSORFLOW_CORE_KERNELS_PRIORITY_QUEUE_H_
diff --git a/tensorflow/core/kernels/qr_op_impl.h b/tensorflow/core/kernels/qr_op_impl.h
index 0552c034d2..535df9d160 100644
--- a/tensorflow/core/kernels/qr_op_impl.h
+++ b/tensorflow/core/kernels/qr_op_impl.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_QR_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_QR_OP_IMPL_H_
+
 // See docs in ../ops/linalg_ops.cc.
 //
 // This header file is used by the individual qr_*op*.cc files for registering
@@ -292,6 +295,8 @@ class QrOpGpu : public AsyncOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(QrOpGpu);
 };
 
-#endif
+#endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_QR_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/random_op.h b/tensorflow/core/kernels/random_op.h
index 97bcaf1a49..d313a021dd 100644
--- a/tensorflow/core/kernels/random_op.h
+++ b/tensorflow/core/kernels/random_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_RANDOM_OP_H_
-#define TENSORFLOW_KERNELS_RANDOM_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_OP_H_
+#define TENSORFLOW_CORE_KERNELS_RANDOM_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/random/random_distributions.h"
@@ -69,4 +69,4 @@ struct FillPhiloxRandom<SYCLDevice, Distribution> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_RANDOM_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RANDOM_OP_H_
diff --git a/tensorflow/core/kernels/random_poisson_op.h b/tensorflow/core/kernels/random_poisson_op.h
index 4e9fd62520..62ae01c16c 100644
--- a/tensorflow/core/kernels/random_poisson_op.h
+++ b/tensorflow/core/kernels/random_poisson_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_RANDOM_POISSON_OP_H_
-#define TENSORFLOW_KERNELS_RANDOM_POISSON_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_POISSON_OP_H_
+#define TENSORFLOW_CORE_KERNELS_RANDOM_POISSON_OP_H_
 
 namespace tensorflow {
 
@@ -28,4 +28,4 @@ struct PoissonFunctor;
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_RANDOM_POISSON_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RANDOM_POISSON_OP_H_
diff --git a/tensorflow/core/kernels/range_sampler.h b/tensorflow/core/kernels/range_sampler.h
index 3010666598..ed160adfb4 100644
--- a/tensorflow/core/kernels/range_sampler.h
+++ b/tensorflow/core/kernels/range_sampler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
-#define TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_
+#define TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_
 
 #include <vector>
 
@@ -249,4 +249,4 @@ class FixedUnigramSampler : public RangeSampler {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_RANGE_SAMPLER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RANGE_SAMPLER_H_
diff --git a/tensorflow/core/kernels/record_yielder.h b/tensorflow/core/kernels/record_yielder.h
index 34817ad51b..159b43b4cd 100644
--- a/tensorflow/core/kernels/record_yielder.h
+++ b/tensorflow/core/kernels/record_yielder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_RECORD_YIELDER_H_
-#define TENSORFLOW_KERNELS_RECORD_YIELDER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RECORD_YIELDER_H_
+#define TENSORFLOW_CORE_KERNELS_RECORD_YIELDER_H_
 
 #include <atomic>
 #include <random>
@@ -157,4 +157,4 @@ class RecordYielder {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_RECORD_YIELDER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RECORD_YIELDER_H_
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 9af4cc23b6..88b3c2ac76 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_
+#define TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -1058,4 +1061,6 @@ struct ReduceFunctor<GPUDevice, Eigen::internal::OrReducer> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_REDUCTION_GPU_KERNELS_CU_H_
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
index e43d2828f3..eb264e0e5a 100644
--- a/tensorflow/core/kernels/reduction_ops.h
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_H_
-#define TENSORFLOW_KERNELS_REDUCTION_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_H_
 
 // Functor definitions for Reduction ops, must be compilable by nvcc.
 
@@ -79,4 +79,4 @@ struct ReduceFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_REDUCTION_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_H_
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index 03d6e82e01..d83e1c7d15 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -18,8 +18,8 @@ limitations under the License.
 // is a header file because we split the various reduction ops into their
 // own compilation units to get more parallelism in compilation.
 
-#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
-#define TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_
+#define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_
 
 #define EIGEN_USE_THREADS
 
@@ -277,4 +277,4 @@ struct ReduceFunctor<SYCLDevice, Reducer>
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_H_
diff --git a/tensorflow/core/kernels/regex_replace_op.cc b/tensorflow/core/kernels/regex_replace_op.cc
index 59ec854a79..a1b948891d 100644
--- a/tensorflow/core/kernels/regex_replace_op.cc
+++ b/tensorflow/core/kernels/regex_replace_op.cc
@@ -20,8 +20,43 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
+namespace {
+
+// Execute the specified regex using the given context.
+// Context requirements:
+//  - "input" string Tensor at input_index=0
+//  - "output" string Tensor at output_index=0
+Status InternalCompute(const RE2& match, const string& rewrite,
+                       const bool replace_global, OpKernelContext* ctx) {
+  const Tensor* input_tensor;
+  TF_RETURN_IF_ERROR(ctx->input("input", &input_tensor));
+  Tensor* output_tensor;
+  std::unique_ptr<Tensor> maybe_forwarded =
+      ctx->forward_input(0 /*input_index*/, 0 /*output_index*/,
+                         tensorflow::DT_STRING, input_tensor->shape(),
+                         ctx->input_memory_type(0), ctx->input_alloc_attr(0));
+  if (maybe_forwarded) {
+    output_tensor = maybe_forwarded.get();
+    TF_RETURN_IF_ERROR(ctx->set_output("output", *output_tensor));
+  } else {
+    TF_RETURN_IF_ERROR(
+        ctx->allocate_output("output", input_tensor->shape(), &output_tensor));
+    output_tensor->flat<string>() = input_tensor->flat<string>();
+  }
+  auto output_flat = output_tensor->flat<string>();
+  for (size_t i = 0; i < output_flat.size(); ++i) {
+    if (replace_global) {
+      RE2::GlobalReplace(&output_flat(i), match, rewrite);
+    } else {
+      RE2::Replace(&output_flat(i), match, rewrite);
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
 
 class RegexReplaceOp : public OpKernel {
  public:
@@ -30,10 +65,6 @@ class RegexReplaceOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    const Tensor* input_tensor;
-    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
-    const auto& input_flat = input_tensor->flat<string>();
-
     const Tensor* pattern_tensor;
     OP_REQUIRES_OK(ctx, ctx->input("pattern", &pattern_tensor));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(pattern_tensor->shape()),
@@ -51,19 +82,7 @@ class RegexReplaceOp : public OpKernel {
                 errors::InvalidArgument("Rewrite must be scalar, but received ",
                                         rewrite_tensor->shape().DebugString()));
     const string rewrite = rewrite_tensor->flat<string>()(0);
-
-    Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output("output", input_tensor->shape(),
-                                             &output_tensor));
-    auto output_flat = output_tensor->flat<string>();
-    for (size_t i = 0; i < input_flat.size(); ++i) {
-      output_flat(i) = input_flat(i);
-      if (replace_global_) {
-        RE2::GlobalReplace(&output_flat(i), match, rewrite);
-      } else {
-        RE2::Replace(&output_flat(i), match, rewrite);
-      }
-    }
+    OP_REQUIRES_OK(ctx, InternalCompute(match, rewrite, replace_global_, ctx));
   }
 
  private:
@@ -73,4 +92,31 @@ class RegexReplaceOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("RegexReplace").Device(DEVICE_CPU),
                         RegexReplaceOp);
 
+class StaticRegexReplaceOp : public OpKernel {
+ public:
+  explicit StaticRegexReplaceOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    string pattern;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("pattern", &pattern));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("rewrite", &rewrite_str_));
+    re_ = MakeUnique<RE2>(pattern);
+    OP_REQUIRES(ctx, re_->ok(),
+                errors::InvalidArgument("Invalid pattern: ", pattern,
+                                        ", error: ", re_->error()));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("replace_global", &replace_global_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    OP_REQUIRES_OK(ctx,
+                   InternalCompute(*re_, rewrite_str_, replace_global_, ctx));
+  }
+
+ private:
+  string rewrite_str_;
+  std::unique_ptr<RE2> re_;
+  bool replace_global_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("StaticRegexReplace").Device(DEVICE_CPU),
+                        StaticRegexReplaceOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/regex_replace_op_test.cc b/tensorflow/core/kernels/regex_replace_op_test.cc
new file mode 100644
index 0000000000..9691d4a89f
--- /dev/null
+++ b/tensorflow/core/kernels/regex_replace_op_test.cc
@@ -0,0 +1,137 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+// Test data from the TensorFlow README.md.
+const char* lines[] = {
+    "**TensorFlow** is an open source software library for numerical "
+    "computation using data flow graphs.",
+    "The graph nodes represent mathematical operations, while the graph edges "
+    "represent the multidimensional data arrays (tensors) that flow between "
+    "them.",
+    "This flexible architecture enables you to deploy computation to one or "
+    "more CPUs or GPUs in a desktop, server, or mobile device without "
+    "rewriting code.",
+    "TensorFlow also includes "
+    "[TensorBoard](https://www.tensorflow.org/guide/"
+    "summaries_and_tensorboard), a data visualization toolkit.",
+    "TensorFlow was originally developed by researchers and engineers working "
+    "on the Google Brain team within Google's Machine Intelligence Research "
+    "organization for the purposes of conducting machine learning and deep "
+    "neural networks research.",
+    "The system is general enough to be applicable in a wide variety of other "
+    "domains, as well.",
+    "TensorFlow provides stable Python API and C APIs as well as without API "
+    "backwards compatibility guarantee like C++, Go, Java, JavaScript and "
+    "Swift."};
+
+const char kRegExPattern[] = "\\p{P}";
+const char kRewrite[] = " ";
+
+Tensor GetTestTensor(int batch) {
+  const int sz = TF_ARRAYSIZE(lines);
+  Tensor t(DT_STRING, {batch});
+  auto s = t.flat<string>();
+  for (int i = 0; i < batch; ++i) {
+    s(i) = lines[i % sz];
+  }
+  return t;
+}
+
+Graph* SetupRegexReplaceGraph(const Tensor& input, const string& input_pattern,
+                              const string& input_rewrite) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor pattern(DT_STRING, TensorShape({}));
+  pattern.flat<string>().setConstant(input_pattern);
+  Tensor rewrite(DT_STRING, TensorShape({}));
+  rewrite.flat<string>().setConstant(input_rewrite);
+
+  TF_CHECK_OK(NodeBuilder("regex_replace_op", "RegexReplace")
+                  .Input(test::graph::Constant(g, input))
+                  .Input(test::graph::Constant(g, pattern))
+                  .Input(test::graph::Constant(g, rewrite))
+                  .Attr("replace_global", true)
+                  .Finalize(g, nullptr /* node */));
+  return g;
+}
+
+void BM_RegexReplace(int iters, int batch_size) {
+  testing::StopTiming();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  testing::UseRealTime();
+  Tensor input = GetTestTensor(batch_size);
+  Graph* g = SetupRegexReplaceGraph(input, kRegExPattern, kRewrite);
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+
+BENCHMARK(BM_RegexReplace)
+    ->Arg(1)
+    ->Arg(8)
+    ->Arg(16)
+    ->Arg(32)
+    ->Arg(64)
+    ->Arg(128)
+    ->Arg(256);
+
+Graph* SetupStaticGraph(const Tensor& input, const string& input_pattern,
+                        const string& rewrite) {
+  Graph* g = new Graph(OpRegistry::Global());
+
+  TF_CHECK_OK(NodeBuilder("static_regex_replace_op", "StaticRegexReplace")
+                  .Attr("pattern", input_pattern)
+                  .Attr("rewrite", rewrite)
+                  .Input(test::graph::Constant(g, input))
+                  .Attr("replace_global", true)
+                  .Finalize(g, nullptr /* node */));
+  return g;
+}
+void BM_StaticRegexReplace(int iters, int batch_size) {
+  testing::StopTiming();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  testing::UseRealTime();
+  Tensor input = GetTestTensor(batch_size);
+  Graph* g = SetupStaticGraph(input, kRegExPattern, kRewrite);
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+
+BENCHMARK(BM_StaticRegexReplace)
+    ->Arg(1)
+    ->Arg(8)
+    ->Arg(16)
+    ->Arg(32)
+    ->Arg(64)
+    ->Arg(128)
+    ->Arg(256);
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
index e712b02bd7..4775deeb61 100644
--- a/tensorflow/core/kernels/relu_op.h
+++ b/tensorflow/core/kernels/relu_op.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 
-#ifndef TENSORFLOW_KERNELS_RELU_OP_H_
-#define TENSORFLOW_KERNELS_RELU_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RELU_OP_H_
+#define TENSORFLOW_CORE_KERNELS_RELU_OP_H_
 
 #define EIGEN_USE_THREADS
 
@@ -219,4 +219,4 @@ void SeluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
 
 #undef EIGEN_USE_THREADS
 
-#endif  // TENSORFLOW_KERNELS_RELU_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RELU_OP_H_
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index 3bc5ba8a50..e564da335a 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_
-#define TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RELU_OP_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_RELU_OP_FUNCTOR_H_
 // Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -168,4 +168,4 @@ struct SeluGrad {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_RELU_OP_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RELU_OP_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/reshape_op.h b/tensorflow/core/kernels/reshape_op.h
index 5db2d148b9..7458ac75ca 100644
--- a/tensorflow/core/kernels/reshape_op.h
+++ b/tensorflow/core/kernels/reshape_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_RESHAPE_OP_H_
-#define TENSORFLOW_KERNELS_RESHAPE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RESHAPE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_RESHAPE_OP_H_
 
 #include <memory>
 #include "tensorflow/core/framework/op_kernel.h"
@@ -121,4 +121,4 @@ class ReshapeOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_RESHAPE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RESHAPE_OP_H_
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
index dde59e8e74..f10c9a19a7 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -277,13 +277,13 @@ struct ResizeBilinearGrad<CPUDevice, T> {
                   typename TTypes<float, 4>::ConstTensor input_grad,
                   const float height_scale, const float width_scale,
                   typename TTypes<T, 4>::Tensor output_grad) {
-    const int batch = output_grad.dimension(0);
-    const int64 original_height = output_grad.dimension(1);
-    const int64 original_width = output_grad.dimension(2);
-    const int channels = output_grad.dimension(3);
+    const Eigen::Index batch = output_grad.dimension(0);
+    const Eigen::Index original_height = output_grad.dimension(1);
+    const Eigen::Index original_width = output_grad.dimension(2);
+    const Eigen::Index channels = output_grad.dimension(3);
 
-    const int64 resized_height = input_grad.dimension(1);
-    const int64 resized_width = input_grad.dimension(2);
+    const Eigen::Index resized_height = input_grad.dimension(1);
+    const Eigen::Index resized_width = input_grad.dimension(2);
 
     output_grad.setZero();
 
@@ -294,22 +294,24 @@ struct ResizeBilinearGrad<CPUDevice, T> {
     //                       +  top_right * (1 - y) * x
     //                       +  bottom_left * y * (1 - x)
     //                       +  bottom_right * y * x
-    for (int64 b = 0; b < batch; ++b) {
-      for (int64 y = 0; y < resized_height; ++y) {
+    for (Eigen::Index b = 0; b < batch; ++b) {
+      for (Eigen::Index y = 0; y < resized_height; ++y) {
         const float in_y = y * height_scale;
-        const int64 top_y_index = static_cast<int64>(floorf(in_y));
-        const int64 bottom_y_index =
-            std::min(static_cast<int64>(ceilf(in_y)), original_height - 1);
+        const Eigen::Index top_y_index =
+            static_cast<Eigen::Index>(floorf(in_y));
+        const Eigen::Index bottom_y_index = std::min(
+            static_cast<Eigen::Index>(ceilf(in_y)), original_height - 1);
         const float y_lerp = in_y - top_y_index;
         const float inverse_y_lerp = (1.0f - y_lerp);
-        for (int64 x = 0; x < resized_width; ++x) {
+        for (Eigen::Index x = 0; x < resized_width; ++x) {
           const float in_x = x * width_scale;
-          const int64 left_x_index = static_cast<int64>(floorf(in_x));
-          const int64 right_x_index =
-              std::min(static_cast<int64>(ceilf(in_x)), original_width - 1);
+          const Eigen::Index left_x_index =
+              static_cast<Eigen::Index>(floorf(in_x));
+          const Eigen::Index right_x_index = std::min(
+              static_cast<Eigen::Index>(ceilf(in_x)), original_width - 1);
           const float x_lerp = in_x - left_x_index;
           const float inverse_x_lerp = (1.0f - x_lerp);
-          for (int64 c = 0; c < channels; ++c) {
+          for (Eigen::Index c = 0; c < channels; ++c) {
             output_grad(b, top_y_index, left_x_index, c) +=
                 T(input_grad(b, y, x, c) * inverse_y_lerp * inverse_x_lerp);
             output_grad(b, top_y_index, right_x_index, c) +=
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
index 8ec526c2b2..e985d3e5a5 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -88,25 +88,27 @@ struct ResizeNearestNeighbor<CPUDevice, T, align_corners> {
   bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
                   typename TTypes<T, 4>::Tensor output) {
-    const int batch_size = input.dimension(0);
-    const int64 in_height = input.dimension(1);
-    const int64 in_width = input.dimension(2);
-    const int channels = input.dimension(3);
-
-    const int64 out_height = output.dimension(1);
-    const int64 out_width = output.dimension(2);
-
-    for (int b = 0; b < batch_size; ++b) {
-      for (int y = 0; y < out_height; ++y) {
-        const int64 in_y = std::min(
-            (align_corners) ? static_cast<int64>(roundf(y * height_scale))
-                            : static_cast<int64>(floorf(y * height_scale)),
-            in_height - 1);
-        for (int x = 0; x < out_width; ++x) {
-          const int64 in_x = std::min(
-              (align_corners) ? static_cast<int64>(roundf(x * width_scale))
-                              : static_cast<int64>(floorf(x * width_scale)),
-              in_width - 1);
+    const Eigen::Index batch_size = input.dimension(0);
+    const Eigen::Index in_height = input.dimension(1);
+    const Eigen::Index in_width = input.dimension(2);
+    const Eigen::Index channels = input.dimension(3);
+
+    const Eigen::Index out_height = output.dimension(1);
+    const Eigen::Index out_width = output.dimension(2);
+
+    for (Eigen::Index b = 0; b < batch_size; ++b) {
+      for (Eigen::Index y = 0; y < out_height; ++y) {
+        const Eigen::Index in_y =
+            std::min((align_corners)
+                         ? static_cast<Eigen::Index>(roundf(y * height_scale))
+                         : static_cast<Eigen::Index>(floorf(y * height_scale)),
+                     in_height - 1);
+        for (Eigen::Index x = 0; x < out_width; ++x) {
+          const Eigen::Index in_x =
+              std::min((align_corners)
+                           ? static_cast<Eigen::Index>(roundf(x * width_scale))
+                           : static_cast<Eigen::Index>(floorf(x * width_scale)),
+                       in_width - 1);
           std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0));
         }
       }
@@ -199,28 +201,29 @@ struct ResizeNearestNeighborGrad<CPUDevice, T, align_corners> {
   bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
                   const float height_scale, const float width_scale,
                   typename TTypes<T, 4>::Tensor output) {
-    const int batch_size = input.dimension(0);
-    const int64 in_height = input.dimension(1);
-    const int64 in_width = input.dimension(2);
-    const int channels = input.dimension(3);
+    const Eigen::Index batch_size = input.dimension(0);
+    const Eigen::Index in_height = input.dimension(1);
+    const Eigen::Index in_width = input.dimension(2);
+    const Eigen::Index channels = input.dimension(3);
 
-    const int64 out_height = output.dimension(1);
-    const int64 out_width = output.dimension(2);
+    const Eigen::Index out_height = output.dimension(1);
+    const Eigen::Index out_width = output.dimension(2);
 
     output.setZero();
 
-    for (int y = 0; y < in_height; ++y) {
-      const int64 out_y = std::min(
-          (align_corners) ? static_cast<int64>(roundf(y * height_scale))
-                          : static_cast<int64>(floorf(y * height_scale)),
+    for (Eigen::Index y = 0; y < in_height; ++y) {
+      const Eigen::Index out_y = std::min(
+          (align_corners) ? static_cast<Eigen::Index>(roundf(y * height_scale))
+                          : static_cast<Eigen::Index>(floorf(y * height_scale)),
           out_height - 1);
-      for (int x = 0; x < in_width; ++x) {
-        const int64 out_x = std::min(
-            (align_corners) ? static_cast<int64>(roundf(x * width_scale))
-                            : static_cast<int64>(floorf(x * width_scale)),
-            out_width - 1);
-        for (int b = 0; b < batch_size; ++b) {
-          for (int c = 0; c < channels; ++c) {
+      for (Eigen::Index x = 0; x < in_width; ++x) {
+        const Eigen::Index out_x =
+            std::min((align_corners)
+                         ? static_cast<Eigen::Index>(roundf(x * width_scale))
+                         : static_cast<Eigen::Index>(floorf(x * width_scale)),
+                     out_width - 1);
+        for (Eigen::Index b = 0; b < batch_size; ++b) {
+          for (Eigen::Index c = 0; c < channels; ++c) {
             output(b, out_y, out_x, c) += input(b, y, x, c);
           }
         }
diff --git a/tensorflow/core/kernels/reverse_op.h b/tensorflow/core/kernels/reverse_op.h
index 934f0277a9..44e7967c5d 100644
--- a/tensorflow/core/kernels/reverse_op.h
+++ b/tensorflow/core/kernels/reverse_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_REVERSE_OP_H_
-#define TENSORFLOW_KERNELS_REVERSE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REVERSE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_REVERSE_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -45,4 +45,4 @@ struct Reverse<Device, T, 0> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_MIRROR_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REVERSE_OP_H_
diff --git a/tensorflow/core/kernels/reverse_sequence_op.h b/tensorflow/core/kernels/reverse_sequence_op.h
index 8ccd32ea16..d6ba2781a9 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.h
+++ b/tensorflow/core/kernels/reverse_sequence_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
-#define TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REVERSE_SEQUENCE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_REVERSE_SEQUENCE_OP_H_
 // Generator definition for ReverseSequenceOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -75,4 +75,4 @@ struct ReverseSequence {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REVERSE_SEQUENCE_OP_H_
diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h
index 5b74b586e8..be7f4b889e 100644
--- a/tensorflow/core/kernels/save_restore_tensor.h
+++ b/tensorflow/core/kernels/save_restore_tensor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SAVE_RESTORE_TENSOR_H_
-#define TENSORFLOW_KERNELS_SAVE_RESTORE_TENSOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_H_
+#define TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_H_
 
 #include "tensorflow/core/util/tensor_slice_reader.h"
 #include "tensorflow/core/util/tensor_slice_writer.h"
@@ -70,4 +70,4 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SAVE_RESTORE_TENSOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_H_
diff --git a/tensorflow/core/kernels/scan_ops.h b/tensorflow/core/kernels/scan_ops.h
index 1a1f71d722..13831bb377 100644
--- a/tensorflow/core/kernels/scan_ops.h
+++ b/tensorflow/core/kernels/scan_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SCAN_OPS_H_
-#define TENSORFLOW_KERNELS_SCAN_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SCAN_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_SCAN_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -43,4 +43,4 @@ struct Scan {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SCAN_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SCAN_OPS_H_
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index ebaa2bd9c6..2d43bde23f 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SCATTER_FUNCTOR_H_
-#define TENSORFLOW_KERNELS_SCATTER_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_H_
 
 #include <type_traits>
 
@@ -488,4 +488,4 @@ struct ScatterScalarFunctorSYCL {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SCATTER_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index 70809e4dcf..057755a05c 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_
-#define TENSORFLOW_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_
+#define TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_
 
 #if GOOGLE_CUDA
 
@@ -161,4 +161,4 @@ struct ScatterScalarFunctor<GPUDevice, T, Index, op> {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SCATTER_FUNCTOR_GPU_CU_H_
diff --git a/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h b/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
index 271dd2c485..b5274f8788 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
+++ b/tensorflow/core/kernels/self_adjoint_eig_v2_op_impl.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_SELF_ADJOINT_EIG_V2_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_SELF_ADJOINT_EIG_V2_OP_IMPL_H_
+
 // See docs in ../ops/linalg_ops.cc.
 
 #include "third_party/eigen3/Eigen/Core"
@@ -85,3 +88,5 @@ class SelfAdjointEigV2Op : public LinearAlgebraOp<Scalar> {
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SELF_ADJOINT_EIG_V2_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h
index 1ff8eff13f..223854de13 100644
--- a/tensorflow/core/kernels/sendrecv_ops.h
+++ b/tensorflow/core/kernels/sendrecv_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SENDRECV_OPS_H_
-#define TENSORFLOW_KERNELS_SENDRECV_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SENDRECV_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_SENDRECV_OPS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
@@ -49,4 +49,4 @@ class RecvOp : public AsyncOpKernel {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SENDRECV_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SENDRECV_OPS_H_
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 55be308901..7a50f158af 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SHAPE_OPS_H_
-#define TENSORFLOW_KERNELS_SHAPE_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SHAPE_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_SHAPE_OPS_H_
 
 #include <limits>
 #include <unordered_set>
@@ -154,6 +154,9 @@ class ExpandDimsOp : public OpKernel {
     OP_REQUIRES(ctx, ctx->input(0).dtype() != DT_VARIANT,
                 errors::InvalidArgument("ExpandDims on Variant not supported"));
 
+    OP_REQUIRES(
+        ctx, (ctx->input(1).NumElements() == 1),
+        errors::InvalidArgument("'dim' must be a tensor with a single value"));
     Tdim dim = ctx->input(1).flat<Tdim>()(0);
     OP_REQUIRES(
         ctx, (dim >= -1 - ctx->input(0).dims() && dim <= ctx->input(0).dims()),
@@ -236,9 +239,8 @@ class SqueezeOp : public OpKernel {
         if (wrapped_squeeze_dims.count(i) > 0) {
           OP_REQUIRES(ctx, existing_dim == 1,
                       errors::InvalidArgument(
-                          "Tried to explicitly squeeze "
-                          "dimension ",
-                          i, " but dimension was not 1: ", existing_dim));
+                          "Can not squeeze dim[", i,
+                          "], expected a dimension of 1, got ", existing_dim));
         } else {
           // This dimension is not being squeezed.
           new_shape.push_back(existing_dim);
@@ -272,4 +274,4 @@ class SqueezeOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SHAPE_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SHAPE_OPS_H_
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
index db7eded745..1d662f6362 100644
--- a/tensorflow/core/kernels/slice_op.h
+++ b/tensorflow/core/kernels/slice_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SLICE_OP_H_
-#define TENSORFLOW_KERNELS_SLICE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SLICE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SLICE_OP_H_
 
 // Functor definition for SliceOp, must be compilable by nvcc.
 
@@ -51,4 +51,4 @@ struct Slice {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SLICE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SLICE_OP_H_
diff --git a/tensorflow/core/kernels/smooth-hinge-loss.h b/tensorflow/core/kernels/smooth-hinge-loss.h
index 5074ad0795..d51f5c130e 100644
--- a/tensorflow/core/kernels/smooth-hinge-loss.h
+++ b/tensorflow/core/kernels/smooth-hinge-loss.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SMOOTH_HINGE_LOSS_H_
-#define TENSORFLOW_KERNELS_SMOOTH_HINGE_LOSS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SMOOTH_HINGE_LOSS_H_
+#define TENSORFLOW_CORE_KERNELS_SMOOTH_HINGE_LOSS_H_
 
 #include <limits>
 
@@ -110,5 +110,5 @@ class SmoothHingeLossUpdater : public DualLossUpdater {
 
 }  // namespace tensorflow
 
-#endif
+#endif  // TENSORFLOW_CORE_KERNELS_SMOOTH_HINGE_LOSS_H_
 // TENSORFLOW_KERNELS_SMOOTH_HINGE_LOSS_H_
diff --git a/tensorflow/core/kernels/snapshot_op.h b/tensorflow/core/kernels/snapshot_op.h
index a18065d42b..02d492988e 100644
--- a/tensorflow/core/kernels/snapshot_op.h
+++ b/tensorflow/core/kernels/snapshot_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
-#define TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SNAPSHOT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SNAPSHOT_OP_H_
 
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
@@ -41,4 +41,4 @@ struct Snapshot {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SNAPSHOT_OP_H_
diff --git a/tensorflow/core/kernels/softmax_op_functor.h b/tensorflow/core/kernels/softmax_op_functor.h
index d3a267ed87..c8bc1ad3bb 100644
--- a/tensorflow/core/kernels/softmax_op_functor.h
+++ b/tensorflow/core/kernels/softmax_op_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
-#define TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SOFTMAX_OP_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_SOFTMAX_OP_FUNCTOR_H_
 // Functor definition for SoftmaxOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -98,4 +98,4 @@ struct SoftmaxEigenImpl {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SOFTMAX_OP_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SOFTMAX_OP_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc
index 494a83ed14..d3fc0e1461 100644
--- a/tensorflow/core/kernels/softplus_op.cc
+++ b/tensorflow/core/kernels/softplus_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/warn_about_ints.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -35,9 +34,7 @@ template <typename Device, typename T>
 class SoftplusOp : public UnaryElementWiseOp<T, SoftplusOp<Device, T>> {
  public:
   explicit SoftplusOp(OpKernelConstruction* context)
-      : UnaryElementWiseOp<T, SoftplusOp<Device, T>>(context) {
-    WarnAboutInts(context);
-  }
+      : UnaryElementWiseOp<T, SoftplusOp<Device, T>>(context) {}
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     functor::Softplus<Device, T> functor;
@@ -51,9 +48,7 @@ class SoftplusGradOp
     : public BinaryElementWiseOp<T, SoftplusGradOp<Device, T>> {
  public:
   explicit SoftplusGradOp(OpKernelConstruction* context)
-      : BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>(context) {
-    WarnAboutInts(context);
-  }
+      : BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>(context) {}
 
   void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                          const Tensor& a, Tensor* output);
@@ -89,7 +84,7 @@ void SoftplusGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
       Name("SoftplusGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
       SoftplusGradOp<CPUDevice, type>);
 
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/softplus_op.h b/tensorflow/core/kernels/softplus_op.h
index e17e175d41..8c083ba158 100644
--- a/tensorflow/core/kernels/softplus_op.h
+++ b/tensorflow/core/kernels/softplus_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
-#define TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SOFTPLUS_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SOFTPLUS_OP_H_
 // Functor definition for SoftplusOp and SoftplusGradOp, must be compilable by
 // nvcc.
 
@@ -73,4 +73,4 @@ struct SoftplusGrad {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SOFTPLUS_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SOFTPLUS_OP_H_
diff --git a/tensorflow/core/kernels/softsign_op.cc b/tensorflow/core/kernels/softsign_op.cc
index 00ee649b17..d691f15651 100644
--- a/tensorflow/core/kernels/softsign_op.cc
+++ b/tensorflow/core/kernels/softsign_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/warn_about_ints.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -35,9 +34,7 @@ template <typename Device, typename T>
 class SoftsignOp : public UnaryElementWiseOp<T, SoftsignOp<Device, T>> {
  public:
   explicit SoftsignOp(OpKernelConstruction* context)
-      : UnaryElementWiseOp<T, SoftsignOp<Device, T>>(context) {
-    WarnAboutInts(context);
-  }
+      : UnaryElementWiseOp<T, SoftsignOp<Device, T>>(context) {}
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     functor::Softsign<Device, T> functor;
@@ -51,9 +48,7 @@ class SoftsignGradOp
     : public BinaryElementWiseOp<T, SoftsignGradOp<Device, T>> {
  public:
   explicit SoftsignGradOp(OpKernelConstruction* context)
-      : BinaryElementWiseOp<T, SoftsignGradOp<Device, T>>(context) {
-    WarnAboutInts(context);
-  }
+      : BinaryElementWiseOp<T, SoftsignGradOp<Device, T>>(context) {}
 
   void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                          const Tensor& a, Tensor* output);
@@ -90,7 +85,7 @@ void SoftsignGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
       Name("SoftsignGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
       SoftsignGradOp<CPUDevice, type>);
 
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
+TF_CALL_FLOAT_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/softsign_op.h b/tensorflow/core/kernels/softsign_op.h
index c2ababf697..61ff6eeede 100644
--- a/tensorflow/core/kernels/softsign_op.h
+++ b/tensorflow/core/kernels/softsign_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SOFTSIGN_OP_H_
-#define TENSORFLOW_KERNELS_SOFTSIGN_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SOFTSIGN_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SOFTSIGN_OP_H_
 // Functor definition for SoftsignOp and SoftsignGradOp, must be compilable by
 // nvcc.
 
@@ -57,4 +57,4 @@ struct SoftsignGrad {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SOFTSIGN_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SOFTSIGN_OP_H_
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator.h b/tensorflow/core/kernels/sparse_conditional_accumulator.h
index 2c1bffbee4..11149c4d16 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator.h
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_
-#define TENSORFLOW_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_
 
 #include "tensorflow/core/kernels/typed_conditional_accumulator_base.h"
 
@@ -459,4 +459,4 @@ class SparseConditionalAccumulator
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_CONDITIONAL_ACCUMULATOR_H_
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index e89280724e..6b9db8f471 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_
-#define TENSORFLOW_KERNELS_SPARSE_MATMUL_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_MATMUL_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_MATMUL_OP_H_
 
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/byte_order.h"
@@ -465,4 +465,4 @@ EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
 #endif
 }  // namespace internal
 }  // namespace Eigen
-#endif
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_MATMUL_OP_H_
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_add_op.h b/tensorflow/core/kernels/sparse_tensor_dense_add_op.h
index 353cf0e519..c26ed5e874 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_add_op.h
+++ b/tensorflow/core/kernels/sparse_tensor_dense_add_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_
-#define TENSORFLOW_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -39,4 +39,4 @@ struct ScatterNdFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_ADD_OP_H_
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
index da13190494..d6dd2deca5 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
-#define TENSORFLOW_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -71,4 +71,4 @@ class MaybeAdjoint<MATRIX, true> {
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_TENSOR_DENSE_MATMUL_OP_H_
diff --git a/tensorflow/core/kernels/sparse_xent_op.h b/tensorflow/core/kernels/sparse_xent_op.h
index b5587aa9d7..6ba7931ab5 100644
--- a/tensorflow/core/kernels/sparse_xent_op.h
+++ b/tensorflow/core/kernels/sparse_xent_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_XENT_OP_H_
-#define TENSORFLOW_KERNELS_XENT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_XENT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_XENT_OP_H_
 // Functor definition for SparseXentOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -224,4 +224,4 @@ struct SparseXentEigenImpl {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_XENT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_XENT_OP_H_
diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h
index bc1fa28f8f..9d43a00822 100644
--- a/tensorflow/core/kernels/split_lib.h
+++ b/tensorflow/core/kernels/split_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SPLIT_LIB_H_
-#define TENSORFLOW_KERNELS_SPLIT_LIB_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPLIT_LIB_H_
+#define TENSORFLOW_CORE_KERNELS_SPLIT_LIB_H_
 // Functor definition for SplitOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -62,4 +62,4 @@ struct Split<Eigen::SyclDevice, T> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SPLIT_LIB_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPLIT_LIB_H_
diff --git a/tensorflow/core/kernels/squared-loss.h b/tensorflow/core/kernels/squared-loss.h
index 49e6db406e..d256a69350 100644
--- a/tensorflow/core/kernels/squared-loss.h
+++ b/tensorflow/core/kernels/squared-loss.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_SQUARED_LOSS_H_
-#define TENSORFLOW_KERNELS_SQUARED_LOSS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SQUARED_LOSS_H_
+#define TENSORFLOW_CORE_KERNELS_SQUARED_LOSS_H_
 
 #include "tensorflow/core/kernels/loss.h"
 
@@ -70,4 +70,4 @@ class SquaredLossUpdater : public DualLossUpdater {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SQUARED_LOSS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SQUARED_LOSS_H_
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 59fdc2262a..7b537fef5b 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -300,7 +300,8 @@ class StridedSliceAssignOp : public OpKernel {
     gtl::InlinedVector<int64, 4> end;
     gtl::InlinedVector<int64, 4> strides;
 
-    Tensor old_lhs;
+    Tensor* old_lhs = nullptr;
+    Tensor tmp;
     if (context->input_dtype(0) == DT_RESOURCE) {
       Var* v;
       OP_REQUIRES_OK(context,
@@ -308,29 +309,30 @@ class StridedSliceAssignOp : public OpKernel {
       mutex_lock ml(*v->mu());
       OP_REQUIRES_OK(context,
                      PrepareToUpdateVariable<Device, T>(context, v->tensor()));
-      old_lhs = *v->tensor();
-      OP_REQUIRES(context, old_lhs.dtype() == DataTypeToEnum<T>::value,
+      old_lhs = v->tensor();
+      OP_REQUIRES(context, old_lhs->dtype() == DataTypeToEnum<T>::value,
                   errors::InvalidArgument(
-                      "l-value dtype ", DataTypeString(old_lhs.dtype()),
+                      "l-value dtype ", DataTypeString(old_lhs->dtype()),
                       " does not match r-value dtype ",
                       DataTypeString(DataTypeToEnum<T>::value)));
     } else {
       context->forward_ref_input_to_ref_output(0, 0);
-      old_lhs = context->mutable_input(0, true);
+      tmp = context->mutable_input(0, true);
+      old_lhs = &tmp;
     }
 
     OP_REQUIRES_OK(
-        context,
-        ValidateStridedSliceOp(
-            &context->input(1), &context->input(2), context->input(3),
-            old_lhs.shape(), begin_mask, end_mask, ellipsis_mask, new_axis_mask,
-            shrink_axis_mask, &processing_shape, &final_shape, &is_identity,
-            &is_simple_slice, &slice_dim0, &begin, &end, &strides));
+        context, ValidateStridedSliceOp(
+                     &context->input(1), &context->input(2), context->input(3),
+                     old_lhs->shape(), begin_mask, end_mask, ellipsis_mask,
+                     new_axis_mask, shrink_axis_mask, &processing_shape,
+                     &final_shape, &is_identity, &is_simple_slice, &slice_dim0,
+                     &begin, &end, &strides));
 
     if (processing_shape.num_elements()) {
       const Tensor& input = context->input(4);
       TensorShape input_shape = input.shape();
-      TensorShape original_shape = old_lhs.shape();
+      TensorShape original_shape = old_lhs->shape();
       // TODO(aselle): This check is too strong, we only should need
       // input_shape to be broadcastable to final_shape
       OP_REQUIRES(
@@ -345,12 +347,12 @@ class StridedSliceAssignOp : public OpKernel {
       // scalar shape
 
 // Handle general dimensions
-#define HANDLE_DIM(NDIM)                                                 \
-  if (processing_dims == NDIM) {                                         \
-    HandleStridedSliceAssignCase<Device, T, NDIM>()(                     \
-        context, begin, end, strides, processing_shape, is_simple_slice, \
-        &old_lhs);                                                       \
-    return;                                                              \
+#define HANDLE_DIM(NDIM)                                                       \
+  if (processing_dims == NDIM) {                                               \
+    HandleStridedSliceAssignCase<Device, T, NDIM>()(context, begin, end,       \
+                                                    strides, processing_shape, \
+                                                    is_simple_slice, old_lhs); \
+    return;                                                                    \
   }
       HANDLE_DIM(0);
       HANDLE_DIM(1);
diff --git a/tensorflow/core/kernels/strided_slice_op.h b/tensorflow/core/kernels/strided_slice_op.h
index 2b58632298..86d105391d 100644
--- a/tensorflow/core/kernels/strided_slice_op.h
+++ b/tensorflow/core/kernels/strided_slice_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_STRIDED_SLICE_OP_H_
-#define TENSORFLOW_KERNELS_STRIDED_SLICE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_H_
 
 // Functor definition for StridedSliceOp, must be compilable by nvcc.
 
@@ -137,4 +137,4 @@ struct StridedSliceAssignScalar {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_SLICE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_H_
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 1c4472bb1a..099083b2ff 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_STRIDED_SLICE_OP_IMPL_H_
-#define TENSORFLOW_KERNELS_STRIDED_SLICE_OP_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_IMPL_H_
 
 // Functor definition for StridedSliceOp, must be compilable by nvcc.
 
@@ -313,4 +313,4 @@ DECLARE_FOR_N_SYCL(int64);
 }  // end namespace tensorflow
 
 #endif  // END STRIDED_SLICE_INSTANTIATE_DIM
-#endif  // TENSORFLOW_KERNELS_SLICE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/string_length_op.cc b/tensorflow/core/kernels/string_length_op.cc
new file mode 100644
index 0000000000..a6829b29d9
--- /dev/null
+++ b/tensorflow/core/kernels/string_length_op.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+namespace {
+
+class StringLengthOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+
+    Tensor* output;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+
+    auto src = input.flat<string>();
+    auto dst = output->flat<int32>();
+
+    for (int n = 0; n < src.size(); ++n) {
+      dst(n) = src(n).size();
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StringLength").Device(DEVICE_CPU),
+                        StringLengthOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc
index 26ab72f12e..3884370a6c 100644
--- a/tensorflow/core/kernels/string_split_op.cc
+++ b/tensorflow/core/kernels/string_split_op.cc
@@ -26,25 +26,81 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
-
 namespace {
+// Split input string `str` based on a character delimiter.
+// Returns a vector of StringPieces which are valid as long as input `str`
+// is valid.
+// Note: The single character delimiter is a common case and is implemented as
+// a series of finds in the input string, making it much more effcient than
+// SplitOnCharSet.
+template <typename Predicate>
+std::vector<StringPiece> SplitOnChar(const string& str, const char delim,
+                                     Predicate p) {
+  std::vector<StringPiece> result;
+  StringPiece text(str);
+  auto f = text.find(delim);
+  while (f != StringPiece::npos) {
+    StringPiece token = text.substr(0, f);
+    if (p(token)) {
+      result.emplace_back(token);
+    }
+    text.remove_prefix(f + 1);
+    f = text.find(delim);
+  }
+  if (p(text)) {
+    result.push_back(text);
+  }
+  return result;
+}
 
-std::vector<string> Split(const string& str, const string& delimiter,
-                          const bool skipEmpty) {
-  if (!delimiter.empty()) {
-    if (skipEmpty) {
-      return str_util::Split(str, delimiter, str_util::SkipEmpty());
+// Split input string `str` based on a set of character delimiters.
+// Returns a vector of StringPieces which are valid as long as input `str`
+// is valid.
+// Based on str_util::Split.
+template <typename Predicate>
+std::vector<StringPiece> SplitOnCharSet(const string& str,
+                                        const string& delim_set, Predicate p) {
+  std::vector<StringPiece> result;
+  StringPiece text(str);
+  StringPiece delims(delim_set);
+  size_t token_start = 0;
+  for (size_t i = 0; i < text.size() + 1; i++) {
+    if ((i == text.size()) || (delims.find(text[i]) != StringPiece::npos)) {
+      StringPiece token(text.data() + token_start, i - token_start);
+      if (p(token)) {
+        result.emplace_back(token);
+      }
+      token_start = i + 1;
     }
-    return str_util::Split(str, delimiter);
   }
-  std::vector<string> char_vector(str.size());
-  for (size_t i = 0; i < str.size(); ++i) {
-    char_vector[i] = str[i];
+  return result;
+}
+
+// Split input string `str` based on given delimiter.
+// Returns a vector of StringPieces which are valid as long as input `str`
+// is valid.
+template <typename Predicate>
+std::vector<StringPiece> Split(const string& str, const string& delimiter,
+                               Predicate predicate) {
+  if (str.empty()) {
+    return std::vector<StringPiece>();
+  }
+  if (delimiter.empty()) {
+    std::vector<StringPiece> result;
+    result.resize(str.size());
+    for (size_t i = 0; i < str.size(); ++i) {
+      result[i] = StringPiece(str.data() + i, 1);
+    }
+    return result;
   }
-  return char_vector;
+  if (delimiter.size() == 1) {
+    return SplitOnChar(str, delimiter[0], predicate);
+  }
+  return SplitOnCharSet(str, delimiter, predicate);
 }
 
-std::vector<string> SplitV2(const string& str, StringPiece sep, int maxsplit) {
+std::vector<StringPiece> SplitV2(const string& str, StringPiece sep,
+                                 int maxsplit) {
   // This SplitV2 method matches the behavior of python's str.split:
   //   If sep is given, consecutive delimiters are not grouped together
   //   and are deemed to delimit empty strings (for example, '1,,2'.split(',')
@@ -59,11 +115,11 @@ std::vector<string> SplitV2(const string& str, StringPiece sep, int maxsplit) {
   //   splitting an empty string or a string consisting of just whitespace
   //   with a None separator returns [].
 
-  std::vector<string> result;
+  std::vector<StringPiece> result;
 
   StringPiece text(str);
   if (maxsplit == 0) {
-    result.emplace_back(std::string(text));
+    result.emplace_back(text);
     return result;
   }
 
@@ -73,11 +129,11 @@ std::vector<string> SplitV2(const string& str, StringPiece sep, int maxsplit) {
     str_util::RemoveLeadingWhitespace(&text);
     int split = 0;
     while (str_util::ConsumeNonWhitespace(&text, &token)) {
-      result.emplace_back(std::string(token));
+      result.push_back(token);
       str_util::RemoveLeadingWhitespace(&text);
       ++split;
       if (maxsplit > 0 && split == maxsplit) {
-        result.emplace_back(std::string(text));
+        result.push_back(text);
         return result;
       }
     }
@@ -87,17 +143,17 @@ std::vector<string> SplitV2(const string& str, StringPiece sep, int maxsplit) {
   int split = 0;
   while (p != text.end()) {
     StringPiece token = text.substr(0, p - text.begin());
-    result.emplace_back(std::string(token));
+    result.push_back(token);
     text.remove_prefix(token.size());
     text.remove_prefix(sep.size());
     ++split;
     if (maxsplit > 0 && split == maxsplit) {
-      result.emplace_back(std::string(text));
+      result.push_back(StringPiece(text));
       return result;
     }
     p = std::search(text.begin(), text.end(), sep.begin(), sep.end());
   }
-  result.emplace_back(std::string(text));
+  result.push_back(text);
   return result;
 }
 
@@ -134,7 +190,7 @@ class StringSplitOp : public OpKernel {
     const auto delimiter_vec = delimiter_tensor->flat<string>();
     const string& delimiter = delimiter_vec(0);
     // Empty delimiter means split the input character by character.
-    std::vector<string> tokens;
+    std::vector<StringPiece> tokens;
     // Guess that we'll be unpacking a handful of tokens per example.
     static constexpr int kReserveSize = 4;
     tokens.reserve(batch_size * kReserveSize);
@@ -143,12 +199,15 @@ class StringSplitOp : public OpKernel {
     int64 max_num_entries = 0;
     std::vector<int64> num_indices(batch_size);
     for (int64 i = 0; i < batch_size; ++i) {
-      std::vector<string> parts = Split(input_vec(i), delimiter, skip_empty_);
+      std::vector<StringPiece> parts =
+          skip_empty_ ? Split(input_vec(i), delimiter, str_util::SkipEmpty())
+                      : Split(input_vec(i), delimiter, str_util::AllowEmpty());
       int64 n_entries = parts.size();
       num_indices[i] = n_entries;
       output_size += n_entries;
       max_num_entries = std::max(max_num_entries, n_entries);
-      tokens.insert(tokens.end(), parts.begin(), parts.end());
+      tokens.insert(tokens.end(), std::make_move_iterator(parts.begin()),
+                    std::make_move_iterator(parts.end()));
     }
 
     Tensor* sp_indices_t;
@@ -170,7 +229,7 @@ class StringSplitOp : public OpKernel {
       for (size_t j = 0; j < num_indices[i]; ++j) {
         sp_indices(c, 0) = i;
         sp_indices(c, 1) = j;
-        sp_tokens(c) = tokens[c];
+        sp_tokens(c).assign(tokens[c].data(), tokens[c].size());
         ++c;
       }
     }
@@ -204,7 +263,7 @@ class StringSplitV2Op : public OpKernel {
                                         sep_tensor->shape().DebugString()));
     const auto sep_vec = sep_tensor->flat<string>();
     StringPiece sep(sep_vec(0));
-    std::vector<string> tokens;
+    std::vector<StringPiece> tokens;
     // Guess that we'll be unpacking a handful of tokens per example.
     static constexpr int kReserveSize = 4;
     tokens.reserve(batch_size * kReserveSize);
@@ -213,7 +272,7 @@ class StringSplitV2Op : public OpKernel {
     int64 max_num_entries = 0;
     std::vector<int64> num_indices(batch_size);
     for (int64 i = 0; i < batch_size; ++i) {
-      std::vector<string> parts = SplitV2(input_vec(i), sep, maxsplit_);
+      std::vector<StringPiece> parts = SplitV2(input_vec(i), sep, maxsplit_);
       int64 n_entries = parts.size();
       num_indices[i] = n_entries;
       output_size += n_entries;
@@ -240,7 +299,7 @@ class StringSplitV2Op : public OpKernel {
       for (size_t j = 0; j < num_indices[i]; ++j) {
         sp_indices(c, 0) = i;
         sp_indices(c, 1) = j;
-        sp_tokens(c) = tokens[c];
+        sp_tokens(c).assign(tokens[c].data(), tokens[c].size());
         ++c;
       }
     }
diff --git a/tensorflow/core/kernels/string_split_op_test.cc b/tensorflow/core/kernels/string_split_op_test.cc
new file mode 100644
index 0000000000..58ad61adc8
--- /dev/null
+++ b/tensorflow/core/kernels/string_split_op_test.cc
@@ -0,0 +1,129 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+// Test data from the TensorFlow README.md.
+const char* lines[] = {
+    "**TensorFlow** is an open source software library for numerical "
+    "computation using data flow graphs.",
+    "The graph nodes represent mathematical operations, while the graph edges "
+    "represent the multidimensional data arrays (tensors) that flow between "
+    "them.",
+    "This flexible architecture enables you to deploy computation to one or "
+    "more CPUs or GPUs in a desktop, server, or mobile device without "
+    "rewriting code.",
+    "TensorFlow also includes "
+    "[TensorBoard](https://www.tensorflow.org/guide/"
+    "summaries_and_tensorboard), a data visualization toolkit.",
+    "TensorFlow was originally developed by researchers and engineers working "
+    "on the Google Brain team within Google's Machine Intelligence Research "
+    "organization for the purposes of conducting machine learning and deep "
+    "neural networks research.",
+    "The system is general enough to be applicable in a wide variety of other "
+    "domains, as well.",
+    "TensorFlow provides stable Python API and C APIs as well as without API "
+    "backwards compatibility guarantee like C++, Go, Java, JavaScript and "
+    "Swift."};
+
+Tensor GetTestTensor(int batch) {
+  const int sz = TF_ARRAYSIZE(lines);
+  Tensor t(DT_STRING, {batch});
+  auto s = t.flat<string>();
+  for (int i = 0; i < batch; ++i) {
+    s(i) = lines[i % sz];
+  }
+  return t;
+}
+
+Graph* SetupStringSplitGraph(const Tensor& input) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor delim(DT_STRING, TensorShape({}));
+  delim.flat<string>().setConstant(" ");
+
+  TF_CHECK_OK(NodeBuilder("string_split_op", "StringSplit")
+                  .Input(test::graph::Constant(g, input))
+                  .Input(test::graph::Constant(g, delim))
+                  .Finalize(g, nullptr /* node */));
+  return g;
+}
+
+void BM_StringSplit(int iters, int batch_size) {
+  testing::StopTiming();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  testing::UseRealTime();
+  Tensor input = GetTestTensor(batch_size);
+  Graph* g = SetupStringSplitGraph(input);
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+
+BENCHMARK(BM_StringSplit)
+    ->Arg(1)
+    ->Arg(8)
+    ->Arg(16)
+    ->Arg(32)
+    ->Arg(64)
+    ->Arg(128)
+    ->Arg(256);
+
+Graph* SetupStringSplitV2Graph(const Tensor& input) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor sep(DT_STRING, TensorShape({}));
+  sep.flat<string>().setConstant(" ");
+
+  TF_CHECK_OK(NodeBuilder("string_split_op", "StringSplitV2")
+                  .Input(test::graph::Constant(g, input))
+                  .Input(test::graph::Constant(g, sep))
+                  .Finalize(g, nullptr /* node */));
+  return g;
+}
+
+void BM_StringSplitV2(int iters, int batch_size) {
+  testing::StopTiming();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  testing::UseRealTime();
+  Tensor input = GetTestTensor(batch_size);
+  Graph* g = SetupStringSplitV2Graph(input);
+  testing::StartTiming();
+  test::Benchmark("cpu", g).Run(iters);
+}
+
+BENCHMARK(BM_StringSplitV2)
+    ->Arg(1)
+    ->Arg(8)
+    ->Arg(16)
+    ->Arg(32)
+    ->Arg(64)
+    ->Arg(128)
+    ->Arg(256);
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/svd_op_impl.h b/tensorflow/core/kernels/svd_op_impl.h
index a996b67c62..2a67700c12 100644
--- a/tensorflow/core/kernels/svd_op_impl.h
+++ b/tensorflow/core/kernels/svd_op_impl.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_SVD_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_SVD_OP_IMPL_H_
+
 // See docs in ../ops/linalg_ops.cc.
 //
 // This header file is used by the individual svd_*op*.cc files for registering
@@ -101,3 +104,5 @@ class SvdOp : public LinearAlgebraOp<Scalar> {
 };
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SVD_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index 68fab85770..e8dc4fad21 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_TENSOR_ARRAY_H_
-#define TENSORFLOW_KERNELS_TENSOR_ARRAY_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_ARRAY_H_
+#define TENSORFLOW_CORE_KERNELS_TENSOR_ARRAY_H_
 
 #include <limits.h>
 #include <vector>
@@ -629,4 +629,4 @@ Status TensorArray::LockedRead(OpKernelContext* ctx, const int32 index,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_TENSOR_ARRAY_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TENSOR_ARRAY_H_
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index b368ffc875..632b65e9b6 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -1119,8 +1119,8 @@ class TensorArrayUnpackOrScatterOp : public OpKernel {
         {1, num_values, element_shape.num_elements()});
 
     Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, 0};
-    Eigen::DSizes<Eigen::DenseIndex, 3> sizes{1, 1,
-                                              element_shape.num_elements()};
+    Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
+        1, 1, static_cast<Eigen::DenseIndex>(element_shape.num_elements())};
 
     std::vector<PersistentTensor> write_values;
     write_values.reserve(num_values);
@@ -1315,9 +1315,11 @@ class TensorArraySplitOp : public OpKernel {
       PersistentTensor persistent_tensor;
 
       int64 previous_length = (i == 0) ? 0 : cumulative_lengths[i - 1];
-      Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, previous_length, 0};
-      Eigen::DSizes<Eigen::DenseIndex, 3> sizes{1, tensor_lengths_t(i),
-                                                elements_per_row};
+      Eigen::DSizes<Eigen::DenseIndex, 3> indices{
+          0, static_cast<Eigen::DenseIndex>(previous_length), 0};
+      Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
+          1, static_cast<Eigen::DenseIndex>(tensor_lengths_t(i)),
+          static_cast<Eigen::DenseIndex>(elements_per_row)};
 
       OP_REQUIRES_OK(ctx, ctx->allocate_persistent(
                               tensor_array->ElemType(), element_shapes[i],
diff --git a/tensorflow/core/kernels/tile_functor.h b/tensorflow/core/kernels/tile_functor.h
index 189be9239b..95986af8b7 100644
--- a/tensorflow/core/kernels/tile_functor.h
+++ b/tensorflow/core/kernels/tile_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
-#define TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -106,4 +106,4 @@ struct Tile {
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/tile_ops_impl.h b/tensorflow/core/kernels/tile_ops_impl.h
index 9861717a0b..6a9de388c6 100644
--- a/tensorflow/core/kernels/tile_ops_impl.h
+++ b/tensorflow/core/kernels/tile_ops_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_TILE_IMPL_OPS_H_
-#define TENSORFLOW_KERNELS_TILE_IMPL_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_OPS_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_OPS_IMPL_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -68,4 +68,4 @@ struct ReduceAndReshape {
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_TILE_OPS_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_OPS_IMPL_H_
diff --git a/tensorflow/core/kernels/topk_op.h b/tensorflow/core/kernels/topk_op.h
index a53e3ec8d4..1fdbc5b15f 100644
--- a/tensorflow/core/kernels/topk_op.h
+++ b/tensorflow/core/kernels/topk_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TOPK_OP_H_
-#define TENSORFLOW_TOPK_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_H_
+#define TENSORFLOW_CORE_KERNELS_TOPK_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -39,4 +39,4 @@ struct TopKFunctor {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_TOPK_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TOPK_OP_H_
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index 765335d3a0..071cb371a7 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
-#define TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TRAINING_OP_HELPERS_H_
+#define TENSORFLOW_CORE_KERNELS_TRAINING_OP_HELPERS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
@@ -90,4 +90,4 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TRAINING_OP_HELPERS_H_
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 495a94f1a1..e10a4cb125 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_TRAINING_OPS_H_
-#define TENSORFLOW_KERNELS_TRAINING_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TRAINING_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_TRAINING_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -199,4 +199,4 @@ struct ApplyPowerSign {
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_TRAINING_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TRAINING_OPS_H_
diff --git a/tensorflow/core/kernels/typed_conditional_accumulator_base.h b/tensorflow/core/kernels/typed_conditional_accumulator_base.h
index 1980f758fc..9dedb618f9 100644
--- a/tensorflow/core/kernels/typed_conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/typed_conditional_accumulator_base.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_
-#define TENSORFLOW_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_
+#define TENSORFLOW_CORE_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_
 
 #include "tensorflow/core/kernels/conditional_accumulator_base.h"
 
@@ -91,4 +91,4 @@ class TypedConditionalAccumulatorBase : public ConditionalAccumulatorBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TYPED_CONDITIONAL_ACCUMULATOR_BASE_H_
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index f27dab4ddd..4742e429ed 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_VARIABLE_OPS_H_
-#define TENSORFLOW_KERNELS_VARIABLE_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_VARIABLE_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_VARIABLE_OPS_H_
 
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -46,4 +46,4 @@ class VariableOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_VARIABLE_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_VARIABLE_OPS_H_
diff --git a/tensorflow/core/kernels/warn_about_ints.cc b/tensorflow/core/kernels/warn_about_ints.cc
deleted file mode 100644
index 75ecdf2ae4..0000000000
--- a/tensorflow/core/kernels/warn_about_ints.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/warn_about_ints.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-
-namespace tensorflow {
-
-void WarnAboutInts(OpKernelConstruction* context) {
-  DataType dtype;
-  OP_REQUIRES_OK(context, context->GetAttr("T", &dtype));
-  if (DataTypeIsInteger(dtype)) {
-    LOG(WARNING) << "Op " << context->def().name() << " of type "
-                 << context->def().op() << " used with integer dtype "
-                 << DataTypeString(dtype)
-                 << ".  This op was registered with integer support "
-                 << "accidentally, and you won't like the result.";
-  }
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h
index d26849c8bd..e63b3ba8cd 100644
--- a/tensorflow/core/kernels/where_op.h
+++ b/tensorflow/core/kernels/where_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_WHERE_OP_H_
-#define TENSORFLOW_KERNELS_WHERE_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_WHERE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_WHERE_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -63,4 +63,4 @@ struct Where {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_WHERE_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_WHERE_OP_H_
diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h
index 57f51889de..8879d9dd4c 100644
--- a/tensorflow/core/kernels/where_op_gpu.cu.h
+++ b/tensorflow/core/kernels/where_op_gpu.cu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_
+#define TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -346,3 +349,5 @@ TF_CALL_WHERE_GPU_TYPES(DECLARE_GPU_SPEC);
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_WHERE_OP_GPU_CU_H_
diff --git a/tensorflow/core/kernels/xent_op.h b/tensorflow/core/kernels/xent_op.h
index 87be17fca9..23d3ad39a8 100644
--- a/tensorflow/core/kernels/xent_op.h
+++ b/tensorflow/core/kernels/xent_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_XENT_OP_H_
-#define TENSORFLOW_KERNELS_XENT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_XENT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_XENT_OP_H_
 // Functor definition for XentOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -125,4 +125,4 @@ struct XentEigenImpl {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_XENT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_XENT_OP_H_
diff --git a/tensorflow/core/lib/core/arena.h b/tensorflow/core/lib/core/arena.h
index 5698303247..624ee77027 100644
--- a/tensorflow/core/lib/core/arena.h
+++ b/tensorflow/core/lib/core/arena.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // TODO(vrv): Switch this to an open-sourced version of Arena.
 
-#ifndef TENSORFLOW_LIB_CORE_ARENA_H_
-#define TENSORFLOW_LIB_CORE_ARENA_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_ARENA_H_
+#define TENSORFLOW_CORE_LIB_CORE_ARENA_H_
 
 #include <assert.h>
 
@@ -107,4 +107,4 @@ class Arena {
 }  // namespace core
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_CORE_ARENA_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_ARENA_H_
diff --git a/tensorflow/core/lib/core/bits.h b/tensorflow/core/lib/core/bits.h
index 1110ef5c2a..86e539a266 100644
--- a/tensorflow/core/lib/core/bits.h
+++ b/tensorflow/core/lib/core/bits.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_CORE_BITS_H_
-#define TENSORFLOW_LIB_CORE_BITS_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_BITS_H_
+#define TENSORFLOW_CORE_LIB_CORE_BITS_H_
 
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -106,4 +106,4 @@ inline uint64 NextPowerOfTwo64(uint64 value) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_CORE_BITS_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_BITS_H_
diff --git a/tensorflow/core/lib/core/casts.h b/tensorflow/core/lib/core/casts.h
index 0f925c6051..7546d4edc5 100644
--- a/tensorflow/core/lib/core/casts.h
+++ b/tensorflow/core/lib/core/casts.h
@@ -20,8 +20,8 @@ limitations under the License.
 // any changes here, make sure that you're not breaking any platforms.
 //
 
-#ifndef TENSORFLOW_LIB_CORE_CASTS_H_
-#define TENSORFLOW_LIB_CORE_CASTS_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_CASTS_H_
+#define TENSORFLOW_CORE_LIB_CORE_CASTS_H_
 
 #include <string.h>  // for memcpy
 
@@ -97,4 +97,4 @@ inline Dest bit_cast(const Source& source) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_CORE_CASTS_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_CASTS_H_
diff --git a/tensorflow/core/lib/core/coding.h b/tensorflow/core/lib/core/coding.h
index 8265aec870..4a70ffa619 100644
--- a/tensorflow/core/lib/core/coding.h
+++ b/tensorflow/core/lib/core/coding.h
@@ -18,8 +18,8 @@ limitations under the License.
 // * In addition we support variable length "varint" encoding
 // * Strings are encoded prefixed by their length in varint format
 
-#ifndef TENSORFLOW_LIB_CORE_CODING_H_
-#define TENSORFLOW_LIB_CORE_CODING_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_CODING_H_
+#define TENSORFLOW_CORE_LIB_CORE_CODING_H_
 
 #include "tensorflow/core/lib/core/raw_coding.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -76,4 +76,4 @@ extern int VarintLength(uint64_t v);
 }  // namespace core
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_CORE_CODING_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_CODING_H_
diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index a631d9815a..49a8a4dbd4 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_CORE_ERRORS_H_
-#define TENSORFLOW_LIB_CORE_ERRORS_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_ERRORS_H_
+#define TENSORFLOW_CORE_LIB_CORE_ERRORS_H_
 
 #include <sstream>
 
@@ -144,4 +144,4 @@ using ::tensorflow::error::OK;
 }  // namespace errors
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_CORE_ERRORS_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_ERRORS_H_
diff --git a/tensorflow/core/lib/core/notification.h b/tensorflow/core/lib/core/notification.h
index b3e515e28f..5def958e6b 100644
--- a/tensorflow/core/lib/core/notification.h
+++ b/tensorflow/core/lib/core/notification.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_NOTIFICATION_H_
-#define TENSORFLOW_UTIL_NOTIFICATION_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_NOTIFICATION_H_
+#define TENSORFLOW_CORE_LIB_CORE_NOTIFICATION_H_
 
 // Notification implementation is platform-dependent, to support
 // alternative synchronization primitives.
 #include "tensorflow/core/platform/notification.h"
 
-#endif  // TENSORFLOW_UTIL_NOTIFICATION_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_NOTIFICATION_H_
diff --git a/tensorflow/core/lib/core/raw_coding.h b/tensorflow/core/lib/core/raw_coding.h
index 37201b755d..f49214939b 100644
--- a/tensorflow/core/lib/core/raw_coding.h
+++ b/tensorflow/core/lib/core/raw_coding.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_CORE_RAW_CODING_H_
-#define TENSORFLOW_LIB_CORE_RAW_CODING_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_RAW_CODING_H_
+#define TENSORFLOW_CORE_LIB_CORE_RAW_CODING_H_
 
 #include <string.h>
 #include "tensorflow/core/platform/byte_order.h"
@@ -68,4 +68,4 @@ inline uint64 DecodeFixed64(const char* ptr) {
 }  // namespace core
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_CORE_RAW_CODING_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_RAW_CODING_H_
diff --git a/tensorflow/core/lib/core/status.cc b/tensorflow/core/lib/core/status.cc
index 12dfcd284f..cb2a06e620 100644
--- a/tensorflow/core/lib/core/status.cc
+++ b/tensorflow/core/lib/core/status.cc
@@ -22,7 +22,7 @@ Status::Status(tensorflow::error::Code code, StringPiece msg) {
   assert(code != tensorflow::error::OK);
   state_ = std::unique_ptr<State>(new State);
   state_->code = code;
-  state_->msg = msg.ToString();
+  state_->msg = string(msg);
 }
 
 void Status::Update(const Status& new_status) {
diff --git a/tensorflow/core/lib/core/status_test_util.h b/tensorflow/core/lib/core/status_test_util.h
index b35633c9da..c695caa8d1 100644
--- a/tensorflow/core/lib/core/status_test_util.h
+++ b/tensorflow/core/lib/core/status_test_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_CORE_STATUS_TEST_UTIL_H_
-#define TENSORFLOW_LIB_CORE_STATUS_TEST_UTIL_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_STATUS_TEST_UTIL_H_
+#define TENSORFLOW_CORE_LIB_CORE_STATUS_TEST_UTIL_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/test.h"
@@ -31,4 +31,4 @@ limitations under the License.
 // If you want to check for particular errors, a better alternative is:
 // EXPECT_EQ(..expected tensorflow::error::Code..., status.code());
 
-#endif  // TENSORFLOW_LIB_CORE_STATUS_TEST_UTIL_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_STATUS_TEST_UTIL_H_
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index d7ecc44e50..be659e5f8e 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -23,14 +23,15 @@ limitations under the License.
 // non-const method, all threads accessing the same StringPiece must use
 // external synchronization.
 
-#ifndef TENSORFLOW_LIB_CORE_STRINGPIECE_H_
-#define TENSORFLOW_LIB_CORE_STRINGPIECE_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_STRINGPIECE_H_
+#define TENSORFLOW_CORE_LIB_CORE_STRINGPIECE_H_
 
 #include <assert.h>
 #include <stddef.h>
 #include <string.h>
 #include <iosfwd>
 #include <string>
+#include <type_traits>
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -101,11 +102,18 @@ class StringPiece {
   //   >  0 iff "*this" >  "b"
   int compare(StringPiece b) const;
 
-  // Converts to `std::basic_string`.
-  template <typename A>
-  explicit operator std::basic_string<char, std::char_traits<char>, A>() const {
+  // Converts to various kinds of strings, including `std::basic_string`.
+  template <typename S>
+  explicit operator S() const {
+    static_assert(
+        std::is_same<char, typename S::value_type>::value,
+        "Type mismatch: S must be a string with character type char.");
+    static_assert(
+        std::is_same<std::char_traits<char>, typename S::traits_type>::value,
+        "Type mismatch: S must be a string with traits type "
+        "std::char_traits<char>.");
     if (!data()) return {};
-    return std::basic_string<char, std::char_traits<char>, A>(data(), size());
+    return S(data(), size());
   }
 
  private:
@@ -148,4 +156,4 @@ extern std::ostream& operator<<(std::ostream& o, tensorflow::StringPiece piece);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_CORE_STRINGPIECE_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_STRINGPIECE_H_
diff --git a/tensorflow/core/lib/core/stringpiece_test.cc b/tensorflow/core/lib/core/stringpiece_test.cc
index 952b9eaaaa..e4b489fe17 100644
--- a/tensorflow/core/lib/core/stringpiece_test.cc
+++ b/tensorflow/core/lib/core/stringpiece_test.cc
@@ -56,8 +56,8 @@ TEST(StringPiece, Ctor) {
 }
 
 TEST(StringPiece, ConversionToString) {
-  EXPECT_EQ("", std::string(StringPiece("")));
-  EXPECT_EQ("foo", std::string(StringPiece("foo")));
+  EXPECT_EQ("", string(StringPiece("")));
+  EXPECT_EQ("foo", string(StringPiece("foo")));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h
index b89b74b8de..74df7c84a4 100644
--- a/tensorflow/core/lib/core/threadpool.h
+++ b/tensorflow/core/lib/core/threadpool.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_CORE_THREADPOOL_H_
-#define TENSORFLOW_LIB_CORE_THREADPOOL_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_THREADPOOL_H_
+#define TENSORFLOW_CORE_LIB_CORE_THREADPOOL_H_
 
 #include <functional>
 #include <memory>
@@ -108,4 +108,4 @@ class ThreadPool {
 }  // namespace thread
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_CORE_THREADPOOL_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_THREADPOOL_H_
diff --git a/tensorflow/core/lib/gtl/array_slice.h b/tensorflow/core/lib/gtl/array_slice.h
index 002d166c72..4ecc96ee79 100644
--- a/tensorflow/core/lib/gtl/array_slice.h
+++ b/tensorflow/core/lib/gtl/array_slice.h
@@ -91,8 +91,8 @@ limitations under the License.
 //   for (int i = 0; i < 10; ++i) { my_proto.add_value(i); }
 //   MyMutatingRoutine(my_proto.mutable_value());
 
-#ifndef TENSORFLOW_LIB_GTL_ARRAY_SLICE_H_
-#define TENSORFLOW_LIB_GTL_ARRAY_SLICE_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_ARRAY_SLICE_H_
+#define TENSORFLOW_CORE_LIB_GTL_ARRAY_SLICE_H_
 
 #include <initializer_list>
 #include <type_traits>
@@ -311,4 +311,4 @@ const typename MutableArraySlice<T>::size_type MutableArraySlice<T>::npos;
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_GTL_ARRAY_SLICE_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_ARRAY_SLICE_H_
diff --git a/tensorflow/core/lib/gtl/cleanup.h b/tensorflow/core/lib/gtl/cleanup.h
index 6bd60ca482..8c73dc6aa9 100644
--- a/tensorflow/core/lib/gtl/cleanup.h
+++ b/tensorflow/core/lib/gtl/cleanup.h
@@ -39,8 +39,8 @@ limitations under the License.
 //
 // You can call 'release()' on a Cleanup object to cancel the cleanup.
 
-#ifndef TENSORFLOW_LIB_GTL_CLEANUP_H_
-#define TENSORFLOW_LIB_GTL_CLEANUP_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_CLEANUP_H_
+#define TENSORFLOW_CORE_LIB_GTL_CLEANUP_H_
 
 #include <type_traits>
 #include <utility>
@@ -110,4 +110,4 @@ TF_MUST_USE_RESULT Cleanup<DecayF> MakeCleanup(F&& f) {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_GTL_CLEANUP_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_CLEANUP_H_
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index 2011f7d4a1..c18dc9ad1a 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -28,8 +28,8 @@ limitations under the License.
 //
 // TODO(billydonahue): change size_t to size_type where appropriate.
 
-#ifndef TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
-#define TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_INLINED_VECTOR_H_
+#define TENSORFLOW_CORE_LIB_GTL_INLINED_VECTOR_H_
 
 #include <stddef.h>
 #include <stdlib.h>
@@ -685,4 +685,4 @@ inline void InlinedVector<T, N>::AppendRange(Iter first, Iter last) {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_INLINED_VECTOR_H_
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index 4ee3f88d18..7ad916ad3d 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_GTL_OPTIONAL_H_
-#define TENSORFLOW_LIB_GTL_OPTIONAL_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_OPTIONAL_H_
+#define TENSORFLOW_CORE_LIB_GTL_OPTIONAL_H_
 
 #include <assert.h>
 #include <functional>
@@ -873,4 +873,4 @@ struct hash<::tensorflow::gtl::optional<T>> {
 
 }  // namespace std
 
-#endif  // TENSORFLOW_LIB_GTL_OPTIONAL_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_OPTIONAL_H_
diff --git a/tensorflow/core/lib/gtl/priority_queue_util.h b/tensorflow/core/lib/gtl/priority_queue_util.h
index 07311e3725..93bf3d3037 100644
--- a/tensorflow/core/lib/gtl/priority_queue_util.h
+++ b/tensorflow/core/lib/gtl/priority_queue_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_GTL_PRIORITY_QUEUE_UTIL_H_
-#define TENSORFLOW_LIB_GTL_PRIORITY_QUEUE_UTIL_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_PRIORITY_QUEUE_UTIL_H_
+#define TENSORFLOW_CORE_LIB_GTL_PRIORITY_QUEUE_UTIL_H_
 
 #include <algorithm>
 #include <queue>
@@ -52,4 +52,4 @@ T ConsumeTop(std::priority_queue<T, Container, Comparator>* q) {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_GTL_PRIORITY_QUEUE_UTIL_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_PRIORITY_QUEUE_UTIL_H_
diff --git a/tensorflow/core/lib/hash/crc32c.h b/tensorflow/core/lib/hash/crc32c.h
index ee0bda93b1..2718cd31b3 100644
--- a/tensorflow/core/lib/hash/crc32c.h
+++ b/tensorflow/core/lib/hash/crc32c.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_HASH_CRC32C_H_
-#define TENSORFLOW_LIB_HASH_CRC32C_H_
+#ifndef TENSORFLOW_CORE_LIB_HASH_CRC32C_H_
+#define TENSORFLOW_CORE_LIB_HASH_CRC32C_H_
 
 #include <stddef.h>
 #include "tensorflow/core/platform/types.h"
@@ -51,4 +51,4 @@ inline uint32 Unmask(uint32 masked_crc) {
 }  // namespace crc32c
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_HASH_CRC32C_H_
+#endif  // TENSORFLOW_CORE_LIB_HASH_CRC32C_H_
diff --git a/tensorflow/core/lib/hash/hash.h b/tensorflow/core/lib/hash/hash.h
index 737d23f699..675bab7191 100644
--- a/tensorflow/core/lib/hash/hash.h
+++ b/tensorflow/core/lib/hash/hash.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Simple hash functions used for internal data structures
 
-#ifndef TENSORFLOW_LIB_HASH_HASH_H_
-#define TENSORFLOW_LIB_HASH_HASH_H_
+#ifndef TENSORFLOW_CORE_LIB_HASH_HASH_H_
+#define TENSORFLOW_CORE_LIB_HASH_HASH_H_
 
 #include <stddef.h>
 #include <stdint.h>
@@ -110,4 +110,4 @@ struct hash<std::pair<T, U>> {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_HASH_HASH_H_
+#endif  // TENSORFLOW_CORE_LIB_HASH_HASH_H_
diff --git a/tensorflow/core/lib/histogram/histogram.h b/tensorflow/core/lib/histogram/histogram.h
index 65ce10786d..f882ee9abe 100644
--- a/tensorflow/core/lib/histogram/histogram.h
+++ b/tensorflow/core/lib/histogram/histogram.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_HISTOGRAM_HISTOGRAM_H_
-#define TENSORFLOW_LIB_HISTOGRAM_HISTOGRAM_H_
+#ifndef TENSORFLOW_CORE_LIB_HISTOGRAM_HISTOGRAM_H_
+#define TENSORFLOW_CORE_LIB_HISTOGRAM_HISTOGRAM_H_
 
 #include <string>
 #include <vector>
@@ -136,4 +136,4 @@ class ThreadSafeHistogram {
 }  // namespace histogram
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_HISTOGRAM_HISTOGRAM_H_
+#endif  // TENSORFLOW_CORE_LIB_HISTOGRAM_HISTOGRAM_H_
diff --git a/tensorflow/core/lib/io/buffered_inputstream.h b/tensorflow/core/lib/io/buffered_inputstream.h
index 924619f40f..96a95b7ed9 100644
--- a/tensorflow/core/lib/io/buffered_inputstream.h
+++ b/tensorflow/core/lib/io/buffered_inputstream.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_BUFFERED_INPUTSTREAM_H_
-#define TENSORFLOW_LIB_IO_BUFFERED_INPUTSTREAM_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_BUFFERED_INPUTSTREAM_H_
+#define TENSORFLOW_CORE_LIB_IO_BUFFERED_INPUTSTREAM_H_
 
 #include "tensorflow/core/lib/io/inputstream_interface.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -104,4 +104,4 @@ class BufferedInputStream : public InputStreamInterface {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_BUFFERED_INPUTSTREAM_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_BUFFERED_INPUTSTREAM_H_
diff --git a/tensorflow/core/lib/io/inputstream_interface.h b/tensorflow/core/lib/io/inputstream_interface.h
index 3083d20776..cbfc509d93 100644
--- a/tensorflow/core/lib/io/inputstream_interface.h
+++ b/tensorflow/core/lib/io/inputstream_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_INPUTSTREAM_INTERFACE_H_
-#define TENSORFLOW_LIB_IO_INPUTSTREAM_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_
+#define TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_
 
 #include <string>
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/lib/io/path.h b/tensorflow/core/lib/io/path.h
index 818ba99888..e3649fd0c9 100644
--- a/tensorflow/core/lib/io/path.h
+++ b/tensorflow/core/lib/io/path.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_PATH_H_
-#define TENSORFLOW_LIB_IO_PATH_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_PATH_H_
+#define TENSORFLOW_CORE_LIB_IO_PATH_H_
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 
@@ -94,4 +94,4 @@ string GetTempFilename(const string& extension);
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_PATH_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_PATH_H_
diff --git a/tensorflow/core/lib/io/proto_encode_helper.h b/tensorflow/core/lib/io/proto_encode_helper.h
index f70e1cbaab..34905520f1 100644
--- a/tensorflow/core/lib/io/proto_encode_helper.h
+++ b/tensorflow/core/lib/io/proto_encode_helper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_PROTO_ENCODE_HELPER_H_
-#define TENSORFLOW_LIB_IO_PROTO_ENCODE_HELPER_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_PROTO_ENCODE_HELPER_H_
+#define TENSORFLOW_CORE_LIB_IO_PROTO_ENCODE_HELPER_H_
 
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -95,4 +95,4 @@ class ProtoEncodeHelper {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_PROTO_ENCODE_HELPER_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_PROTO_ENCODE_HELPER_H_
diff --git a/tensorflow/core/lib/io/random_inputstream.h b/tensorflow/core/lib/io/random_inputstream.h
index bdbdbd71ff..c822fe50e9 100644
--- a/tensorflow/core/lib/io/random_inputstream.h
+++ b/tensorflow/core/lib/io/random_inputstream.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_RANDOM_INPUTSTREAM_H_
-#define TENSORFLOW_LIB_IO_RANDOM_INPUTSTREAM_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_RANDOM_INPUTSTREAM_H_
+#define TENSORFLOW_CORE_LIB_IO_RANDOM_INPUTSTREAM_H_
 
 #include "tensorflow/core/lib/io/inputstream_interface.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -54,4 +54,4 @@ class RandomAccessInputStream : public InputStreamInterface {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_RANDOM_INPUTSTREAM_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_RANDOM_INPUTSTREAM_H_
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index f6d587dfa0..c05f9e1b36 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_RECORD_READER_H_
-#define TENSORFLOW_LIB_IO_RECORD_READER_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_
+#define TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -122,4 +122,4 @@ class SequentialRecordReader {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_RECORD_READER_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_
diff --git a/tensorflow/core/lib/io/record_writer.h b/tensorflow/core/lib/io/record_writer.h
index daed809af3..2f6afa5487 100644
--- a/tensorflow/core/lib/io/record_writer.h
+++ b/tensorflow/core/lib/io/record_writer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_RECORD_WRITER_H_
-#define TENSORFLOW_LIB_IO_RECORD_WRITER_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_RECORD_WRITER_H_
+#define TENSORFLOW_CORE_LIB_IO_RECORD_WRITER_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -82,4 +82,4 @@ class RecordWriter {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_RECORD_WRITER_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_RECORD_WRITER_H_
diff --git a/tensorflow/core/lib/io/table.h b/tensorflow/core/lib/io/table.h
index a1b78eae5b..b9c6b8d9d2 100644
--- a/tensorflow/core/lib/io/table.h
+++ b/tensorflow/core/lib/io/table.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_TABLE_H_
-#define TENSORFLOW_LIB_IO_TABLE_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_TABLE_H_
+#define TENSORFLOW_CORE_LIB_IO_TABLE_H_
 
 #include <stdint.h>
 #include "tensorflow/core/lib/io/iterator.h"
@@ -84,4 +84,4 @@ class Table {
 }  // namespace table
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_TABLE_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_TABLE_H_
diff --git a/tensorflow/core/lib/io/table_builder.h b/tensorflow/core/lib/io/table_builder.h
index 0202f90446..0e37e0a77f 100644
--- a/tensorflow/core/lib/io/table_builder.h
+++ b/tensorflow/core/lib/io/table_builder.h
@@ -21,8 +21,8 @@ limitations under the License.
 // non-const method, all threads accessing the same TableBuilder must use
 // external synchronization.
 
-#ifndef TENSORFLOW_LIB_IO_TABLE_BUILDER_H_
-#define TENSORFLOW_LIB_IO_TABLE_BUILDER_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_TABLE_BUILDER_H_
+#define TENSORFLOW_CORE_LIB_IO_TABLE_BUILDER_H_
 
 #include <stdint.h>
 #include "tensorflow/core/lib/core/status.h"
@@ -96,4 +96,4 @@ class TableBuilder {
 }  // namespace table
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_TABLE_BUILDER_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_TABLE_BUILDER_H_
diff --git a/tensorflow/core/lib/io/table_options.h b/tensorflow/core/lib/io/table_options.h
index fd8a9d4a78..9a36bf1631 100644
--- a/tensorflow/core/lib/io/table_options.h
+++ b/tensorflow/core/lib/io/table_options.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_IO_TABLE_OPTIONS_H_
-#define TENSORFLOW_LIB_IO_TABLE_OPTIONS_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_TABLE_OPTIONS_H_
+#define TENSORFLOW_CORE_LIB_IO_TABLE_OPTIONS_H_
 
 #include <stddef.h>
 
@@ -65,4 +65,4 @@ struct Options {
 }  // namespace table
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_IO_TABLE_OPTIONS_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_TABLE_OPTIONS_H_
diff --git a/tensorflow/core/lib/jpeg/jpeg_handle.h b/tensorflow/core/lib/jpeg/jpeg_handle.h
index 7d86be51da..86fa3ac5c2 100644
--- a/tensorflow/core/lib/jpeg/jpeg_handle.h
+++ b/tensorflow/core/lib/jpeg/jpeg_handle.h
@@ -16,8 +16,8 @@ limitations under the License.
 // This file declares the functions and structures for memory I/O with libjpeg
 // These functions are not meant to be used directly, see jpeg_mem.h instead.
 
-#ifndef TENSORFLOW_LIB_JPEG_JPEG_HANDLE_H_
-#define TENSORFLOW_LIB_JPEG_JPEG_HANDLE_H_
+#ifndef TENSORFLOW_CORE_LIB_JPEG_JPEG_HANDLE_H_
+#define TENSORFLOW_CORE_LIB_JPEG_JPEG_HANDLE_H_
 
 #include "tensorflow/core/platform/jpeg.h"
 #include "tensorflow/core/platform/types.h"
@@ -57,4 +57,4 @@ void SetDest(j_compress_ptr cinfo, void *buffer, int bufsize,
 }  // namespace jpeg
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_JPEG_JPEG_HANDLE_H_
+#endif  // TENSORFLOW_CORE_LIB_JPEG_JPEG_HANDLE_H_
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.h b/tensorflow/core/lib/jpeg/jpeg_mem.h
index 59342d28c0..03437a4e78 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.h
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.h
@@ -18,8 +18,8 @@ limitations under the License.
 // (data array and size fields).
 // Direct manipulation of JPEG strings are supplied: Flip, Rotate, Crop..
 
-#ifndef TENSORFLOW_LIB_JPEG_JPEG_MEM_H_
-#define TENSORFLOW_LIB_JPEG_JPEG_MEM_H_
+#ifndef TENSORFLOW_CORE_LIB_JPEG_JPEG_MEM_H_
+#define TENSORFLOW_CORE_LIB_JPEG_JPEG_MEM_H_
 
 #include <functional>
 #include <string>
@@ -159,4 +159,4 @@ bool Compress(const void* srcdata, int width, int height,
 }  // namespace jpeg
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_JPEG_JPEG_MEM_H_
+#endif  // TENSORFLOW_CORE_LIB_JPEG_JPEG_MEM_H_
diff --git a/tensorflow/core/lib/math/math_util.h b/tensorflow/core/lib/math/math_util.h
index 41d486f2bd..502d741512 100644
--- a/tensorflow/core/lib/math/math_util.h
+++ b/tensorflow/core/lib/math/math_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_MATH_MATH_UTIL_H_
-#define TENSORFLOW_LIB_MATH_MATH_UTIL_H_
+#ifndef TENSORFLOW_CORE_LIB_MATH_MATH_UTIL_H_
+#define TENSORFLOW_CORE_LIB_MATH_MATH_UTIL_H_
 
 #include <type_traits>
 
@@ -160,4 +160,4 @@ T MathUtil::IPow(T base, int exp) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_MATH_MATH_UTIL_H_
+#endif  // TENSORFLOW_CORE_LIB_MATH_MATH_UTIL_H_
diff --git a/tensorflow/core/lib/monitoring/collection_registry.cc b/tensorflow/core/lib/monitoring/collection_registry.cc
index 8c28620ff9..fface033cb 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.cc
+++ b/tensorflow/core/lib/monitoring/collection_registry.cc
@@ -38,15 +38,15 @@ void Collector::CollectMetricDescriptor(
     mutex_lock l(mu_);
     return collected_metrics_->metric_descriptor_map
         .insert(std::make_pair(
-            std::string(metric_def->name()),
+            string(metric_def->name()),
             std::unique_ptr<MetricDescriptor>(new MetricDescriptor())))
         .first->second.get();
   }();
-  metric_descriptor->name = std::string(metric_def->name());
-  metric_descriptor->description = std::string(metric_def->description());
+  metric_descriptor->name = string(metric_def->name());
+  metric_descriptor->description = string(metric_def->description());
 
   for (const StringPiece label_name : metric_def->label_descriptions()) {
-    metric_descriptor->label_names.push_back(std::string(label_name));
+    metric_descriptor->label_names.emplace_back(label_name);
   }
 
   metric_descriptor->metric_kind = metric_def->kind();
diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h
index 20f0444f8b..c204d52cfe 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.h
+++ b/tensorflow/core/lib/monitoring/collection_registry.h
@@ -72,7 +72,7 @@ class MetricCollector {
         registration_time_millis_(registration_time_millis),
         collector_(collector),
         point_set_(point_set) {
-    point_set_->metric_name = std::string(metric_def->name());
+    point_set_->metric_name = string(metric_def->name());
   }
 
   const MetricDef<metric_kind, Value, NumLabels>* const metric_def_;
@@ -261,7 +261,7 @@ class Collector {
     auto* const point_set = [&]() {
       mutex_lock l(mu_);
       return collected_metrics_->point_set_map
-          .insert(std::make_pair(std::string(metric_def->name()),
+          .insert(std::make_pair(string(metric_def->name()),
                                  std::unique_ptr<PointSet>(new PointSet())))
           .first->second.get();
     }();
diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h
index 6f94685665..756e5c2af8 100644
--- a/tensorflow/core/lib/monitoring/metric_def.h
+++ b/tensorflow/core/lib/monitoring/metric_def.h
@@ -98,8 +98,8 @@ class AbstractMetricDef {
                     const std::vector<string>& label_descriptions)
       : kind_(kind),
         value_type_(value_type),
-        name_(std::string(name)),
-        description_(std::string(description)),
+        name_(name),
+        description_(description),
         label_descriptions_(std::vector<string>(label_descriptions.begin(),
                                                 label_descriptions.end())) {}
 
diff --git a/tensorflow/core/lib/random/distribution_sampler.h b/tensorflow/core/lib/random/distribution_sampler.h
index 25605d8ed4..7aa50ece03 100644
--- a/tensorflow/core/lib/random/distribution_sampler.h
+++ b/tensorflow/core/lib/random/distribution_sampler.h
@@ -28,8 +28,8 @@ limitations under the License.
 //
 // The algorithm used is Walker's Aliasing algorithm, described in Knuth, Vol 2.
 
-#ifndef TENSORFLOW_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_
-#define TENSORFLOW_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_
+#ifndef TENSORFLOW_CORE_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_
+#define TENSORFLOW_CORE_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_
 
 #include <memory>
 #include <utility>
@@ -91,4 +91,4 @@ class DistributionSampler {
 }  // namespace random
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_
+#endif  // TENSORFLOW_CORE_LIB_RANDOM_DISTRIBUTION_SAMPLER_H_
diff --git a/tensorflow/core/lib/random/philox_random.h b/tensorflow/core/lib/random/philox_random.h
index b2adb4462b..058ed95ffb 100644
--- a/tensorflow/core/lib/random/philox_random.h
+++ b/tensorflow/core/lib/random/philox_random.h
@@ -17,8 +17,8 @@ limitations under the License.
 // Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
 //   http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
 
-#ifndef TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
-#define TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
+#ifndef TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
+#define TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
 
 #include <stdlib.h>
 
@@ -248,4 +248,4 @@ class PhiloxRandom {
 }  // namespace random
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
+#endif  // TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index e963511f5c..c3801a0412 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
-#define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
+#ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
+#define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
 
 #define _USE_MATH_DEFINES
 #include <math.h>
@@ -744,4 +744,4 @@ PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
 }  // namespace random
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
+#endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
diff --git a/tensorflow/core/lib/random/simple_philox.h b/tensorflow/core/lib/random/simple_philox.h
index d529e08913..6464036856 100644
--- a/tensorflow/core/lib/random/simple_philox.h
+++ b/tensorflow/core/lib/random/simple_philox.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_RANDOM_SIMPLE_PHILOX_H_
-#define TENSORFLOW_LIB_RANDOM_SIMPLE_PHILOX_H_
+#ifndef TENSORFLOW_CORE_LIB_RANDOM_SIMPLE_PHILOX_H_
+#define TENSORFLOW_CORE_LIB_RANDOM_SIMPLE_PHILOX_H_
 
 #include <math.h>
 #include <string.h>
@@ -73,4 +73,4 @@ class SimplePhilox {
 }  // namespace random
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_RANDOM_SIMPLE_PHILOX_H_
+#endif  // TENSORFLOW_CORE_LIB_RANDOM_SIMPLE_PHILOX_H_
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 1d5bacac93..959290ba8c 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_STRINGS_NUMBERS_H_
-#define TENSORFLOW_LIB_STRINGS_NUMBERS_H_
+#ifndef TENSORFLOW_CORE_LIB_STRINGS_NUMBERS_H_
+#define TENSORFLOW_CORE_LIB_STRINGS_NUMBERS_H_
 
 #include <string>
 
@@ -140,11 +140,11 @@ inline bool ProtoParseNumeric(StringPiece s, uint64* value) {
 }
 
 inline bool ProtoParseNumeric(StringPiece s, float* value) {
-  return safe_strtof(std::string(s).c_str(), value);
+  return safe_strtof(s, value);
 }
 
 inline bool ProtoParseNumeric(StringPiece s, double* value) {
-  return safe_strtod(std::string(s).c_str(), value);
+  return safe_strtod(s, value);
 }
 
 // Convert strings to number of type T.
@@ -176,4 +176,4 @@ string HumanReadableElapsedTime(double seconds);
 }  // namespace strings
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_STRINGS_NUMBERS_H_
+#endif  // TENSORFLOW_CORE_LIB_STRINGS_NUMBERS_H_
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index cab8f81585..3aba5ec80e 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -332,7 +332,7 @@ string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
                      bool replace_all) {
   // TODO(jlebar): We could avoid having to shift data around in the string if
   // we had a StringPiece::find() overload that searched for a StringPiece.
-  string res = std::string(s);
+  string res(s);
   size_t pos = 0;
   while ((pos = res.find(oldsub.data(), pos, oldsub.size())) != string::npos) {
     res.replace(pos, oldsub.size(), newsub.data(), newsub.size());
@@ -448,8 +448,7 @@ bool SplitAndParseAsFloats(StringPiece text, char delim,
                            std::vector<float>* result) {
   return SplitAndParseAsInts<float>(text, delim,
                                     [](StringPiece str, float* value) {
-                                      return strings::safe_strtof(
-                                          std::string(str).c_str(), value);
+                                      return strings::safe_strtof(str, value);
                                     },
                                     result);
 }
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index c887db7eff..9f52cf29fc 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LIB_STRINGS_STR_UTIL_H_
-#define TENSORFLOW_LIB_STRINGS_STR_UTIL_H_
+#ifndef TENSORFLOW_CORE_LIB_STRINGS_STR_UTIL_H_
+#define TENSORFLOW_CORE_LIB_STRINGS_STR_UTIL_H_
 
 #include <functional>
 #include <string>
@@ -205,7 +205,7 @@ std::vector<string> Split(StringPiece text, StringPiece delims, Predicate p) {
       if ((i == text.size()) || (delims.find(text[i]) != StringPiece::npos)) {
         StringPiece token(text.data() + token_start, i - token_start);
         if (p(token)) {
-          result.push_back(std::string(token));
+          result.emplace_back(token);
         }
         token_start = i + 1;
       }
@@ -231,4 +231,4 @@ size_t Strnlen(const char* str, const size_t string_max_len);
 }  // namespace str_util
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_STRINGS_STR_UTIL_H_
+#endif  // TENSORFLOW_CORE_LIB_STRINGS_STR_UTIL_H_
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index fb2cd5bc7e..5ae3d220e3 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -17,8 +17,8 @@ limitations under the License.
 // #category: operations on strings
 // #summary: Merges strings or numbers with no delimiter.
 //
-#ifndef TENSORFLOW_LIB_STRINGS_STRCAT_H_
-#define TENSORFLOW_LIB_STRINGS_STRCAT_H_
+#ifndef TENSORFLOW_CORE_LIB_STRINGS_STRCAT_H_
+#define TENSORFLOW_CORE_LIB_STRINGS_STRCAT_H_
 
 #include <string>
 
@@ -233,4 +233,4 @@ inline void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b,
 }  // namespace strings
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_STRINGS_STRCAT_H_
+#endif  // TENSORFLOW_CORE_LIB_STRINGS_STRCAT_H_
diff --git a/tensorflow/core/lib/strings/stringprintf.h b/tensorflow/core/lib/strings/stringprintf.h
index f7957252ea..52af410d42 100644
--- a/tensorflow/core/lib/strings/stringprintf.h
+++ b/tensorflow/core/lib/strings/stringprintf.h
@@ -20,8 +20,8 @@ limitations under the License.
 //      strings::SPrintf(&result, "%d %s\n", 10, "hello");
 //      strings::Appendf(&result, "%d %s\n", 20, "there");
 
-#ifndef TENSORFLOW_LIB_STRINGS_STRINGPRINTF_H_
-#define TENSORFLOW_LIB_STRINGS_STRINGPRINTF_H_
+#ifndef TENSORFLOW_CORE_LIB_STRINGS_STRINGPRINTF_H_
+#define TENSORFLOW_CORE_LIB_STRINGS_STRINGPRINTF_H_
 
 #include <stdarg.h>
 #include <string>
@@ -49,4 +49,4 @@ extern void Appendv(string* dst, const char* format, va_list ap);
 }  // namespace strings
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_LIB_STRINGS_STRINGPRINTF_H_
+#endif  // TENSORFLOW_CORE_LIB_STRINGS_STRINGPRINTF_H_
diff --git a/tensorflow/core/ops/array_grad.cc b/tensorflow/core/ops/array_grad.cc
index 1f2e57e9a9..3d03bc1d5f 100644
--- a/tensorflow/core/ops/array_grad.cc
+++ b/tensorflow/core/ops/array_grad.cc
@@ -354,6 +354,27 @@ Status TransposeGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Transpose", TransposeGrad);
 
+Status GatherNdGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  *g = FDH::Define(
+      // Arg defs
+      {"params: Tparams", "indices: Tindices", "doutput: Tparams"},
+      // Ret val defs
+      {"dparams: Tparams", "dindices: Tindices"},
+      // Attr defs
+      {"Tparams: type", "Tindices: type"},
+      // Nodes
+      {
+        {{"x_shape"}, "Shape", {"params"}, {{"T", "$Tparams"}}},
+        {{"dparams"}, "ScatterNd", {"indices", "doutput", "x_shape"},
+         {{"T", "$Tparams"}, {"Tindices", "$Tindices"}}},
+        {{"dindices"}, "ZerosLike", {"indices"}, {{"T", "$Tindices"}}},
+      });
+  // clang-format on
+  return Status::OK();
+}
+REGISTER_OP_GRADIENT("GatherNd", GatherNdGrad);
+
 Status ConjugateTransposeGrad(const AttrSlice& attrs, FunctionDef* g) {
   *g = FDH::Define(
       // Arg defs
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index f87b4d6bde..1d11ec00ce 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -702,6 +702,16 @@ REGISTER_OP("Const")
       return Status::OK();
     });
 
+// Returns a constant tensor on the host.  Useful for writing C++ tests
+// and benchmarks which run on GPU but require arguments pinned to the host.
+// Used by test::graph::HostConstant.
+// value: Attr `value` is the tensor to return.
+REGISTER_OP("HostConst")
+    .Output("output: dtype")
+    .Attr("value: tensor")
+    .Attr("dtype: type")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 // --------------------------------------------------------------------------
 // TODO(mgubin): Update the doc when the freeze_graph script supports converting
 // into memmapped format.
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index c15409a246..03dab390a7 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -1620,6 +1620,24 @@ TEST(ArrayOpsTest, Slice_ShapeFn) {
   INFER_ERROR("cannot be < -1", op, "[2,3,4,5];[4];[4]");
 }
 
+TEST(ArrayOpsTest, StridedSlice_ShapeFn) {
+  ShapeInferenceTestOp op("StridedSlice");
+  TF_ASSERT_OK(NodeDefBuilder("test", "StridedSlice")
+                   .Input("input", 0, DT_FLOAT)
+                   .Input("begin", 1, DT_INT32)
+                   .Input("end", 2, DT_INT32)
+                   .Input("strides", 3, DT_INT32)
+                   .Attr("shrink_axis_mask", 1)
+                   .Finalize(&op.node_def));
+  op.input_tensors.resize(4);
+  Tensor strides = test::AsTensor<int32>({1});
+  op.input_tensors[3] = &strides;
+  // Slicing on the 0-th dimension.
+  INFER_OK(op, "[2,3,4,5];[1];[1];[1]", "[3,4,5]");
+  // Slicing on the 0-th dimension. This time some of the result dimension is 0.
+  INFER_OK(op, "[2,0,3,4];[1];[1];[1]", "[0,3,4]");
+}
+
 TEST(ArrayOpsTest, StridedSliceGrad_ShapeFn) {
   ShapeInferenceTestOp op("StridedSliceGrad");
   op.input_tensors.resize(5);
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 44dddffd59..97a212b8f3 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -20317,6 +20317,31 @@ op {
   }
 }
 op {
+  name: "DivNoNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
   name: "DrawBoundingBoxes"
   input_arg {
     name: "images"
@@ -25613,6 +25638,21 @@ op {
   }
 }
 op {
+  name: "HostConst"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
   name: "IFFT"
   input_arg {
     name: "input"
@@ -29976,6 +30016,32 @@ op {
   }
 }
 op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 27
+  }
+}
+op {
   name: "MatrixInverse"
   input_arg {
     name: "input"
@@ -37269,6 +37335,76 @@ op {
   }
 }
 op {
+  name: "ParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
   name: "ParseSingleExample"
   input_arg {
     name: "serialized"
@@ -68819,6 +68955,32 @@ op {
   }
 }
 op {
+  name: "StaticRegexReplace"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "pattern"
+    type: "string"
+  }
+  attr {
+    name: "rewrite"
+    type: "string"
+  }
+  attr {
+    name: "replace_global"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
   name: "StatsAggregatorHandle"
   output_arg {
     name: "handle"
@@ -69119,6 +69281,17 @@ op {
   }
 }
 op {
+  name: "StringLength"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+}
+op {
   name: "StringSplit"
   input_arg {
     name: "input"
@@ -73391,41 +73564,6 @@ op {
   }
 }
 op {
-  name: "UnsafeDiv"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
   name: "UnsortedSegmentMax"
   input_arg {
     name: "data"
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 13733d48f0..41f5f9aebe 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -166,6 +166,22 @@ REGISTER_OP("LatencyStatsDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ParseExampleDataset")
+    .Input("input_dataset: variant")
+    .Input("num_parallel_calls: int64")
+    .Input("dense_defaults: Tdense")
+    .Output("handle: variant")
+    .Attr("sparse_keys: list(string) >= 0")
+    .Attr("dense_keys: list(string) >= 0")
+    .Attr("sparse_types: list({float,int64,string}) >= 0")
+    .Attr("Tdense: list({float,int64,string}) >= 0")
+    .Attr("dense_shapes: list(shape) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")  // Output components will be
+                                              // sorted by key (dense_keys and
+                                              // sparse_keys combined) here.
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("FeatureStatsDataset")
     .Input("input_dataset: variant")
     .Input("tag: string")
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index f37f79ddbf..1d4d51a25d 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -235,6 +235,8 @@ REGISTER_OP("MatrixInverse")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("MatrixExponential")
+    .Deprecated(
+        27, "Use Python implementation tf.linalg.matrix_exponential instead.")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {double, float, complex64, complex128}")
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index 7c71406c6b..72a77be70d 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -294,7 +294,9 @@ REGISTER_OP("LookupTableImportV2")
       ShapeHandle handle;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
 
-      // TODO: Validate keys and values shape.
+      ShapeHandle keys;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
+      TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index 57499a6f1d..07f876cb90 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -495,18 +495,18 @@ Status RealDivGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("RealDiv", RealDivGrad);
 
-Status UnsafeDivGrad(const AttrSlice& attrs, FunctionDef* g) {
+Status DivNoNanGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   return GradForBinaryCwise(g, {
-      {{"gx"}, "UnsafeDiv", {"dz", "y"}},
+      {{"gx"}, "DivNoNan", {"dz", "y"}},
       {{"nx"}, "Neg", {"x"}, {}, {"dz"}},
       {{"y2"}, "Square", {"y"}, {}, {"dz"}},
-      {{"nx_y2"}, "UnsafeDiv", {"nx", "y2"}},
+      {{"nx_y2"}, "DivNoNan", {"nx", "y2"}},
       {{"gy"}, "Mul", {"dz", "nx_y2"}},  // dz * (- x / y^2)
   });
   // clang-format on
 }
-REGISTER_OP_GRADIENT("UnsafeDiv", UnsafeDivGrad);
+REGISTER_OP_GRADIENT("DivNoNan", DivNoNanGrad);
 
 Status PowGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index b0d1595c31..5ee79809ac 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -753,14 +753,14 @@ TEST_F(MathGradTest, Div) {
   }
 }
 
-TEST_F(MathGradTest, UnsafeDiv) {
+TEST_F(MathGradTest, DivNoNan) {
   auto x = test::AsTensor<float>(
       {0.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 0.f}, TensorShape({3, 3}));
   auto y = test::AsTensor<float>({-10.f, 0.f, 10.f}, TensorShape({3, 1}));
   Tensor dx;
   Tensor dy;
   {
-    SymGrad("UnsafeDiv", x, y, &dx, &dy);
+    SymGrad("DivNoNan", x, y, &dx, &dy);
     {
       auto g = [](float x, float y) {
         if (y == 0.f) {
@@ -792,7 +792,7 @@ TEST_F(MathGradTest, UnsafeDiv) {
     }
   }
   {  // Swap x and y.
-    SymGrad("UnsafeDiv", y, x, &dy, &dx);
+    SymGrad("DivNoNan", y, x, &dy, &dx);
     {
       auto g = [](float x, float y) {
         if (y == 0.f) {
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 49646f1f3a..717263a9b0 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -392,8 +392,11 @@ Returns x * y element-wise.
 REGISTER_OP("Div").BINARY_MORE().SetShapeFn(
     shape_inference::BroadcastBinaryOpShapeFn);
 
-REGISTER_OP("UnsafeDiv")
-    .BINARY_MORE()
+REGISTER_OP("DivNoNan")
+    .Input("x: T")
+    .Input("y: T")
+    .Output("z: T")
+    .Attr("T: {float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("FloorDiv")
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index ebeb048157..be4c3ed2b6 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -121,7 +121,7 @@ TEST(MathOpsTest, BroadcastBinaryOps_ShapeFn) {
                               "Mod",        "Mul",
                               "NotEqual",   "Pow",
                               "Sub",        "SquaredDifference",
-                              "UnsafeDiv"}) {
+                              "DivNoNan"}) {
     ShapeInferenceTestOp op(op_name);
     INFER_OK(op, "?;?", "?");
     INFER_OK(op, "[1,2];?", "?");
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index e0f25fb4ef..94476acd4b 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1009,6 +1009,7 @@ REGISTER_OP("SeluGrad")
     .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
+// TODO(b/111515541): change T to {half, bfloat16, float, double}
 REGISTER_OP("Softplus")
     .Input("features: T")
     .Output("activations: T")
@@ -1022,6 +1023,7 @@ REGISTER_OP("SoftplusGrad")
     .Attr("T: realnumbertype")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
+// TODO(b/111515541): change T to {half, bfloat16, float, double}
 REGISTER_OP("Softsign")
     .Input("features: T")
     .Output("activations: T")
@@ -1736,6 +1738,87 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("_MklConv3D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int) >= 5")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv3DShape)
+    .Doc(R"doc(
+MKL version of Conv3D operator. Uses MKL DNN APIs to perform 3D convolution.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklConv3DBackpropInputV2")
+    .Input("input_sizes: Tshape")
+    .Input("filter: T")
+    .Input("out_backprop: T")
+    .Input("mkl_input_sizes: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int) >= 5")
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .Attr("Tshape: {int32, int64} = DT_INT32")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnet3dDataFormatAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 5, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of Convolution3D backward input. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the input.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklConv3DBackpropFilterV2")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter_size: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 5, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of Conv3DBackpropFilter. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the filter.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklRelu")
     .Input("features: T")
     .Input("mkl_features: uint8")
@@ -2161,7 +2244,7 @@ REGISTER_OP("_MklToTf")
     .Input("mkl_input: uint8")
     .Output("output: T")
     .Attr("T: {half, float, double}")
-    .Attr(GetConvnetDataFormatAttrString())
+    .Attr(GetConvnetDataFormat2D3DAttrString())
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to convert a tensor from MKL layout to TensorFlow layout.
@@ -2183,7 +2266,7 @@ REGISTER_OP("_MklInputConversion")
     .Attr(
         "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, "
         "complex64, complex128}")
-    .Attr(GetConvnetDataFormatAttrString())
+    .Attr(GetConvnetDataFormat2D3DAttrString())
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to process the inputs to an elementwise MKL op. Both inputs
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 1fda569b8e..9091622f09 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -9190,6 +9190,31 @@ op {
   }
 }
 op {
+  name: "DivNoNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
   name: "DrawBoundingBoxes"
   input_arg {
     name: "images"
@@ -12257,6 +12282,21 @@ op {
   }
 }
 op {
+  name: "HostConst"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
   name: "IFFT"
   input_arg {
     name: "input"
@@ -15006,6 +15046,10 @@ op {
       }
     }
   }
+  deprecation {
+    version: 27
+    explanation: "Use Python implementation tf.linalg.matrix_exponential instead."
+  }
 }
 op {
   name: "MatrixInverse"
@@ -18342,6 +18386,76 @@ op {
   }
 }
 op {
+  name: "ParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
   name: "ParseSingleExample"
   input_arg {
     name: "serialized"
@@ -31805,6 +31919,32 @@ op {
   }
 }
 op {
+  name: "StaticRegexReplace"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "pattern"
+    type: "string"
+  }
+  attr {
+    name: "rewrite"
+    type: "string"
+  }
+  attr {
+    name: "replace_global"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
   name: "StatsAggregatorHandle"
   output_arg {
     name: "handle"
@@ -32105,6 +32245,17 @@ op {
   }
 }
 op {
+  name: "StringLength"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+}
+op {
   name: "StringSplit"
   input_arg {
     name: "input"
@@ -34934,41 +35085,6 @@ op {
   }
 }
 op {
-  name: "UnsafeDiv"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
   name: "UnsortedSegmentMax"
   input_arg {
     name: "data"
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 8c39d69157..7aa1e71809 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -37,6 +37,14 @@ REGISTER_OP("RegexReplace")
       return Status::OK();
     });
 
+REGISTER_OP("StaticRegexReplace")
+    .Input("input: string")
+    .Attr("pattern: string")
+    .Attr("rewrite: string")
+    .Output("output: string")
+    .Attr("replace_global: bool = true")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 REGISTER_OP("RegexFullMatch")
     .Input("input: string")
     .Input("pattern: string")
@@ -159,6 +167,11 @@ REGISTER_OP("StringStrip")
     .Output("output: string")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("StringLength")
+    .Input("input: string")
+    .Output("output: int32")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 REGISTER_OP("EncodeBase64")
     .Input("input: string")
     .Output("output: string")
diff --git a/tensorflow/core/platform/abi.h b/tensorflow/core/platform/abi.h
index 763d467457..591e83b0c4 100644
--- a/tensorflow/core/platform/abi.h
+++ b/tensorflow/core/platform/abi.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_ABI_H_
-#define TENSORFLOW_PLATFORM_ABI_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_ABI_H_
+#define TENSORFLOW_CORE_PLATFORM_ABI_H_
 
 #include <string>
 
@@ -26,4 +26,4 @@ std::string MaybeAbiDemangle(const char* name);
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_ABI_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_ABI_H_
diff --git a/tensorflow/core/platform/cloud/auth_provider.h b/tensorflow/core/platform/cloud/auth_provider.h
index 465ff248d9..7347bc626d 100644
--- a/tensorflow/core/platform/cloud/auth_provider.h
+++ b/tensorflow/core/platform/cloud/auth_provider.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_AUTH_PROVIDER_H_
-#define TENSORFLOW_CORE_PLATFORM_AUTH_PROVIDER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_AUTH_PROVIDER_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_AUTH_PROVIDER_H_
 
 #include <string>
 #include "tensorflow/core/lib/core/errors.h"
@@ -51,4 +51,4 @@ class EmptyAuthProvider : public AuthProvider {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_AUTH_PROVIDER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_AUTH_PROVIDER_H_
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.h b/tensorflow/core/platform/cloud/gcs_dns_cache.h
index 40f16f1044..07d0e59fd5 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.h
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
-#define TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_DNS_CACHE_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_DNS_CACHE_H_
 
 #include <random>
 
@@ -74,4 +74,4 @@ class GcsDnsCache {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_DNS_CACHE_H_
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.h b/tensorflow/core/platform/cloud/google_auth_provider.h
index 58a785fd60..3755b124a8 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.h
+++ b/tensorflow/core/platform/cloud/google_auth_provider.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_GOOGLE_AUTH_PROVIDER_H_
-#define TENSORFLOW_CORE_PLATFORM_GOOGLE_AUTH_PROVIDER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GOOGLE_AUTH_PROVIDER_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_GOOGLE_AUTH_PROVIDER_H_
 
 #include <memory>
 #include "tensorflow/core/platform/cloud/auth_provider.h"
@@ -65,4 +65,4 @@ class GoogleAuthProvider : public AuthProvider {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_GOOGLE_AUTH_PROVIDER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_GOOGLE_AUTH_PROVIDER_H_
diff --git a/tensorflow/core/platform/cloud/http_request.h b/tensorflow/core/platform/cloud/http_request.h
index 2343bca608..e925eefb1f 100644
--- a/tensorflow/core/platform/cloud/http_request.h
+++ b/tensorflow/core/platform/cloud/http_request.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_HTTP_REQUEST_H_
-#define TENSORFLOW_CORE_PLATFORM_HTTP_REQUEST_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_H_
 
 #include <string>
 #include <unordered_map>
@@ -188,4 +188,4 @@ class HttpRequest {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_HTTP_REQUEST_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_H_
diff --git a/tensorflow/core/platform/cloud/http_request_fake.h b/tensorflow/core/platform/cloud/http_request_fake.h
index 7711eaceb2..0a1164b64a 100644
--- a/tensorflow/core/platform/cloud/http_request_fake.h
+++ b/tensorflow/core/platform/cloud/http_request_fake.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_HTTP_REQUEST_FAKE_H_
-#define TENSORFLOW_CORE_PLATFORM_HTTP_REQUEST_FAKE_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_FAKE_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_FAKE_H_
 
 #include <algorithm>
 #include <fstream>
@@ -212,4 +212,4 @@ class FakeHttpRequestFactory : public HttpRequest::Factory {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_HTTP_REQUEST_FAKE_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_HTTP_REQUEST_FAKE_H_
diff --git a/tensorflow/core/platform/context.h b/tensorflow/core/platform/context.h
index 728ef91631..9f7beb7a68 100644
--- a/tensorflow/core/platform/context.h
+++ b/tensorflow/core/platform/context.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_CONTEXT_H_
-#define TENSORFLOW_PLATFORM_CONTEXT_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CONTEXT_H_
+#define TENSORFLOW_CORE_PLATFORM_CONTEXT_H_
 
 namespace tensorflow {
 
@@ -42,4 +42,4 @@ class WithContext;
 #include "tensorflow/core/platform/default/context.h"
 #endif
 
-#endif  // TENSORFLOW_PLATFORM_CONTEXT_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CONTEXT_H_
diff --git a/tensorflow/core/platform/cpu_feature_guard.h b/tensorflow/core/platform/cpu_feature_guard.h
index 586a6be55e..3d7bfe95b1 100644
--- a/tensorflow/core/platform/cpu_feature_guard.h
+++ b/tensorflow/core/platform/cpu_feature_guard.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_CPU_FEATURE_GUARD_H_
-#define TENSORFLOW_PLATFORM_CPU_FEATURE_GUARD_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CPU_FEATURE_GUARD_H_
+#define TENSORFLOW_CORE_PLATFORM_CPU_FEATURE_GUARD_H_
 
 namespace tensorflow {
 namespace port {
@@ -29,4 +29,4 @@ void InfoAboutUnusedCPUFeatures();
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_CPU_FEATURE_GUARD_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CPU_FEATURE_GUARD_H_
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index 175c9ae8b1..6eba83224a 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_CPU_INFO_H_
-#define TENSORFLOW_PLATFORM_CPU_INFO_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CPU_INFO_H_
+#define TENSORFLOW_CORE_PLATFORM_CPU_INFO_H_
 
 #include <string>
 
@@ -117,4 +117,4 @@ int CPUIDNumSMT();
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_CPU_INFO_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CPU_INFO_H_
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index fb4ee1c33c..6a4ff9a1cb 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -8,224 +8,229 @@ load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
+    "if_mkl_ml",
 )
 
 # Appends a suffix to a list of deps.
 def tf_deps(deps, suffix):
-  tf_deps = []
+    tf_deps = []
 
-  # If the package name is in shorthand form (ie: does not contain a ':'),
-  # expand it to the full name.
-  for dep in deps:
-    tf_dep = dep
+    # If the package name is in shorthand form (ie: does not contain a ':'),
+    # expand it to the full name.
+    for dep in deps:
+        tf_dep = dep
 
-    if not ":" in dep:
-      dep_pieces = dep.split("/")
-      tf_dep += ":" + dep_pieces[len(dep_pieces) - 1]
+        if not ":" in dep:
+            dep_pieces = dep.split("/")
+            tf_dep += ":" + dep_pieces[len(dep_pieces) - 1]
 
-    tf_deps += [tf_dep + suffix]
+        tf_deps += [tf_dep + suffix]
 
-  return tf_deps
+    return tf_deps
 
 # Modified from @cython//:Tools/rules.bzl
 def pyx_library(
-    name,
-    deps=[],
-    py_deps=[],
-    srcs=[],
-    **kwargs):
-  """Compiles a group of .pyx / .pxd / .py files.
-
-  First runs Cython to create .cpp files for each input .pyx or .py + .pxd
-  pair. Then builds a shared object for each, passing "deps" to each cc_binary
-  rule (includes Python headers by default). Finally, creates a py_library rule
-  with the shared objects and any pure Python "srcs", with py_deps as its
-  dependencies; the shared objects can be imported like normal Python files.
-
-  Args:
-    name: Name for the rule.
-    deps: C/C++ dependencies of the Cython (e.g. Numpy headers).
-    py_deps: Pure Python dependencies of the final library.
-    srcs: .py, .pyx, or .pxd files to either compile or pass through.
-    **kwargs: Extra keyword arguments passed to the py_library.
-  """
-  # First filter out files that should be run compiled vs. passed through.
-  py_srcs = []
-  pyx_srcs = []
-  pxd_srcs = []
-  for src in srcs:
-    if src.endswith(".pyx") or (src.endswith(".py")
-                                and src[:-3] + ".pxd" in srcs):
-      pyx_srcs.append(src)
-    elif src.endswith(".py"):
-      py_srcs.append(src)
-    else:
-      pxd_srcs.append(src)
-    if src.endswith("__init__.py"):
-      pxd_srcs.append(src)
-
-  # Invoke cython to produce the shared object libraries.
-  for filename in pyx_srcs:
-    native.genrule(
-        name = filename + "_cython_translation",
-        srcs = [filename],
-        outs = [filename.split(".")[0] + ".cpp"],
-        # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3
-        # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH.
-        cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)",
-        tools = ["@cython//:cython_binary"] + pxd_srcs,
+        name,
+        deps = [],
+        py_deps = [],
+        srcs = [],
+        **kwargs):
+    """Compiles a group of .pyx / .pxd / .py files.
+
+    First runs Cython to create .cpp files for each input .pyx or .py + .pxd
+    pair. Then builds a shared object for each, passing "deps" to each cc_binary
+    rule (includes Python headers by default). Finally, creates a py_library rule
+    with the shared objects and any pure Python "srcs", with py_deps as its
+    dependencies; the shared objects can be imported like normal Python files.
+
+    Args:
+      name: Name for the rule.
+      deps: C/C++ dependencies of the Cython (e.g. Numpy headers).
+      py_deps: Pure Python dependencies of the final library.
+      srcs: .py, .pyx, or .pxd files to either compile or pass through.
+      **kwargs: Extra keyword arguments passed to the py_library.
+    """
+
+    # First filter out files that should be run compiled vs. passed through.
+    py_srcs = []
+    pyx_srcs = []
+    pxd_srcs = []
+    for src in srcs:
+        if src.endswith(".pyx") or (src.endswith(".py") and
+                                    src[:-3] + ".pxd" in srcs):
+            pyx_srcs.append(src)
+        elif src.endswith(".py"):
+            py_srcs.append(src)
+        else:
+            pxd_srcs.append(src)
+        if src.endswith("__init__.py"):
+            pxd_srcs.append(src)
+
+    # Invoke cython to produce the shared object libraries.
+    for filename in pyx_srcs:
+        native.genrule(
+            name = filename + "_cython_translation",
+            srcs = [filename],
+            outs = [filename.split(".")[0] + ".cpp"],
+            # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3
+            # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH.
+            cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)",
+            tools = ["@cython//:cython_binary"] + pxd_srcs,
+        )
+
+    shared_objects = []
+    for src in pyx_srcs:
+        stem = src.split(".")[0]
+        shared_object_name = stem + ".so"
+        native.cc_binary(
+            name = shared_object_name,
+            srcs = [stem + ".cpp"],
+            deps = deps + ["//third_party/python_runtime:headers"],
+            linkshared = 1,
+        )
+        shared_objects.append(shared_object_name)
+
+    # Now create a py_library with these shared objects as data.
+    native.py_library(
+        name = name,
+        srcs = py_srcs,
+        deps = py_deps,
+        srcs_version = "PY2AND3",
+        data = shared_objects,
+        **kwargs
     )
 
-  shared_objects = []
-  for src in pyx_srcs:
-    stem = src.split(".")[0]
-    shared_object_name = stem + ".so"
-    native.cc_binary(
-        name=shared_object_name,
-        srcs=[stem + ".cpp"],
-        deps=deps + ["//third_party/python_runtime:headers"],
-        linkshared = 1,
-    )
-    shared_objects.append(shared_object_name)
-
-  # Now create a py_library with these shared objects as data.
-  native.py_library(
-      name=name,
-      srcs=py_srcs,
-      deps=py_deps,
-      srcs_version = "PY2AND3",
-      data=shared_objects,
-      **kwargs
-  )
-
-def _proto_cc_hdrs(srcs, use_grpc_plugin=False):
-  ret = [s[:-len(".proto")] + ".pb.h" for s in srcs]
-  if use_grpc_plugin:
-    ret += [s[:-len(".proto")] + ".grpc.pb.h" for s in srcs]
-  return ret
-
-def _proto_cc_srcs(srcs, use_grpc_plugin=False):
-  ret = [s[:-len(".proto")] + ".pb.cc" for s in srcs]
-  if use_grpc_plugin:
-    ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
-  return ret
-
-def _proto_py_outs(srcs, use_grpc_plugin=False):
-  ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
-  if use_grpc_plugin:
-    ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
-  return ret
+def _proto_cc_hdrs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + ".pb.h" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + ".grpc.pb.h" for s in srcs]
+    return ret
+
+def _proto_cc_srcs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + ".pb.cc" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
+    return ret
+
+def _proto_py_outs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+    return ret
 
 # Re-defined protocol buffer rule to allow building "header only" protocol
 # buffers, to avoid duplicate registrations. Also allows non-iterable cc_libs
 # containing select() statements.
 def cc_proto_library(
-    name,
-    srcs=[],
-    deps=[],
-    cc_libs=[],
-    include=None,
-    protoc="@protobuf_archive//:protoc",
-    internal_bootstrap_hack=False,
-    use_grpc_plugin=False,
-    use_grpc_namespace=False,
-    default_header=False,
-    **kargs):
-  """Bazel rule to create a C++ protobuf library from proto source files.
-
-  Args:
-    name: the name of the cc_proto_library.
-    srcs: the .proto files of the cc_proto_library.
-    deps: a list of dependency labels; must be cc_proto_library.
-    cc_libs: a list of other cc_library targets depended by the generated
-        cc_library.
-    include: a string indicating the include path of the .proto files.
-    protoc: the label of the protocol compiler to generate the sources.
-    internal_bootstrap_hack: a flag indicate the cc_proto_library is used only
-        for bootstraping. When it is set to True, no files will be generated.
-        The rule will simply be a provider for .proto files, so that other
-        cc_proto_library can depend on it.
-    use_grpc_plugin: a flag to indicate whether to call the grpc C++ plugin
-        when processing the proto files.
-    default_header: Controls the naming of generated rules. If True, the `name`
-        rule will be header-only, and an _impl rule will contain the
-        implementation. Otherwise the header-only rule (name + "_headers_only")
-        must be referred to explicitly.
-    **kargs: other keyword arguments that are passed to cc_library.
-  """
-
-  includes = []
-  if include != None:
-    includes = [include]
-
-  if internal_bootstrap_hack:
-    # For pre-checked-in generated files, we add the internal_bootstrap_hack
-    # which will skip the codegen action.
+        name,
+        srcs = [],
+        deps = [],
+        cc_libs = [],
+        include = None,
+        protoc = "@protobuf_archive//:protoc",
+        internal_bootstrap_hack = False,
+        use_grpc_plugin = False,
+        use_grpc_namespace = False,
+        default_header = False,
+        **kargs):
+    """Bazel rule to create a C++ protobuf library from proto source files.
+
+    Args:
+      name: the name of the cc_proto_library.
+      srcs: the .proto files of the cc_proto_library.
+      deps: a list of dependency labels; must be cc_proto_library.
+      cc_libs: a list of other cc_library targets depended by the generated
+          cc_library.
+      include: a string indicating the include path of the .proto files.
+      protoc: the label of the protocol compiler to generate the sources.
+      internal_bootstrap_hack: a flag indicate the cc_proto_library is used only
+          for bootstraping. When it is set to True, no files will be generated.
+          The rule will simply be a provider for .proto files, so that other
+          cc_proto_library can depend on it.
+      use_grpc_plugin: a flag to indicate whether to call the grpc C++ plugin
+          when processing the proto files.
+      default_header: Controls the naming of generated rules. If True, the `name`
+          rule will be header-only, and an _impl rule will contain the
+          implementation. Otherwise the header-only rule (name + "_headers_only")
+          must be referred to explicitly.
+      **kargs: other keyword arguments that are passed to cc_library.
+    """
+
+    includes = []
+    if include != None:
+        includes = [include]
+
+    if internal_bootstrap_hack:
+        # For pre-checked-in generated files, we add the internal_bootstrap_hack
+        # which will skip the codegen action.
+        proto_gen(
+            name = name + "_genproto",
+            srcs = srcs,
+            deps = [s + "_genproto" for s in deps],
+            includes = includes,
+            protoc = protoc,
+            visibility = ["//visibility:public"],
+        )
+
+        # An empty cc_library to make rule dependency consistent.
+        native.cc_library(
+            name = name,
+            **kargs
+        )
+        return
+
+    grpc_cpp_plugin = None
+    plugin_options = []
+    if use_grpc_plugin:
+        grpc_cpp_plugin = "//external:grpc_cpp_plugin"
+        if use_grpc_namespace:
+            plugin_options = ["services_namespace=grpc"]
+
+    gen_srcs = _proto_cc_srcs(srcs, use_grpc_plugin)
+    gen_hdrs = _proto_cc_hdrs(srcs, use_grpc_plugin)
+    outs = gen_srcs + gen_hdrs
+
     proto_gen(
-        name=name + "_genproto",
-        srcs=srcs,
-        deps=[s + "_genproto" for s in deps],
-        includes=includes,
-        protoc=protoc,
-        visibility=["//visibility:public"],
+        name = name + "_genproto",
+        srcs = srcs,
+        deps = [s + "_genproto" for s in deps],
+        includes = includes,
+        protoc = protoc,
+        plugin = grpc_cpp_plugin,
+        plugin_language = "grpc",
+        plugin_options = plugin_options,
+        gen_cc = 1,
+        outs = outs,
+        visibility = ["//visibility:public"],
     )
-    # An empty cc_library to make rule dependency consistent.
-    native.cc_library(
-        name=name,
-        **kargs)
-    return
-
-  grpc_cpp_plugin = None
-  plugin_options = []
-  if use_grpc_plugin:
-    grpc_cpp_plugin = "//external:grpc_cpp_plugin"
-    if use_grpc_namespace:
-      plugin_options = ["services_namespace=grpc"]
-
-  gen_srcs = _proto_cc_srcs(srcs, use_grpc_plugin)
-  gen_hdrs = _proto_cc_hdrs(srcs, use_grpc_plugin)
-  outs = gen_srcs + gen_hdrs
-
-  proto_gen(
-      name=name + "_genproto",
-      srcs=srcs,
-      deps=[s + "_genproto" for s in deps],
-      includes=includes,
-      protoc=protoc,
-      plugin=grpc_cpp_plugin,
-      plugin_language="grpc",
-      plugin_options=plugin_options,
-      gen_cc=1,
-      outs=outs,
-      visibility=["//visibility:public"],
-  )
-
-  if use_grpc_plugin:
-    cc_libs += select({
-        "//tensorflow:linux_s390x": ["//external:grpc_lib_unsecure"],
-        "//conditions:default": ["//external:grpc_lib"],
-    })
 
-  if default_header:
-    header_only_name = name
-    impl_name = name + "_impl"
-  else:
-    header_only_name = name + "_headers_only"
-    impl_name = name
-
-  native.cc_library(
-      name=impl_name,
-      srcs=gen_srcs,
-      hdrs=gen_hdrs,
-      deps=cc_libs + deps,
-      includes=includes,
-      **kargs)
-  native.cc_library(
-      name=header_only_name,
-      deps=["@protobuf_archive//:protobuf_headers"] + if_static([impl_name]),
-      hdrs=gen_hdrs,
-      **kargs)
+    if use_grpc_plugin:
+        cc_libs += select({
+            "//tensorflow:linux_s390x": ["//external:grpc_lib_unsecure"],
+            "//conditions:default": ["//external:grpc_lib"],
+        })
+
+    if default_header:
+        header_only_name = name
+        impl_name = name + "_impl"
+    else:
+        header_only_name = name + "_headers_only"
+        impl_name = name
+
+    native.cc_library(
+        name = impl_name,
+        srcs = gen_srcs,
+        hdrs = gen_hdrs,
+        deps = cc_libs + deps,
+        includes = includes,
+        **kargs
+    )
+    native.cc_library(
+        name = header_only_name,
+        deps = ["@protobuf_archive//:protobuf_headers"] + if_static([impl_name]),
+        hdrs = gen_hdrs,
+        **kargs
+    )
 
 # Re-defined protocol buffer rule to bring in the change introduced in commit
 # https://github.com/google/protobuf/commit/294b5758c373cbab4b72f35f4cb62dc1d8332b68
@@ -234,474 +239,512 @@ def cc_proto_library(
 # to include the above commit.
 def py_proto_library(
         name,
-        srcs=[],
-        deps=[],
-        py_libs=[],
-        py_extra_srcs=[],
-        include=None,
-        default_runtime="@protobuf_archive//:protobuf_python",
-        protoc="@protobuf_archive//:protoc",
-        use_grpc_plugin=False,
+        srcs = [],
+        deps = [],
+        py_libs = [],
+        py_extra_srcs = [],
+        include = None,
+        default_runtime = "@protobuf_archive//:protobuf_python",
+        protoc = "@protobuf_archive//:protoc",
+        use_grpc_plugin = False,
         **kargs):
-  """Bazel rule to create a Python protobuf library from proto source files
-
-  NOTE: the rule is only an internal workaround to generate protos. The
-  interface may change and the rule may be removed when bazel has introduced
-  the native rule.
-
-  Args:
-    name: the name of the py_proto_library.
-    srcs: the .proto files of the py_proto_library.
-    deps: a list of dependency labels; must be py_proto_library.
-    py_libs: a list of other py_library targets depended by the generated
-        py_library.
-    py_extra_srcs: extra source files that will be added to the output
-        py_library. This attribute is used for internal bootstrapping.
-    include: a string indicating the include path of the .proto files.
-    default_runtime: the implicitly default runtime which will be depended on by
-        the generated py_library target.
-    protoc: the label of the protocol compiler to generate the sources.
-    use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
-        when processing the proto files.
-    **kargs: other keyword arguments that are passed to cc_library.
-  """
-  outs = _proto_py_outs(srcs, use_grpc_plugin)
-
-  includes = []
-  if include != None:
-    includes = [include]
-
-  grpc_python_plugin = None
-  if use_grpc_plugin:
-    grpc_python_plugin = "//external:grpc_python_plugin"
-    # Note: Generated grpc code depends on Python grpc module. This dependency
-    # is not explicitly listed in py_libs. Instead, host system is assumed to
-    # have grpc installed.
-
-  proto_gen(
-      name=name + "_genproto",
-      srcs=srcs,
-      deps=[s + "_genproto" for s in deps],
-      includes=includes,
-      protoc=protoc,
-      gen_py=1,
-      outs=outs,
-      visibility=["//visibility:public"],
-      plugin=grpc_python_plugin,
-      plugin_language="grpc"
-  )
-
-  if default_runtime and not default_runtime in py_libs + deps:
-    py_libs = py_libs + [default_runtime]
-
-  native.py_library(
-      name=name,
-      srcs=outs+py_extra_srcs,
-      deps=py_libs+deps,
-      imports=includes,
-      **kargs)
-
-def tf_proto_library_cc(name, srcs = [], has_services = None,
-                        protodeps = [],
-                        visibility = [], testonly = 0,
-                        cc_libs = [],
-                        cc_stubby_versions = None,
-                        cc_grpc_version = None,
-                        j2objc_api_version = 1,
-                        cc_api_version = 2,
-                        dart_api_version = 2,
-                        java_api_version = 2, py_api_version = 2,
-                        js_api_version = 2, js_codegen = "jspb",
-                        default_header = False):
-  js_codegen = js_codegen  # unused argument
-  js_api_version = js_api_version  # unused argument
-  native.filegroup(
-      name = name + "_proto_srcs",
-      srcs = srcs + tf_deps(protodeps, "_proto_srcs"),
-      testonly = testonly,
-      visibility = visibility,
-  )
-
-  use_grpc_plugin = None
-  if cc_grpc_version:
-    use_grpc_plugin = True
-
-  cc_deps = tf_deps(protodeps, "_cc")
-  cc_name = name + "_cc"
-  if not srcs:
-    # This is a collection of sub-libraries. Build header-only and impl
-    # libraries containing all the sources.
+    """Bazel rule to create a Python protobuf library from proto source files
+
+    NOTE: the rule is only an internal workaround to generate protos. The
+    interface may change and the rule may be removed when bazel has introduced
+    the native rule.
+
+    Args:
+      name: the name of the py_proto_library.
+      srcs: the .proto files of the py_proto_library.
+      deps: a list of dependency labels; must be py_proto_library.
+      py_libs: a list of other py_library targets depended by the generated
+          py_library.
+      py_extra_srcs: extra source files that will be added to the output
+          py_library. This attribute is used for internal bootstrapping.
+      include: a string indicating the include path of the .proto files.
+      default_runtime: the implicitly default runtime which will be depended on by
+          the generated py_library target.
+      protoc: the label of the protocol compiler to generate the sources.
+      use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
+          when processing the proto files.
+      **kargs: other keyword arguments that are passed to cc_library.
+    """
+    outs = _proto_py_outs(srcs, use_grpc_plugin)
+
+    includes = []
+    if include != None:
+        includes = [include]
+
+    grpc_python_plugin = None
+    if use_grpc_plugin:
+        grpc_python_plugin = "//external:grpc_python_plugin"
+        # Note: Generated grpc code depends on Python grpc module. This dependency
+        # is not explicitly listed in py_libs. Instead, host system is assumed to
+        # have grpc installed.
+
     proto_gen(
-        name = cc_name + "_genproto",
-        deps = [s + "_genproto" for s in cc_deps],
-        protoc = "@protobuf_archive//:protoc",
-        visibility=["//visibility:public"],
+        name = name + "_genproto",
+        srcs = srcs,
+        deps = [s + "_genproto" for s in deps],
+        includes = includes,
+        protoc = protoc,
+        gen_py = 1,
+        outs = outs,
+        visibility = ["//visibility:public"],
+        plugin = grpc_python_plugin,
+        plugin_language = "grpc",
     )
-    native.cc_library(
-        name = cc_name,
-        deps = cc_deps + ["@protobuf_archive//:protobuf_headers"] +
-               if_static([name + "_cc_impl"]),
+
+    if default_runtime and not default_runtime in py_libs + deps:
+        py_libs = py_libs + [default_runtime]
+
+    native.py_library(
+        name = name,
+        srcs = outs + py_extra_srcs,
+        deps = py_libs + deps,
+        imports = includes,
+        **kargs
+    )
+
+def tf_proto_library_cc(
+        name,
+        srcs = [],
+        has_services = None,
+        protodeps = [],
+        visibility = [],
+        testonly = 0,
+        cc_libs = [],
+        cc_stubby_versions = None,
+        cc_grpc_version = None,
+        j2objc_api_version = 1,
+        cc_api_version = 2,
+        dart_api_version = 2,
+        java_api_version = 2,
+        py_api_version = 2,
+        js_api_version = 2,
+        js_codegen = "jspb",
+        default_header = False):
+    js_codegen = js_codegen  # unused argument
+    js_api_version = js_api_version  # unused argument
+    native.filegroup(
+        name = name + "_proto_srcs",
+        srcs = srcs + tf_deps(protodeps, "_proto_srcs"),
         testonly = testonly,
         visibility = visibility,
     )
-    native.cc_library(
-        name = cc_name + "_impl",
-        deps = [s + "_impl" for s in cc_deps] + ["@protobuf_archive//:cc_wkt_protos"],
-    )
 
-    return
-
-  cc_proto_library(
-      name = cc_name,
-      srcs = srcs,
-      deps = cc_deps + ["@protobuf_archive//:cc_wkt_protos"],
-      cc_libs = cc_libs + if_static(
-          ["@protobuf_archive//:protobuf"],
-          ["@protobuf_archive//:protobuf_headers"]
-      ),
-      copts = if_not_windows([
-          "-Wno-unknown-warning-option",
-          "-Wno-unused-but-set-variable",
-          "-Wno-sign-compare",
-      ]),
-      protoc = "@protobuf_archive//:protoc",
-      use_grpc_plugin = use_grpc_plugin,
-      testonly = testonly,
-      visibility = visibility,
-      default_header = default_header,
-  )
-
-def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[],
-                        testonly=0, srcs_version="PY2AND3", use_grpc_plugin=False):
-  py_deps = tf_deps(protodeps, "_py")
-  py_name = name + "_py"
-  if not srcs:
-    # This is a collection of sub-libraries. Build header-only and impl
-    # libraries containing all the sources.
-    proto_gen(
-        name = py_name + "_genproto",
-        deps = [s + "_genproto" for s in py_deps],
+    use_grpc_plugin = None
+    if cc_grpc_version:
+        use_grpc_plugin = True
+
+    cc_deps = tf_deps(protodeps, "_cc")
+    cc_name = name + "_cc"
+    if not srcs:
+        # This is a collection of sub-libraries. Build header-only and impl
+        # libraries containing all the sources.
+        proto_gen(
+            name = cc_name + "_genproto",
+            deps = [s + "_genproto" for s in cc_deps],
+            protoc = "@protobuf_archive//:protoc",
+            visibility = ["//visibility:public"],
+        )
+        native.cc_library(
+            name = cc_name,
+            deps = cc_deps + ["@protobuf_archive//:protobuf_headers"] +
+                   if_static([name + "_cc_impl"]),
+            testonly = testonly,
+            visibility = visibility,
+        )
+        native.cc_library(
+            name = cc_name + "_impl",
+            deps = [s + "_impl" for s in cc_deps] + ["@protobuf_archive//:cc_wkt_protos"],
+        )
+
+        return
+
+    cc_proto_library(
+        name = cc_name,
+        srcs = srcs,
+        deps = cc_deps + ["@protobuf_archive//:cc_wkt_protos"],
+        cc_libs = cc_libs + if_static(
+            ["@protobuf_archive//:protobuf"],
+            ["@protobuf_archive//:protobuf_headers"],
+        ),
+        copts = if_not_windows([
+            "-Wno-unknown-warning-option",
+            "-Wno-unused-but-set-variable",
+            "-Wno-sign-compare",
+        ]),
         protoc = "@protobuf_archive//:protoc",
-        visibility=["//visibility:public"],
+        use_grpc_plugin = use_grpc_plugin,
+        testonly = testonly,
+        visibility = visibility,
+        default_header = default_header,
     )
-    native.py_library(
+
+def tf_proto_library_py(
+        name,
+        srcs = [],
+        protodeps = [],
+        deps = [],
+        visibility = [],
+        testonly = 0,
+        srcs_version = "PY2AND3",
+        use_grpc_plugin = False):
+    py_deps = tf_deps(protodeps, "_py")
+    py_name = name + "_py"
+    if not srcs:
+        # This is a collection of sub-libraries. Build header-only and impl
+        # libraries containing all the sources.
+        proto_gen(
+            name = py_name + "_genproto",
+            deps = [s + "_genproto" for s in py_deps],
+            protoc = "@protobuf_archive//:protoc",
+            visibility = ["//visibility:public"],
+        )
+        native.py_library(
+            name = py_name,
+            deps = py_deps + ["@protobuf_archive//:protobuf_python"],
+            testonly = testonly,
+            visibility = visibility,
+        )
+        return
+
+    py_proto_library(
         name = py_name,
-        deps = py_deps + ["@protobuf_archive//:protobuf_python"],
-        testonly = testonly,
+        srcs = srcs,
+        srcs_version = srcs_version,
+        deps = deps + py_deps + ["@protobuf_archive//:protobuf_python"],
+        protoc = "@protobuf_archive//:protoc",
+        default_runtime = "@protobuf_archive//:protobuf_python",
         visibility = visibility,
+        testonly = testonly,
+        use_grpc_plugin = use_grpc_plugin,
     )
-    return
-
-  py_proto_library(
-      name = py_name,
-      srcs = srcs,
-      srcs_version = srcs_version,
-      deps = deps + py_deps + ["@protobuf_archive//:protobuf_python"],
-      protoc = "@protobuf_archive//:protoc",
-      default_runtime = "@protobuf_archive//:protobuf_python",
-      visibility = visibility,
-      testonly = testonly,
-      use_grpc_plugin = use_grpc_plugin,
-  )
 
 def tf_jspb_proto_library(**kwargs):
-  pass
+    pass
 
 def tf_nano_proto_library(**kwargs):
-  pass
-
-def tf_proto_library(name, srcs = [], has_services = None,
-                     protodeps = [],
-                     visibility = [], testonly = 0,
-                     cc_libs = [],
-                     cc_api_version = 2, cc_grpc_version = None,
-                     dart_api_version = 2, j2objc_api_version = 1,
-                     java_api_version = 2, py_api_version = 2,
-                     js_api_version = 2, js_codegen = "jspb",
-                     provide_cc_alias = False,
-                     default_header = False):
-  """Make a proto library, possibly depending on other proto libraries."""
-  _ignore = (js_api_version, js_codegen, provide_cc_alias)
-
-  tf_proto_library_cc(
-      name = name,
-      srcs = srcs,
-      protodeps = protodeps,
-      cc_grpc_version = cc_grpc_version,
-      cc_libs = cc_libs,
-      testonly = testonly,
-      visibility = visibility,
-      default_header = default_header,
-  )
-
-  tf_proto_library_py(
-      name = name,
-      srcs = srcs,
-      protodeps = protodeps,
-      srcs_version = "PY2AND3",
-      testonly = testonly,
-      visibility = visibility,
-      use_grpc_plugin = has_services,
-  )
+    pass
+
+def tf_proto_library(
+        name,
+        srcs = [],
+        has_services = None,
+        protodeps = [],
+        visibility = [],
+        testonly = 0,
+        cc_libs = [],
+        cc_api_version = 2,
+        cc_grpc_version = None,
+        dart_api_version = 2,
+        j2objc_api_version = 1,
+        java_api_version = 2,
+        py_api_version = 2,
+        js_api_version = 2,
+        js_codegen = "jspb",
+        provide_cc_alias = False,
+        default_header = False):
+    """Make a proto library, possibly depending on other proto libraries."""
+    _ignore = (js_api_version, js_codegen, provide_cc_alias)
+
+    tf_proto_library_cc(
+        name = name,
+        srcs = srcs,
+        protodeps = protodeps,
+        cc_grpc_version = cc_grpc_version,
+        cc_libs = cc_libs,
+        testonly = testonly,
+        visibility = visibility,
+        default_header = default_header,
+    )
+
+    tf_proto_library_py(
+        name = name,
+        srcs = srcs,
+        protodeps = protodeps,
+        srcs_version = "PY2AND3",
+        testonly = testonly,
+        visibility = visibility,
+        use_grpc_plugin = has_services,
+    )
 
 # A list of all files under platform matching the pattern in 'files'. In
 # contrast with 'tf_platform_srcs' below, which seletive collects files that
 # must be compiled in the 'default' platform, this is a list of all headers
 # mentioned in the platform/* files.
 def tf_platform_hdrs(files):
-  return native.glob(["platform/*/" + f for f in files])
+    return native.glob(["platform/*/" + f for f in files])
 
 def tf_platform_srcs(files):
-  base_set = ["platform/default/" + f for f in files]
-  windows_set = base_set + ["platform/windows/" + f for f in files]
-  posix_set = base_set + ["platform/posix/" + f for f in files]
-
-  # Handle cases where we must also bring the posix file in. Usually, the list
-  # of files to build on windows builds is just all the stuff in the
-  # windows_set. However, in some cases the implementations in 'posix/' are
-  # just what is necessary and historically we choose to simply use the posix
-  # file instead of making a copy in 'windows'.
-  for f in files:
-    if f == "error.cc":
-      windows_set.append("platform/posix/" + f)
-
-  return select({
-    "//tensorflow:windows" : native.glob(windows_set),
-    "//conditions:default" : native.glob(posix_set),
-  })
+    base_set = ["platform/default/" + f for f in files]
+    windows_set = base_set + ["platform/windows/" + f for f in files]
+    posix_set = base_set + ["platform/posix/" + f for f in files]
+
+    # Handle cases where we must also bring the posix file in. Usually, the list
+    # of files to build on windows builds is just all the stuff in the
+    # windows_set. However, in some cases the implementations in 'posix/' are
+    # just what is necessary and historically we choose to simply use the posix
+    # file instead of making a copy in 'windows'.
+    for f in files:
+        if f == "error.cc":
+            windows_set.append("platform/posix/" + f)
+
+    return select({
+        "//tensorflow:windows": native.glob(windows_set),
+        "//conditions:default": native.glob(posix_set),
+    })
 
 def tf_additional_lib_hdrs(exclude = []):
-  windows_hdrs = native.glob([
-      "platform/default/*.h",
-      "platform/windows/*.h",
-      "platform/posix/error.h",
-  ], exclude = exclude)
-  return select({
-    "//tensorflow:windows" : windows_hdrs,
-    "//conditions:default" : native.glob([
+    windows_hdrs = native.glob([
         "platform/default/*.h",
-        "platform/posix/*.h",
-      ], exclude = exclude),
-  })
+        "platform/windows/*.h",
+        "platform/posix/error.h",
+    ], exclude = exclude)
+    return select({
+        "//tensorflow:windows": windows_hdrs,
+        "//conditions:default": native.glob([
+            "platform/default/*.h",
+            "platform/posix/*.h",
+        ], exclude = exclude),
+    })
 
 def tf_additional_lib_srcs(exclude = []):
-  windows_srcs = native.glob([
-      "platform/default/*.cc",
-      "platform/windows/*.cc",
-      "platform/posix/error.cc",
-  ], exclude = exclude)
-  return select({
-    "//tensorflow:windows" : windows_srcs,
-    "//conditions:default" : native.glob([
+    windows_srcs = native.glob([
         "platform/default/*.cc",
-        "platform/posix/*.cc",
-      ], exclude = exclude),
-  })
+        "platform/windows/*.cc",
+        "platform/posix/error.cc",
+    ], exclude = exclude)
+    return select({
+        "//tensorflow:windows": windows_srcs,
+        "//conditions:default": native.glob([
+            "platform/default/*.cc",
+            "platform/posix/*.cc",
+        ], exclude = exclude),
+    })
 
 def tf_additional_minimal_lib_srcs():
-  return [
-      "platform/default/integral_types.h",
-      "platform/default/mutex.h",
-  ]
+    return [
+        "platform/default/integral_types.h",
+        "platform/default/mutex.h",
+    ]
 
 def tf_additional_proto_hdrs():
-  return [
-      "platform/default/integral_types.h",
-      "platform/default/logging.h",
-      "platform/default/protobuf.h"
-  ] + if_windows([
-      "platform/windows/integral_types.h",
-  ])
+    return [
+        "platform/default/integral_types.h",
+        "platform/default/logging.h",
+        "platform/default/protobuf.h",
+    ] + if_windows([
+        "platform/windows/integral_types.h",
+    ])
+
+def tf_additional_proto_compiler_hdrs():
+    return [
+        "platform/default/protobuf_compiler.h",
+    ]
 
 def tf_additional_proto_srcs():
-  return [
-      "platform/default/protobuf.cc",
-  ]
+    return [
+        "platform/default/protobuf.cc",
+    ]
 
 def tf_additional_human_readable_json_deps():
-  return []
+    return []
 
 def tf_additional_all_protos():
-  return ["//tensorflow/core:protos_all"]
+    return ["//tensorflow/core:protos_all"]
 
 def tf_protos_all_impl():
-  return ["//tensorflow/core:protos_all_cc_impl"]
+    return ["//tensorflow/core:protos_all_cc_impl"]
 
 def tf_protos_all():
-  return if_static(
-      extra_deps=tf_protos_all_impl(),
-      otherwise=["//tensorflow/core:protos_all_cc"])
+    return if_static(
+        extra_deps = tf_protos_all_impl(),
+        otherwise = ["//tensorflow/core:protos_all_cc"],
+    )
 
 def tf_protos_grappler_impl():
-  return ["//tensorflow/core/grappler/costs:op_performance_data_cc_impl"]
+    return ["//tensorflow/core/grappler/costs:op_performance_data_cc_impl"]
 
 def tf_protos_grappler():
-  return if_static(
-      extra_deps=tf_protos_grappler_impl(),
-      otherwise=["//tensorflow/core/grappler/costs:op_performance_data_cc"])
+    return if_static(
+        extra_deps = tf_protos_grappler_impl(),
+        otherwise = ["//tensorflow/core/grappler/costs:op_performance_data_cc"],
+    )
 
 def tf_additional_cupti_wrapper_deps():
-  return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"]
+    return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"]
 
 def tf_additional_device_tracer_srcs():
-  return ["platform/default/device_tracer.cc"]
+    return ["platform/default/device_tracer.cc"]
 
 def tf_additional_device_tracer_cuda_deps():
-  return []
+    return []
 
 def tf_additional_device_tracer_deps():
-  return []
+    return []
 
 def tf_additional_libdevice_data():
-  return []
+    return []
 
 def tf_additional_libdevice_deps():
-  return ["@local_config_cuda//cuda:cuda_headers"]
+    return ["@local_config_cuda//cuda:cuda_headers"]
 
 def tf_additional_libdevice_srcs():
-  return ["platform/default/cuda_libdevice_path.cc"]
+    return ["platform/default/cuda_libdevice_path.cc"]
 
 def tf_additional_test_deps():
-  return []
+    return []
 
 def tf_additional_test_srcs():
-  return [
-      "platform/default/test_benchmark.cc",
-  ] + select({
-      "//tensorflow:windows" : [
-          "platform/windows/test.cc"
+    return [
+        "platform/default/test_benchmark.cc",
+    ] + select({
+        "//tensorflow:windows": [
+            "platform/windows/test.cc",
         ],
-      "//conditions:default" : [
-          "platform/posix/test.cc",
+        "//conditions:default": [
+            "platform/posix/test.cc",
         ],
     })
 
 def tf_kernel_tests_linkstatic():
-  return 0
+    return 0
 
 def tf_additional_lib_defines():
-  """Additional defines needed to build TF libraries."""
-  return select({
-      "//tensorflow:with_jemalloc_linux_x86_64": ["TENSORFLOW_USE_JEMALLOC"],
-      "//tensorflow:with_jemalloc_linux_ppc64le":["TENSORFLOW_USE_JEMALLOC"],
-      "//conditions:default": [],
-  }) + if_not_mobile(["TENSORFLOW_USE_ABSL"])
+    """Additional defines needed to build TF libraries."""
+    return select({
+        "//tensorflow:with_jemalloc_linux_x86_64": ["TENSORFLOW_USE_JEMALLOC"],
+        "//tensorflow:with_jemalloc_linux_ppc64le": ["TENSORFLOW_USE_JEMALLOC"],
+        "//conditions:default": [],
+    })
 
 def tf_additional_lib_deps():
-  """Additional dependencies needed to build TF libraries."""
-  return if_not_mobile(["@com_google_absl//absl/base:base"]) + if_static(
-      ["@nsync//:nsync_cpp"],
-      ["@nsync//:nsync_headers"]
-  ) + select({
-      "//tensorflow:with_jemalloc_linux_x86_64_dynamic": ["@jemalloc//:jemalloc_headers"],
-      "//tensorflow:with_jemalloc_linux_ppc64le_dynamic": ["@jemalloc//:jemalloc_headers"],
-      "//tensorflow:with_jemalloc_linux_x86_64": ["@jemalloc//:jemalloc_impl"],
-      "//tensorflow:with_jemalloc_linux_ppc64le": ["@jemalloc//:jemalloc_impl"],
-      "//conditions:default": [],
-  })
+    """Additional dependencies needed to build TF libraries."""
+    return ["@com_google_absl//absl/base:base"] + if_static(
+        ["@nsync//:nsync_cpp"],
+        ["@nsync//:nsync_headers"],
+    ) + select({
+        "//tensorflow:with_jemalloc_linux_x86_64_dynamic": ["@jemalloc//:jemalloc_headers"],
+        "//tensorflow:with_jemalloc_linux_ppc64le_dynamic": ["@jemalloc//:jemalloc_headers"],
+        "//tensorflow:with_jemalloc_linux_x86_64": ["@jemalloc//:jemalloc_impl"],
+        "//tensorflow:with_jemalloc_linux_ppc64le": ["@jemalloc//:jemalloc_impl"],
+        "//conditions:default": [],
+    })
 
 def tf_additional_core_deps():
-  return select({
-      "//tensorflow:with_gcp_support_android_override": [],
-      "//tensorflow:with_gcp_support_ios_override": [],
-      "//tensorflow:with_gcp_support": [
-          "//tensorflow/core/platform/cloud:gcs_file_system",
-      ],
-      "//conditions:default": [],
-  }) + select({
-      "//tensorflow:with_hdfs_support_windows_override": [],
-      "//tensorflow:with_hdfs_support_android_override": [],
-      "//tensorflow:with_hdfs_support_ios_override": [],
-      "//tensorflow:with_hdfs_support": [
-          "//tensorflow/core/platform/hadoop:hadoop_file_system",
-      ],
-      "//conditions:default": [],
-  }) + select({
-      "//tensorflow:with_aws_support_windows_override": [],
-      "//tensorflow:with_aws_support_android_override": [],
-      "//tensorflow:with_aws_support_ios_override": [],
-      "//tensorflow:with_aws_support": [
-          "//tensorflow/core/platform/s3:s3_file_system",
-      ],
-      "//conditions:default": [],
-  })
+    return select({
+        "//tensorflow:with_gcp_support_android_override": [],
+        "//tensorflow:with_gcp_support_ios_override": [],
+        "//tensorflow:with_gcp_support": [
+            "//tensorflow/core/platform/cloud:gcs_file_system",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_hdfs_support_windows_override": [],
+        "//tensorflow:with_hdfs_support_android_override": [],
+        "//tensorflow:with_hdfs_support_ios_override": [],
+        "//tensorflow:with_hdfs_support": [
+            "//tensorflow/core/platform/hadoop:hadoop_file_system",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:with_aws_support_windows_override": [],
+        "//tensorflow:with_aws_support_android_override": [],
+        "//tensorflow:with_aws_support_ios_override": [],
+        "//tensorflow:with_aws_support": [
+            "//tensorflow/core/platform/s3:s3_file_system",
+        ],
+        "//conditions:default": [],
+    })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_op_deps():
-  return select({
-      "//tensorflow:with_gcp_support_windows_override": [],
-      "//tensorflow:with_gcp_support_android_override": [],
-      "//tensorflow:with_gcp_support_ios_override": [],
-      "//tensorflow:with_gcp_support": [
-        "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
-        "//tensorflow/contrib/cloud:gcs_config_ops_op_lib",
-      ],
-      "//conditions:default": [],
-  })
+    return select({
+        "//tensorflow:with_gcp_support_windows_override": [],
+        "//tensorflow:with_gcp_support_android_override": [],
+        "//tensorflow:with_gcp_support_ios_override": [],
+        "//tensorflow:with_gcp_support": [
+            "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
+            "//tensorflow/contrib/cloud:gcs_config_ops_op_lib",
+        ],
+        "//conditions:default": [],
+    })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_kernel_deps():
-  return select({
-      "//tensorflow:with_gcp_support_windows_override": [],
-      "//tensorflow:with_gcp_support_android_override": [],
-      "//tensorflow:with_gcp_support_ios_override": [],
-      "//tensorflow:with_gcp_support": [
-        "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
-        "//tensorflow/contrib/cloud/kernels:gcs_config_ops",
-      ],
-      "//conditions:default": [],
-  })
+    return select({
+        "//tensorflow:with_gcp_support_windows_override": [],
+        "//tensorflow:with_gcp_support_android_override": [],
+        "//tensorflow:with_gcp_support_ios_override": [],
+        "//tensorflow:with_gcp_support": [
+            "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
+            "//tensorflow/contrib/cloud/kernels:gcs_config_ops",
+        ],
+        "//conditions:default": [],
+    })
 
 def tf_lib_proto_parsing_deps():
-  return [
-      ":protos_all_cc",
-      "//third_party/eigen3",
-      "//tensorflow/core/platform/default/build_config:proto_parsing",
-  ]
+    return [
+        ":protos_all_cc",
+        "//third_party/eigen3",
+        "//tensorflow/core/platform/default/build_config:proto_parsing",
+    ]
+
+def tf_lib_proto_compiler_deps():
+    return [
+        "@protobuf_archive//:protoc_lib",
+    ]
 
 def tf_additional_verbs_lib_defines():
-  return select({
-      "//tensorflow:with_verbs_support": ["TENSORFLOW_USE_VERBS"],
-      "//conditions:default": [],
-  })
+    return select({
+        "//tensorflow:with_verbs_support": ["TENSORFLOW_USE_VERBS"],
+        "//conditions:default": [],
+    })
 
 def tf_additional_mpi_lib_defines():
-  return select({
-      "//tensorflow:with_mpi_support": ["TENSORFLOW_USE_MPI"],
-      "//conditions:default": [],
-  })
+    return select({
+        "//tensorflow:with_mpi_support": ["TENSORFLOW_USE_MPI"],
+        "//conditions:default": [],
+    })
 
 def tf_additional_gdr_lib_defines():
-  return select({
-      "//tensorflow:with_gdr_support": ["TENSORFLOW_USE_GDR"],
-      "//conditions:default": [],
-  })
+    return select({
+        "//tensorflow:with_gdr_support": ["TENSORFLOW_USE_GDR"],
+        "//conditions:default": [],
+    })
 
-def tf_py_clif_cc(name, visibility=None, **kwargs):
-  pass
+def tf_py_clif_cc(name, visibility = None, **kwargs):
+    pass
 
-def tf_pyclif_proto_library(name, proto_lib, proto_srcfile="", visibility=None,
-                            **kwargs):
-  pass
+def tf_pyclif_proto_library(
+        name,
+        proto_lib,
+        proto_srcfile = "",
+        visibility = None,
+        **kwargs):
+    pass
 
 def tf_additional_binary_deps():
-  return ["@nsync//:nsync_cpp"] + if_cuda(
-      [
-          "//tensorflow/stream_executor:cuda_platform",
-          "//tensorflow/core/platform/default/build_config:cuda",
-      ],
-  ) + select({
-      "//tensorflow:with_jemalloc_linux_x86_64": ["@jemalloc//:jemalloc_impl"],
-      "//tensorflow:with_jemalloc_linux_ppc64le": ["@jemalloc//:jemalloc_impl"],
-      "//conditions:default": [],
-  })  + [
-      # TODO(allenl): Split these out into their own shared objects (they are
-      # here because they are shared between contrib/ op shared objects and
-      # core).
-      "//tensorflow/core/kernels:lookup_util",
-      "//tensorflow/core/util/tensor_bundle",
-  ] + if_mkl(
-      [
-          "//third_party/mkl:intel_binary_blob",
-      ],
-  )
+    return ["@nsync//:nsync_cpp"] + if_cuda(
+        [
+            "//tensorflow/stream_executor:cuda_platform",
+            "//tensorflow/core/platform/default/build_config:cuda",
+        ],
+    ) + select({
+        "//tensorflow:with_jemalloc_linux_x86_64": ["@jemalloc//:jemalloc_impl"],
+        "//tensorflow:with_jemalloc_linux_ppc64le": ["@jemalloc//:jemalloc_impl"],
+        "//conditions:default": [],
+    }) + [
+        # TODO(allenl): Split these out into their own shared objects (they are
+        # here because they are shared between contrib/ op shared objects and
+        # core).
+        "//tensorflow/core/kernels:lookup_util",
+        "//tensorflow/core/util/tensor_bundle",
+    ] + if_mkl_ml(
+        [
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    )
diff --git a/tensorflow/core/platform/default/integral_types.h b/tensorflow/core/platform/default/integral_types.h
index 7cbe7d62f7..92186bc912 100644
--- a/tensorflow/core/platform/default/integral_types.h
+++ b/tensorflow/core/platform/default/integral_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_DEFAULT_INTEGRAL_TYPES_H_
-#define TENSORFLOW_PLATFORM_DEFAULT_INTEGRAL_TYPES_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_INTEGRAL_TYPES_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_INTEGRAL_TYPES_H_
 
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/types.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/types.h
@@ -33,4 +33,4 @@ typedef unsigned long long uint64;
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_DEFAULT_INTEGRAL_TYPES_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_INTEGRAL_TYPES_H_
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index 2c134f1be9..08a692fff7 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_DEFAULT_LOGGING_H_
-#define TENSORFLOW_PLATFORM_DEFAULT_LOGGING_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_LOGGING_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_LOGGING_H_
 
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/logging.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/logging.h
@@ -314,4 +314,4 @@ int64 MinVLogLevelFromEnv();
 }  // namespace internal
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_DEFAULT_LOGGING_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_LOGGING_H_
diff --git a/tensorflow/core/platform/default/mutex.h b/tensorflow/core/platform/default/mutex.h
index 48d90779e1..bef7801037 100644
--- a/tensorflow/core/platform/default/mutex.h
+++ b/tensorflow/core/platform/default/mutex.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_DEFAULT_MUTEX_H_
-#define TENSORFLOW_PLATFORM_DEFAULT_MUTEX_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_MUTEX_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_MUTEX_H_
 
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/mutex.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/mutex.h
@@ -173,4 +173,4 @@ inline ConditionResult WaitForMilliseconds(mutex_lock* mu,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_DEFAULT_MUTEX_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_MUTEX_H_
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
index c732c76ff7..bd9d41c62b 100644
--- a/tensorflow/core/platform/default/protobuf.h
+++ b/tensorflow/core/platform/default/protobuf.h
@@ -20,8 +20,8 @@ limitations under the License.
 // IWYU pragma: friend third_party/tensorflow/core/platform/protobuf.h
 
 #include "google/protobuf/arena.h"
-#include "google/protobuf/compiler/importer.h"
 #include "google/protobuf/descriptor.h"
+#include "google/protobuf/descriptor.pb.h"
 #include "google/protobuf/dynamic_message.h"
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
diff --git a/tensorflow/core/platform/default/protobuf_compiler.h b/tensorflow/core/platform/default/protobuf_compiler.h
new file mode 100644
index 0000000000..a93d7a184b
--- /dev/null
+++ b/tensorflow/core/platform/default/protobuf_compiler.h
@@ -0,0 +1,25 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
+
+// IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf_compiler.h"
+// IWYU pragma: friend third_party/tensorflow/core/platform/protobuf_compiler.h
+
+#include "google/protobuf/compiler/importer.h"
+#include "tensorflow/core/platform/default/protobuf.h"
+
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
diff --git a/tensorflow/core/platform/default/thread_annotations.h b/tensorflow/core/platform/default/thread_annotations.h
index a6aa5b1b5e..d21d60ab0b 100644
--- a/tensorflow/core/platform/default/thread_annotations.h
+++ b/tensorflow/core/platform/default/thread_annotations.h
@@ -32,8 +32,8 @@ limitations under the License.
 // (e.g. &MyClass::mutex_) to refer to a mutex in some (unknown) object.
 //
 
-#ifndef TENSORFLOW_PLATFORM_DEFAULT_THREAD_ANNOTATIONS_H_
-#define TENSORFLOW_PLATFORM_DEFAULT_THREAD_ANNOTATIONS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_THREAD_ANNOTATIONS_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_THREAD_ANNOTATIONS_H_
 
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/thread_annotations.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/thread_annotations.h
@@ -174,4 +174,4 @@ inline T& ts_unchecked_read(T& v) NO_THREAD_SAFETY_ANALYSIS {
 }  // namespace thread_safety_analysis
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_DEFAULT_THREAD_ANNOTATIONS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_THREAD_ANNOTATIONS_H_
diff --git a/tensorflow/core/platform/default/tracing_impl.h b/tensorflow/core/platform/default/tracing_impl.h
index b161378405..b7a5f1386c 100644
--- a/tensorflow/core/platform/default/tracing_impl.h
+++ b/tensorflow/core/platform/default/tracing_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_DEFAULT_TRACING_IMPL_H_
-#define TENSORFLOW_PLATFORM_DEFAULT_TRACING_IMPL_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_TRACING_IMPL_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_TRACING_IMPL_H_
 
 // Stub implementations of tracing functionality.
 
@@ -43,4 +43,4 @@ inline bool EventCollector::IsEnabled() { return false; }
 }  // namespace tracing
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_DEFAULT_TRACING_IMPL_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_TRACING_IMPL_H_
diff --git a/tensorflow/core/platform/denormal.h b/tensorflow/core/platform/denormal.h
index 09bb0352a2..555ac023db 100644
--- a/tensorflow/core/platform/denormal.h
+++ b/tensorflow/core/platform/denormal.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_DENORMAL_H_
-#define TENSORFLOW_PLATFORM_DENORMAL_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DENORMAL_H_
+#define TENSORFLOW_CORE_PLATFORM_DENORMAL_H_
 
 #include "tensorflow/core/platform/macros.h"
 
@@ -59,4 +59,4 @@ class ScopedDontFlushDenormal {
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_DENORMAL_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DENORMAL_H_
diff --git a/tensorflow/core/platform/dynamic_annotations.h b/tensorflow/core/platform/dynamic_annotations.h
index f51f3f33a3..dad0d0f4e4 100644
--- a/tensorflow/core/platform/dynamic_annotations.h
+++ b/tensorflow/core/platform/dynamic_annotations.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_DYNAMIC_ANNOTATIONS_H_
-#define TENSORFLOW_PLATFORM_DYNAMIC_ANNOTATIONS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DYNAMIC_ANNOTATIONS_H_
+#define TENSORFLOW_CORE_PLATFORM_DYNAMIC_ANNOTATIONS_H_
 
 #include "tensorflow/core/platform/platform.h"
 
@@ -28,4 +28,4 @@ limitations under the License.
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
 
-#endif  // TENSORFLOW_PLATFORM_DYNAMIC_ANNOTATIONS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DYNAMIC_ANNOTATIONS_H_
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 47c59d435b..afc4201e53 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -92,7 +92,7 @@ Env::Env() : file_system_registry_(new FileSystemRegistryImpl) {}
 Status Env::GetFileSystemForFile(const string& fname, FileSystem** result) {
   StringPiece scheme, host, path;
   io::ParseURI(fname, &scheme, &host, &path);
-  FileSystem* file_system = file_system_registry_->Lookup(std::string(scheme));
+  FileSystem* file_system = file_system_registry_->Lookup(string(scheme));
   if (!file_system) {
     if (scheme.empty()) {
       scheme = "[local]";
@@ -166,7 +166,7 @@ bool Env::FilesExist(const std::vector<string>& files,
   for (const auto& file : files) {
     StringPiece scheme, host, path;
     io::ParseURI(file, &scheme, &host, &path);
-    files_per_fs[std::string(scheme)].push_back(file);
+    files_per_fs[string(scheme)].push_back(file);
   }
 
   std::unordered_map<string, Status> per_file_status;
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 922773684b..3ab542a5d8 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -158,7 +158,7 @@ Status FileSystem::RecursivelyCreateDir(const string& dirname) {
   std::reverse(sub_dirs.begin(), sub_dirs.end());
 
   // Now create the directories.
-  string built_path = std::string(remaining_dir);
+  string built_path(remaining_dir);
   for (const StringPiece sub_dir : sub_dirs) {
     built_path = io::JoinPath(built_path, sub_dir);
     Status status = CreateDir(io::CreateURI(scheme, host, built_path));
diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc
index 0ba0e6304f..342cf28e38 100644
--- a/tensorflow/core/platform/file_system_helper.cc
+++ b/tensorflow/core/platform/file_system_helper.cc
@@ -59,7 +59,7 @@ Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
   string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
   string eval_pattern = pattern;
   std::vector<string> all_files;
-  string dir = std::string(io::Dirname(fixed_prefix));
+  string dir(io::Dirname(fixed_prefix));
   // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
   // include . as the top level directory.
   if (dir.empty()) {
diff --git a/tensorflow/core/platform/file_system_test.cc b/tensorflow/core/platform/file_system_test.cc
index c0a16c95f9..a637d42a92 100644
--- a/tensorflow/core/platform/file_system_test.cc
+++ b/tensorflow/core/platform/file_system_test.cc
@@ -125,7 +125,7 @@ class InterPlanetaryFileSystem : public NullFileSystem {
     ASSERT_EQ(scheme, "ipfs");
     ASSERT_EQ(host, "solarsystem");
     str_util::ConsumePrefix(&path, "/");
-    *parsed_path = std::string(path);
+    *parsed_path = string(path);
   }
 
   std::map<string, std::set<string>> celestial_bodies_ = {
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index ff4b4436bb..8cdb08f51b 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -144,7 +144,7 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
 
   StringPiece scheme, namenode, path;
   io::ParseURI(fname, &scheme, &namenode, &path);
-  const string nn = namenode.ToString();
+  const string nn(namenode);
 
   hdfsBuilder* builder = hdfs_->hdfsNewBuilder();
   if (scheme == "file") {
@@ -183,7 +183,7 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
 string HadoopFileSystem::TranslateName(const string& name) const {
   StringPiece scheme, namenode, path;
   io::ParseURI(name, &scheme, &namenode, &path);
-  return path.ToString();
+  return string(path);
 }
 
 class HDFSRandomAccessFile : public RandomAccessFile {
@@ -392,7 +392,7 @@ Status HadoopFileSystem::GetChildren(const string& dir,
     return IOError(dir, errno);
   }
   for (int i = 0; i < entries; i++) {
-    result->push_back(io::Basename(info[i].mName).ToString());
+    result->push_back(string(io::Basename(info[i].mName)));
   }
   hdfs_->hdfsFreeFileInfo(info, entries);
   return Status::OK();
diff --git a/tensorflow/core/platform/host_info.h b/tensorflow/core/platform/host_info.h
index 6124c95923..e76b83adf3 100644
--- a/tensorflow/core/platform/host_info.h
+++ b/tensorflow/core/platform/host_info.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_HOST_INFO_H_
-#define TENSORFLOW_PLATFORM_HOST_INFO_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_HOST_INFO_H_
+#define TENSORFLOW_CORE_PLATFORM_HOST_INFO_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -27,4 +27,4 @@ string Hostname();
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_HOST_INFO_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_HOST_INFO_H_
diff --git a/tensorflow/core/platform/init_main.h b/tensorflow/core/platform/init_main.h
index 20cbc615b1..834c529816 100644
--- a/tensorflow/core/platform/init_main.h
+++ b/tensorflow/core/platform/init_main.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_INIT_MAIN_H_
-#define TENSORFLOW_PLATFORM_INIT_MAIN_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_INIT_MAIN_H_
+#define TENSORFLOW_CORE_PLATFORM_INIT_MAIN_H_
 
 namespace tensorflow {
 namespace port {
@@ -28,4 +28,4 @@ void InitMain(const char* usage, int* argc, char*** argv);
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_INIT_MAIN_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_INIT_MAIN_H_
diff --git a/tensorflow/core/platform/load_library.h b/tensorflow/core/platform/load_library.h
index 9038de25f3..c7eeb2918c 100644
--- a/tensorflow/core/platform/load_library.h
+++ b/tensorflow/core/platform/load_library.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_LOAD_LIBRARY_H_
-#define TENSORFLOW_PLATFORM_LOAD_LIBRARY_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_LOAD_LIBRARY_H_
+#define TENSORFLOW_CORE_PLATFORM_LOAD_LIBRARY_H_
 
 #include "tensorflow/core/lib/core/status.h"
 
@@ -31,4 +31,4 @@ string FormatLibraryFileName(const string& name, const string& version);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_LOAD_LIBRARY_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_LOAD_LIBRARY_H_
diff --git a/tensorflow/core/platform/logging.h b/tensorflow/core/platform/logging.h
index 985c061676..17a5d5fb5b 100644
--- a/tensorflow/core/platform/logging.h
+++ b/tensorflow/core/platform/logging.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_LOGGING_H_
-#define TENSORFLOW_PLATFORM_LOGGING_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_LOGGING_H_
+#define TENSORFLOW_CORE_PLATFORM_LOGGING_H_
 
 #include "tensorflow/core/platform/platform.h"  // To pick up PLATFORM_define
 
@@ -36,4 +36,4 @@ void LogString(const char* fname, int line, int severity,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_LOGGING_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_LOGGING_H_
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index b65eb43146..e1d83e18ac 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_MACROS_H_
-#define TENSORFLOW_PLATFORM_MACROS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_MACROS_H_
+#define TENSORFLOW_CORE_PLATFORM_MACROS_H_
 
 // Compiler attributes
 #if (defined(__GNUC__) || defined(__APPLE__)) && !defined(SWIG)
@@ -125,4 +125,4 @@ limitations under the License.
   } while (0)
 #endif
 
-#endif  // TENSORFLOW_PLATFORM_MACROS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_MACROS_H_
diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h
index fca3a2332d..e8150f7322 100644
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_MEM_H_
-#define TENSORFLOW_PLATFORM_MEM_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_MEM_H_
+#define TENSORFLOW_CORE_PLATFORM_MEM_H_
 
 // TODO(cwhipkey): remove this when callers use annotations directly.
 #include "tensorflow/core/platform/dynamic_annotations.h"
@@ -65,4 +65,4 @@ int64 AvailableRam();
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_MEM_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_MEM_H_
diff --git a/tensorflow/core/platform/mutex.h b/tensorflow/core/platform/mutex.h
index 42d46ceb5b..66b20da95a 100644
--- a/tensorflow/core/platform/mutex.h
+++ b/tensorflow/core/platform/mutex.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_MUTEX_H_
-#define TENSORFLOW_PLATFORM_MUTEX_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_MUTEX_H_
+#define TENSORFLOW_CORE_PLATFORM_MUTEX_H_
 
 #include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/platform/types.h"
@@ -50,4 +50,4 @@ ConditionResult WaitForMilliseconds(mutex_lock* mu, condition_variable* cv,
                                     int64 ms);
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_MUTEX_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_MUTEX_H_
diff --git a/tensorflow/core/platform/net.h b/tensorflow/core/platform/net.h
index 9e7851728d..7dbc92f058 100644
--- a/tensorflow/core/platform/net.h
+++ b/tensorflow/core/platform/net.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_NET_H_
-#define TENSORFLOW_PLATFORM_NET_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_NET_H_
+#define TENSORFLOW_CORE_PLATFORM_NET_H_
 
 namespace tensorflow {
 namespace internal {
@@ -24,4 +24,4 @@ int PickUnusedPortOrDie();
 }  // namespace internal
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_NET_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_NET_H_
diff --git a/tensorflow/core/platform/png.h b/tensorflow/core/platform/png.h
index b110d63aba..93b1425f7a 100644
--- a/tensorflow/core/platform/png.h
+++ b/tensorflow/core/platform/png.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_PNG_H_
-#define TENSORFLOW_PLATFORM_PNG_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_PNG_H_
+#define TENSORFLOW_CORE_PLATFORM_PNG_H_
 
 #include "tensorflow/core/platform/platform.h"
 
@@ -27,4 +27,4 @@ limitations under the License.
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
 
-#endif  // TENSORFLOW_PLATFORM_PNG_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_PNG_H_
diff --git a/tensorflow/core/platform/posix/error.h b/tensorflow/core/platform/posix/error.h
index 9b614d0f70..9df5f2daa1 100644
--- a/tensorflow/core/platform/posix/error.h
+++ b/tensorflow/core/platform/posix/error.h
@@ -24,4 +24,4 @@ Status IOError(const string& context, int err_number);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_POSIX_POSIX_FILE_SYSTEM_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_POSIX_ERROR_H_
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 1939cf72fb..b46b9927cd 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -17,9 +17,7 @@ limitations under the License.
 #include "jemalloc/jemalloc.h"
 #endif
 
-#ifdef TENSORFLOW_USE_ABSL
 #include "absl/base/internal/sysinfo.h"
-#endif
 
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
@@ -194,11 +192,7 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) {
 string Demangle(const char* mangled) { return mangled; }
 
 double NominalCPUFrequency() {
-#ifdef TENSORFLOW_USE_ABSL
   return absl::base_internal::NominalCPUFrequency();
-#else
-  return 1.0;
-#endif
 }
 
 int64 AvailableRam() {
diff --git a/tensorflow/core/platform/posix/posix_file_system.h b/tensorflow/core/platform/posix/posix_file_system.h
index e8898d0a97..752eccea66 100644
--- a/tensorflow/core/platform/posix/posix_file_system.h
+++ b/tensorflow/core/platform/posix/posix_file_system.h
@@ -70,7 +70,7 @@ class LocalPosixFileSystem : public PosixFileSystem {
   string TranslateName(const string& name) const override {
     StringPiece scheme, host, path;
     io::ParseURI(name, &scheme, &host, &path);
-    return path.ToString();
+    return string(path);
   }
 };
 
diff --git a/tensorflow/core/platform/posix/subprocess.h b/tensorflow/core/platform/posix/subprocess.h
index 53f95f3c14..9740d75595 100644
--- a/tensorflow/core/platform/posix/subprocess.h
+++ b/tensorflow/core/platform/posix/subprocess.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_DEFAULT_SUBPROCESS_H_
-#define TENSORFLOW_PLATFORM_DEFAULT_SUBPROCESS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_POSIX_SUBPROCESS_H_
+#define TENSORFLOW_CORE_PLATFORM_POSIX_SUBPROCESS_H_
 
 #include <errno.h>
 #include <unistd.h>
@@ -128,4 +128,4 @@ class SubProcess {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_DEFAULT_SUBPROCESS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_POSIX_SUBPROCESS_H_
diff --git a/tensorflow/core/platform/prefetch.h b/tensorflow/core/platform/prefetch.h
index 81e1a5210a..9cefab3c1b 100644
--- a/tensorflow/core/platform/prefetch.h
+++ b/tensorflow/core/platform/prefetch.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_PREFETCH_H_
-#define TENSORFLOW_PLATFORM_PREFETCH_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_PREFETCH_H_
+#define TENSORFLOW_CORE_PLATFORM_PREFETCH_H_
 
 #include "tensorflow/core/platform/platform.h"
 
@@ -56,4 +56,4 @@ inline void prefetch(const void* x) {
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_PREFETCH_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_PREFETCH_H_
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
index ce2069b004..2d94736c97 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_PROFILEUTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H__
-#define TENSORFLOW_PLATFORM_PROFILEUTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H__
+#ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H_
+#define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H_
 
 #include <sys/types.h>
 
@@ -64,4 +64,4 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper {
 #endif  // defined(__ANDROID__) && (__ANDROID_API__ >= 21) &&
         // (defined(__ARM_ARCH_7A__) || defined(__aarch64__))
 
-#endif  // TENSORFLOW_PLATFORM_PROFILEUTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H__
+#endif  // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_ANDROID_ARMV7A_CPU_UTILS_HELPER_H_
diff --git a/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h b/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h
index de4eec28e3..e25456374c 100644
--- a/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h
+++ b/tensorflow/core/platform/profile_utils/clock_cycle_profiler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
-#define TENSORFLOW_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
+#define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
 
 #include <algorithm>
 
@@ -103,4 +103,4 @@ class ClockCycleProfiler {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CLOCK_CYCLE_PROFILER_H_
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 8f06290303..b0b1ef0363 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // This class is designed to get accurate profile for programs.
 
-#ifndef TENSORFLOW_PLATFORM_PROFILEUTILS_CPU_UTILS_H__
-#define TENSORFLOW_PLATFORM_PROFILEUTILS_CPU_UTILS_H__
+#ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_
+#define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_
 
 #include <chrono>
 #include <memory>
@@ -164,4 +164,4 @@ class CpuUtils {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_PROFILEUTILS_CPU_UTILS_H__
+#endif  // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_
diff --git a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
index 11b739c009..cab7618a70 100644
--- a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_PROFILEUTILS_I_CPU_UTILS_HELPER_H__
-#define TENSORFLOW_PLATFORM_PROFILEUTILS_I_CPU_UTILS_HELPER_H__
+#ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_I_CPU_UTILS_HELPER_H_
+#define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_I_CPU_UTILS_HELPER_H_
 
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -50,4 +50,4 @@ class ICpuUtilsHelper {
 }  // namespace profile_utils
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_PROFILEUTILS_I_CPU_UTILS_HELPER_H__
+#endif  // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_I_CPU_UTILS_HELPER_H_
diff --git a/tensorflow/core/platform/protobuf.h b/tensorflow/core/platform/protobuf.h
index 288d091624..fcbf1fc8c5 100644
--- a/tensorflow/core/platform/protobuf.h
+++ b/tensorflow/core/platform/protobuf.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_PROTOBUF_H_
-#define TENSORFLOW_PLATFORM_PROTOBUF_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_PROTOBUF_H_
+#define TENSORFLOW_CORE_PLATFORM_PROTOBUF_H_
 
 #include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/platform/types.h"
@@ -52,4 +52,4 @@ inline void SetProtobufStringSwapAllowed(string* src, string* dest) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_PROTOBUF_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_PROTOBUF_H_
diff --git a/tensorflow/core/kernels/warn_about_ints.h b/tensorflow/core/platform/protobuf_compiler.h
index 20666b230e..29679e0089 100644
--- a/tensorflow/core/kernels/warn_about_ints.h
+++ b/tensorflow/core/platform/protobuf_compiler.h
@@ -13,17 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
-#define TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
+#ifndef TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
+#define TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
 
-#include "tensorflow/core/framework/op_kernel.h"
+#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF)
+#include "tensorflow/core/platform/google/protobuf_compiler.h"
+#else
+#include "tensorflow/core/platform/default/protobuf_compiler.h"
+#endif
 
-namespace tensorflow {
-
-// Warn if a kernel is being created using ints
-// TODO(irving): Remove in TF 2.0 along with the bad op registrations.
-void WarnAboutInts(OpKernelConstruction* context);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
+#endif  // TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
diff --git a/tensorflow/core/platform/protobuf_internal.h b/tensorflow/core/platform/protobuf_internal.h
index 2f151a5aee..d0cfde09bc 100644
--- a/tensorflow/core/platform/protobuf_internal.h
+++ b/tensorflow/core/platform/protobuf_internal.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_PROTOBUF_INTERNAL_H_
-#define TENSORFLOW_PLATFORM_PROTOBUF_INTERNAL_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_PROTOBUF_INTERNAL_H_
+#define TENSORFLOW_CORE_PLATFORM_PROTOBUF_INTERNAL_H_
 
 #include "google/protobuf/any.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -69,4 +69,4 @@ Status ParseAny(const google::protobuf::Any& any, T* message,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_PROTOBUF_INTERNAL_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_PROTOBUF_INTERNAL_H_
diff --git a/tensorflow/core/platform/setround.h b/tensorflow/core/platform/setround.h
index d076e7acc6..ded00b23b1 100644
--- a/tensorflow/core/platform/setround.h
+++ b/tensorflow/core/platform/setround.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_SETROUND_H_
-#define TENSORFLOW_PLATFORM_SETROUND_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_SETROUND_H_
+#define TENSORFLOW_CORE_PLATFORM_SETROUND_H_
 
 #include <cfenv>
 
@@ -42,4 +42,4 @@ class ScopedSetRound {
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_SETROUND_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_SETROUND_H_
diff --git a/tensorflow/core/platform/snappy.h b/tensorflow/core/platform/snappy.h
index 62c208ffb4..5477b097ef 100644
--- a/tensorflow/core/platform/snappy.h
+++ b/tensorflow/core/platform/snappy.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_SNAPPY_H_
-#define TENSORFLOW_PLATFORM_SNAPPY_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_SNAPPY_H_
+#define TENSORFLOW_CORE_PLATFORM_SNAPPY_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -31,4 +31,4 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output);
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_SNAPPY_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_SNAPPY_H_
diff --git a/tensorflow/core/platform/stacktrace_handler.h b/tensorflow/core/platform/stacktrace_handler.h
index a52970fdaa..9f118b91b8 100644
--- a/tensorflow/core/platform/stacktrace_handler.h
+++ b/tensorflow/core/platform/stacktrace_handler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
-#define TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_STACKTRACE_HANDLER_H_
+#define TENSORFLOW_CORE_PLATFORM_STACKTRACE_HANDLER_H_
 
 namespace tensorflow {
 namespace testing {
@@ -25,4 +25,4 @@ void InstallStacktraceHandler();
 }  // namespace testing
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_STACKTRACE_HANDLER_H_
diff --git a/tensorflow/core/platform/subprocess.h b/tensorflow/core/platform/subprocess.h
index dcc0c1a4ee..7c11e6232f 100644
--- a/tensorflow/core/platform/subprocess.h
+++ b/tensorflow/core/platform/subprocess.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_SUBPROCESS_H_
-#define TENSORFLOW_PLATFORM_SUBPROCESS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_SUBPROCESS_H_
+#define TENSORFLOW_CORE_PLATFORM_SUBPROCESS_H_
 
 #include <memory>
 #include <vector>
@@ -67,4 +67,4 @@ std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv);
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
 
-#endif  // TENSORFLOW_PLATFORM_SUBPROCESS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_SUBPROCESS_H_
diff --git a/tensorflow/core/platform/test.h b/tensorflow/core/platform/test.h
index 99bae63edf..f5d3282f57 100644
--- a/tensorflow/core/platform/test.h
+++ b/tensorflow/core/platform/test.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_TEST_H_
-#define TENSORFLOW_PLATFORM_TEST_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_TEST_H_
+#define TENSORFLOW_CORE_PLATFORM_TEST_H_
 
 #include <memory>
 #include <vector>
@@ -55,4 +55,4 @@ int PickUnusedPortOrDie();
 }  // namespace testing
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_TEST_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_TEST_H_
diff --git a/tensorflow/core/platform/test_benchmark.h b/tensorflow/core/platform/test_benchmark.h
index 9b8726d98f..61fcd0d372 100644
--- a/tensorflow/core/platform/test_benchmark.h
+++ b/tensorflow/core/platform/test_benchmark.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 // Simple benchmarking facility.
-#ifndef TENSORFLOW_PLATFORM_TEST_BENCHMARK_H_
-#define TENSORFLOW_PLATFORM_TEST_BENCHMARK_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_TEST_BENCHMARK_H_
+#define TENSORFLOW_CORE_PLATFORM_TEST_BENCHMARK_H_
 
 #include <utility>
 #include <vector>
@@ -115,4 +115,4 @@ void UseRealTime();
 }  // namespace testing
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_TEST_BENCHMARK_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_TEST_BENCHMARK_H_
diff --git a/tensorflow/core/platform/thread_annotations.h b/tensorflow/core/platform/thread_annotations.h
index 50195cbbc7..aec34df8a1 100644
--- a/tensorflow/core/platform/thread_annotations.h
+++ b/tensorflow/core/platform/thread_annotations.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_THREAD_ANNOTATIONS_H_
-#define TENSORFLOW_PLATFORM_THREAD_ANNOTATIONS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_THREAD_ANNOTATIONS_H_
+#define TENSORFLOW_CORE_PLATFORM_THREAD_ANNOTATIONS_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -27,4 +27,4 @@ limitations under the License.
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
 
-#endif  // TENSORFLOW_PLATFORM_THREAD_ANNOTATIONS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_THREAD_ANNOTATIONS_H_
diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h
index c322777705..e5851f1dfe 100644
--- a/tensorflow/core/platform/tracing.h
+++ b/tensorflow/core/platform/tracing.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_TRACING_H_
-#define TENSORFLOW_PLATFORM_TRACING_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_TRACING_H_
+#define TENSORFLOW_CORE_PLATFORM_TRACING_H_
 
 // Tracing interface
 
@@ -238,4 +238,4 @@ const char* GetLogDir();
 #include "tensorflow/core/platform/default/tracing_impl.h"
 #endif
 
-#endif  // TENSORFLOW_PLATFORM_TRACING_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_TRACING_H_
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index 68897ac423..a4fa790317 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_TYPES_H_
-#define TENSORFLOW_PLATFORM_TYPES_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_TYPES_H_
+#define TENSORFLOW_CORE_PLATFORM_TYPES_H_
 
 #include <string>
 #include "tensorflow/core/platform/platform.h"
@@ -66,4 +66,4 @@ namespace tensorflow {
 namespace se = ::stream_executor;
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_TYPES_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_TYPES_H_
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index ba2126abcf..8b42cbec7a 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
-#define TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_WINDOWS_CPU_INFO_H_
+#define TENSORFLOW_CORE_PLATFORM_WINDOWS_CPU_INFO_H_
 
 // included so __cpuidex function is available for GETCPUID on Windows
 #include <intrin.h>
 
-#endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_WINDOWS_CPU_INFO_H_
diff --git a/tensorflow/core/platform/windows/integral_types.h b/tensorflow/core/platform/windows/integral_types.h
index 46338a536d..283af49f20 100644
--- a/tensorflow/core/platform/windows/integral_types.h
+++ b/tensorflow/core/platform/windows/integral_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
-#define TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
+#define TENSORFLOW_CORE_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
 
 #include "tensorflow/core/platform/default/integral_types.h"
 
@@ -22,4 +22,4 @@ limitations under the License.
 
 typedef std::ptrdiff_t ssize_t;
 
-#endif  // TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h
index f00471d484..9084ff5a92 100644
--- a/tensorflow/core/platform/windows/subprocess.h
+++ b/tensorflow/core/platform/windows/subprocess.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
-#define TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_WINDOWS_SUBPROCESS_H_
+#define TENSORFLOW_CORE_PLATFORM_WINDOWS_SUBPROCESS_H_
 
 #include <memory>
 #include <vector>
@@ -33,4 +33,4 @@ std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_PLATFORM_WINDOWS_SUBPROCESS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_WINDOWS_SUBPROCESS_H_
diff --git a/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h b/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
index f5ac5c9c5a..0d1c92eb08 100644
--- a/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
@@ -137,4 +137,4 @@ class ExpensiveOperationChecker : public Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OP_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h b/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
index 270662bd4a..e1533f882f 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
-#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVISOR_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVISOR_H_
 
 #include "tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h"
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
@@ -78,4 +78,4 @@ class Advisor {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVISOR_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_code.cc b/tensorflow/core/profiler/internal/tfprof_code.cc
index 2c4f52e3ad..744e1e95de 100644
--- a/tensorflow/core/profiler/internal/tfprof_code.cc
+++ b/tensorflow/core/profiler/internal/tfprof_code.cc
@@ -37,7 +37,7 @@ const char* const kGradientSuffix = " (gradient)";
 
 // Convert to Trace proto into a short readable string.
 string GetTraceString(const CallStack::Trace& trace) {
-  string ntrace = io::Basename(trace.file()).ToString();
+  string ntrace(io::Basename(trace.file()));
   ntrace += strings::StrCat(":", trace.lineno());
   if (trace.function().length() < 20) {
     ntrace += ":" + trace.function();
@@ -113,7 +113,7 @@ class FunctionTable {
     // function index should start from 1.
     func_pb->set_id(function_table_.size());
 
-    string file_base = io::Basename(file_path).ToString();
+    string file_base(io::Basename(file_path));
     file_base = file_base.substr(0, file_base.find_last_of("."));
     func_pb->set_name(
         string_table_->GetIndex(strings::StrCat(file_base, ":", func_name)));
diff --git a/tensorflow/core/profiler/tfprof_options.h b/tensorflow/core/profiler/tfprof_options.h
index d61deb72ac..57c7e11fa2 100644
--- a/tensorflow/core/profiler/tfprof_options.h
+++ b/tensorflow/core/profiler/tfprof_options.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
-#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_TFPROF_OPTIONS_H_
+#define TENSORFLOW_CORE_PROFILER_TFPROF_OPTIONS_H_
 
 #include <set>
 #include <string>
@@ -183,4 +183,4 @@ tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_TFPROF_OPTIONS_H_
diff --git a/tensorflow/core/public/session.h b/tensorflow/core/public/session.h
index cc8596ef3d..536a07c413 100644
--- a/tensorflow/core/public/session.h
+++ b/tensorflow/core/public/session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PUBLIC_SESSION_H_
-#define TENSORFLOW_PUBLIC_SESSION_H_
+#ifndef TENSORFLOW_CORE_PUBLIC_SESSION_H_
+#define TENSORFLOW_CORE_PUBLIC_SESSION_H_
 
 #include <string>
 #include <vector>
@@ -279,4 +279,4 @@ Session* NewSession(const SessionOptions& options);
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_PUBLIC_SESSION_H_
+#endif  // TENSORFLOW_CORE_PUBLIC_SESSION_H_
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 563564119f..4129c93af5 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -96,10 +96,12 @@ limitations under the License.
 //     GraphDef. (7dec2017)
 // 27. Deprecate TensorArray ops v2 in favor of v3 and deprecated io_ops
 //     deprecated in favor of V2 ops. (2018/01/23)
+// 28. Deprecate MatrixExponential op in favor of Python implementation.
+//     (2018/08/21).
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 26
+#define TF_GRAPH_DEF_VERSION 27
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
diff --git a/tensorflow/core/util/activation_mode.h b/tensorflow/core/util/activation_mode.h
index 2e03ccd5c8..2f7820fb47 100644
--- a/tensorflow/core/util/activation_mode.h
+++ b/tensorflow/core/util/activation_mode.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_ACTIVATION_MODE_H_
-#define TENSORFLOW_UTIL_ACTIVATION_MODE_H_
+#ifndef TENSORFLOW_CORE_UTIL_ACTIVATION_MODE_H_
+#define TENSORFLOW_CORE_UTIL_ACTIVATION_MODE_H_
 
 // This file contains helper routines to deal with activation mode in various
 // ops and kernels.
@@ -43,4 +43,4 @@ Status GetActivationModeFromString(const string& str_value,
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_ACTIVATION_MODE_H_
+#endif  // TENSORFLOW_CORE_UTIL_ACTIVATION_MODE_H_
diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h
index 81d64e5676..6d73c38e3c 100644
--- a/tensorflow/core/util/bcast.h
+++ b/tensorflow/core/util/bcast.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_BCAST_H_
-#define TENSORFLOW_UTIL_BCAST_H_
+#ifndef TENSORFLOW_CORE_UTIL_BCAST_H_
+#define TENSORFLOW_CORE_UTIL_BCAST_H_
 
 #include <algorithm>
 
@@ -132,4 +132,4 @@ class BCast {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_BCAST_H_
+#endif  // TENSORFLOW_CORE_UTIL_BCAST_H_
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index b281acb2b0..55f1e30880 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -32,7 +32,7 @@ bool ParseStringFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
   if (str_util::ConsumePrefix(&arg, "--") &&
       str_util::ConsumePrefix(&arg, flag) &&
       str_util::ConsumePrefix(&arg, "=")) {
-    *value_parsing_ok = hook(std::string(arg));
+    *value_parsing_ok = hook(string(arg));
     return true;
   }
 
diff --git a/tensorflow/core/util/ctc/ctc_beam_search.h b/tensorflow/core/util/ctc/ctc_beam_search.h
index aee647a1b3..5e2aeb7830 100644
--- a/tensorflow/core/util/ctc/ctc_beam_search.h
+++ b/tensorflow/core/util/ctc/ctc_beam_search.h
@@ -259,6 +259,16 @@ void CTCBeamSearchDecoder<CTCBeamState, CTCBeamComparer>::Step(
   } else {
     max_coeff = raw_input.maxCoeff();
   }
+
+  // Get normalization term of softmax: log(sum(exp(logit[j]-max_coeff))).
+  float logsumexp = 0.0;
+  for (int j = 0; j < raw_input.size(); ++j) {
+    logsumexp += Eigen::numext::exp(raw_input(j) - max_coeff);
+  }
+  logsumexp = Eigen::numext::log(logsumexp);
+  // Final normalization offset to get correct log probabilities.
+  float norm_offset = max_coeff + logsumexp;
+
   const float label_selection_input_min =
       (label_selection_margin_ >= 0) ? (max_coeff - label_selection_margin_)
                                      : -std::numeric_limits<float>::infinity();
@@ -290,10 +300,10 @@ void CTCBeamSearchDecoder<CTCBeamState, CTCBeamComparer>::Step(
                       beam_scorer_->GetStateExpansionScore(b->state, previous));
       }
       // Plabel(l=abc @ t=6) *= P(c @ 6)
-      b->newp.label += raw_input(b->label) - max_coeff;
+      b->newp.label += raw_input(b->label) - norm_offset;
     }
     // Pblank(l=abc @ t=6) = P(l=abc @ t=5) * P(- @ 6)
-    b->newp.blank = b->oldp.total + raw_input(blank_index_) - max_coeff;
+    b->newp.blank = b->oldp.total + raw_input(blank_index_) - norm_offset;
     // P(l=abc @ t=6) = Plabel(l=abc @ t=6) + Pblank(l=abc @ t=6)
     b->newp.total = LogSumExp(b->newp.blank, b->newp.label);
 
@@ -328,6 +338,8 @@ void CTCBeamSearchDecoder<CTCBeamState, CTCBeamComparer>::Step(
       const float logit = top_k ? top_k_logits[ind] : raw_input(ind);
       // Perform label selection: if input for this label looks very
       // unpromising, never evaluate it with a scorer.
+      // We may compare logits instead of log probabilities, 
+      // since the difference is the same in both cases.
       if (logit < label_selection_input_min) {
         continue;
       }
@@ -341,7 +353,7 @@ void CTCBeamSearchDecoder<CTCBeamState, CTCBeamComparer>::Step(
         //   Plabel(l=abcd @ t=6) = P(l=abc @ t=5) * P(d @ 6)
         beam_scorer_->ExpandState(b->state, b->label, &c.state, c.label);
         float previous = (c.label == b->label) ? b->oldp.blank : b->oldp.total;
-        c.newp.label = logit - max_coeff +
+        c.newp.label = logit - norm_offset +
                        beam_scorer_->GetStateExpansionScore(c.state, previous);
         // P(l=abcd @ t=6) = Plabel(l=abcd @ t=6)
         c.newp.total = c.newp.label;
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index 4071a70836..3f0bc60562 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_DEVICE_NAME_UTILS_H_
-#define TENSORFLOW_UTIL_DEVICE_NAME_UTILS_H_
+#ifndef TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_
+#define TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_
 
 #include <string>
 
@@ -173,4 +173,4 @@ class DeviceNameUtils {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_DEVICE_NAME_UTILS_H_
+#endif  // TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_
diff --git a/tensorflow/core/util/env_var.cc b/tensorflow/core/util/env_var.cc
index 8d43bcc927..2604a5d66a 100644
--- a/tensorflow/core/util/env_var.cc
+++ b/tensorflow/core/util/env_var.cc
@@ -28,7 +28,7 @@ namespace tensorflow {
 Status ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val,
                           bool* value) {
   *value = default_val;
-  const char* tf_env_var_val = getenv(std::string(env_var_name).c_str());
+  const char* tf_env_var_val = getenv(string(env_var_name).c_str());
   if (tf_env_var_val == nullptr) {
     return Status::OK();
   }
@@ -48,7 +48,7 @@ Status ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val,
 Status ReadInt64FromEnvVar(StringPiece env_var_name, int64 default_val,
                            int64* value) {
   *value = default_val;
-  const char* tf_env_var_val = getenv(std::string(env_var_name).c_str());
+  const char* tf_env_var_val = getenv(string(env_var_name).c_str());
   if (tf_env_var_val == nullptr) {
     return Status::OK();
   }
@@ -62,11 +62,11 @@ Status ReadInt64FromEnvVar(StringPiece env_var_name, int64 default_val,
 
 Status ReadStringFromEnvVar(StringPiece env_var_name, StringPiece default_val,
                             string* value) {
-  const char* tf_env_var_val = getenv(std::string(env_var_name).c_str());
+  const char* tf_env_var_val = getenv(string(env_var_name).c_str());
   if (tf_env_var_val != nullptr) {
     *value = tf_env_var_val;
   } else {
-    *value = std::string(default_val);
+    *value = string(default_val);
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/util/env_var.h b/tensorflow/core/util/env_var.h
index 47f9ff3a3b..724ca35729 100644
--- a/tensorflow/core/util/env_var.h
+++ b/tensorflow/core/util/env_var.h
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_ENV_VAR_H_
+#ifndef TENSORFLOW_CORE_UTIL_ENV_VAR_H_
+#define TENSORFLOW_CORE_UTIL_ENV_VAR_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -42,4 +43,4 @@ Status ReadStringFromEnvVar(StringPiece env_var_name, StringPiece default_val,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_ENV_VAR_H_
+#endif  // TENSORFLOW_CORE_UTIL_ENV_VAR_H_
diff --git a/tensorflow/core/util/events_writer.h b/tensorflow/core/util/events_writer.h
index 5dbaf97af4..d5952c3cbd 100644
--- a/tensorflow/core/util/events_writer.h
+++ b/tensorflow/core/util/events_writer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_EVENTS_WRITER_H_
-#define TENSORFLOW_UTIL_EVENTS_WRITER_H_
+#ifndef TENSORFLOW_CORE_UTIL_EVENTS_WRITER_H_
+#define TENSORFLOW_CORE_UTIL_EVENTS_WRITER_H_
 
 #include <memory>
 #include <string>
@@ -95,4 +95,4 @@ class EventsWriter {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_EVENTS_WRITER_H_
+#endif  // TENSORFLOW_CORE_UTIL_EVENTS_WRITER_H_
diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc
index 1fec0010a1..a38cd1d09f 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing.cc
@@ -353,7 +353,7 @@ bool TestFastParse(const string& serialized, Example* example) {
     // I.e. last entry in the map overwrites all the previous ones.
     parsed::FeatureMapEntry& name_and_feature =
         parsed_example[parsed_example_size - i - 1];
-    string name = std::string(name_and_feature.first);
+    string name(name_and_feature.first);
     if ((*features.mutable_feature()).count(name) > 0) continue;
 
     auto& value = (*features.mutable_feature())[name];
diff --git a/tensorflow/core/util/guarded_philox_random.h b/tensorflow/core/util/guarded_philox_random.h
index 44970eb949..8be7a374f0 100644
--- a/tensorflow/core/util/guarded_philox_random.h
+++ b/tensorflow/core/util/guarded_philox_random.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_KERNELS_GUARDED_PHILOX_RANDOM_H_
-#define TENSORFLOW_KERNELS_GUARDED_PHILOX_RANDOM_H_
+#ifndef TENSORFLOW_CORE_UTIL_GUARDED_PHILOX_RANDOM_H_
+#define TENSORFLOW_CORE_UTIL_GUARDED_PHILOX_RANDOM_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/random/philox_random.h"
@@ -79,4 +79,4 @@ class GuardedPhiloxRandom {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_GUARDED_PHILOX_RANDOM_H_
+#endif  // TENSORFLOW_CORE_UTIL_GUARDED_PHILOX_RANDOM_H_
diff --git a/tensorflow/core/util/mirror_pad_mode.h b/tensorflow/core/util/mirror_pad_mode.h
index f703d47ab1..ceee9b06b0 100644
--- a/tensorflow/core/util/mirror_pad_mode.h
+++ b/tensorflow/core/util/mirror_pad_mode.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_MIRROR_PAD_MODE_H_
-#define TENSORFLOW_UTIL_MIRROR_PAD_MODE_H_
+#ifndef TENSORFLOW_CORE_UTIL_MIRROR_PAD_MODE_H_
+#define TENSORFLOW_CORE_UTIL_MIRROR_PAD_MODE_H_
 
 // This file contains helper routines to deal with padding in various ops and
 // kernels.
@@ -49,4 +49,4 @@ Status GetNodeAttr(const NodeDef& node_def, StringPiece attr_name,
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_MIRROR_PAD_MODE_H_
+#endif  // TENSORFLOW_CORE_UTIL_MIRROR_PAD_MODE_H_
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 159a787d05..422be9356d 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -87,6 +87,16 @@ typedef enum {
   Dim_I = 1
 } MklDnnDims;
 
+typedef enum {
+  Dim3d_N = 0,
+  Dim3d_C = 1,
+  Dim3d_D = 2,
+  Dim3d_H = 3,
+  Dim3d_W = 4,
+  Dim3d_O = 0,
+  Dim3d_I = 1
+} MklDnnDims3D;
+
 #ifdef INTEL_MKL_ML_ONLY
 class MklShape {
  public:
@@ -351,6 +361,7 @@ class MklShape {
 #else
 
 // Forward decl
+TensorFormat MklDnn3DDataFormatToTFDataFormat(memory::format format);
 TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
 memory::dims CalculateTFStrides(const memory::dims& dims_tf_order);
 memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
@@ -453,6 +464,13 @@ class MklDnnShape {
     return this->DimSize(index);
   }
 
+  inline size_t GetDimension3D(char dimension) const {
+    int index = GetMklDnnTensor3DDimIndex(dimension);
+    CHECK(index >= 0 && index < this->GetDimension())
+        << "Invalid index from the dimension: " << index << ", " << dimension;
+    return this->DimSize(index);
+  }
+
   inline int32 GetMklDnnTensorDimIndex(char dimension) const {
     switch (dimension) {
       case 'N':
@@ -469,6 +487,24 @@ class MklDnnShape {
     }
   }
 
+  inline int32 GetMklDnnTensor3DDimIndex(char dimension) const {
+    switch (dimension) {
+      case 'N':
+        return MklDnnDims3D::Dim3d_N;
+      case 'C':
+        return MklDnnDims3D::Dim3d_C;
+      case 'D':
+        return MklDnnDims3D::Dim3d_D;
+      case 'H':
+        return MklDnnDims3D::Dim3d_H;
+      case 'W':
+        return MklDnnDims3D::Dim3d_W;
+      default:
+        LOG(FATAL) << "Invalid dimension: " << dimension;
+        return -1;  // Avoid compiler warning about missing return value
+    }
+  }
+
   inline size_t GetDimension() const { return data_.dimension_; }
   inline const int* GetSizes() const {
     return reinterpret_cast<const int*>(&data_.sizes_[0]);
@@ -587,13 +623,26 @@ class MklDnnShape {
   }
 
   inline void SetTfDimOrder(const size_t dimension, TensorFormat data_format) {
-    // TODO(nhasabni): Why do we restrict this to 4D?
-    CHECK_EQ(dimension, 4);
-    CHECK(dimension == data_.dimension_);
-    data_.map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDnnDims::Dim_W;
-    data_.map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDnnDims::Dim_H;
-    data_.map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDnnDims::Dim_C;
-    data_.map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDnnDims::Dim_N;
+    if (dimension == 5) {
+      CHECK(dimension == data_.dimension_);
+      data_.map_[GetTensorDimIndex<3>(data_format, '0')] =
+          MklDnnDims3D::Dim3d_D;
+      data_.map_[GetTensorDimIndex<3>(data_format, '1')] =
+          MklDnnDims3D::Dim3d_H;
+      data_.map_[GetTensorDimIndex<3>(data_format, '2')] =
+          MklDnnDims3D::Dim3d_W;
+      data_.map_[GetTensorDimIndex<3>(data_format, 'C')] =
+          MklDnnDims3D::Dim3d_C;
+      data_.map_[GetTensorDimIndex<3>(data_format, 'N')] =
+          MklDnnDims3D::Dim3d_N;
+    } else {
+      CHECK_EQ(dimension, 4);
+      CHECK(dimension == data_.dimension_);
+      data_.map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDnnDims::Dim_W;
+      data_.map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDnnDims::Dim_H;
+      data_.map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDnnDims::Dim_C;
+      data_.map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDnnDims::Dim_N;
+    }
   }
 
   inline void SetTfDimOrder(const size_t dimension, memory::format format) {
@@ -1329,6 +1378,19 @@ memory::data_type MklDnnType<float>() {
   return memory::data_type::f32;
 }
 
+/// Map TensorFlow's data format into MKL-DNN 3D data format
+/// @input: TensorFlow data format
+/// @return: memory::format corresponding to TensorFlow data format;
+///          Fails with an error if invalid data format.
+inline memory::format TFDataFormatToMklDnn3DDataFormat(TensorFormat format) {
+  if (format == FORMAT_NHWC)
+    return memory::format::ndhwc;
+  else if (format == FORMAT_NCHW)
+    return memory::format::ncdhw;
+  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
+  return memory::format::format_undef;
+}
+
 /// Map TensorFlow's data format into MKL-DNN data format
 ///
 /// @input: TensorFlow data format
@@ -1340,7 +1402,6 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
   else if (format == FORMAT_NCHW)
     return memory::format::nchw;
   TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
-  // Return to get rid of compiler warning
   return memory::format::format_undef;
 }
 
@@ -1350,9 +1411,9 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
 /// @return: Tensorflow data format corresponding to memory::format
 ///          Fails with an error if invalid data format.
 inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
-  if (format == memory::format::nhwc)
+  if (format == memory::format::nhwc || format == memory::format::ndhwc)
     return FORMAT_NHWC;
-  else if (format == memory::format::nchw)
+  else if (format == memory::format::nchw || format == memory::format::ncdhw)
     return FORMAT_NCHW;
   TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
 
@@ -1402,6 +1463,22 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape,
   return memory::dims({n, c, h, w});
 }
 
+inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape,
+                                               TensorFormat format) {
+  // Check validity of format.
+  CHECK_NE(TFDataFormatToMklDnn3DDataFormat(format),
+           memory::format::format_undef);
+
+  int n = shape.dim_size(GetTensorDimIndex<3>(format, 'N'));
+  int c = shape.dim_size(GetTensorDimIndex<3>(format, 'C'));
+  int d = shape.dim_size(GetTensorDimIndex<3>(format, '0'));
+  int h = shape.dim_size(GetTensorDimIndex<3>(format, '1'));
+  int w = shape.dim_size(GetTensorDimIndex<3>(format, '2'));
+
+  // MKL-DNN requires dimensions in NCDHW format.
+  return memory::dims({n, c, d, h, w});
+}
+
 /// Overloaded version of function above. Input parameters are
 /// self-explanatory.
 inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims,
@@ -1514,6 +1591,8 @@ class MklDnnData {
 
   /// Operations memory descriptor
   memory::desc* op_md_;
+  // flat to indicate if data is 3D or not.
+  bool bIs3D;
   /// Operations temp buffer
   void* allocated_buffer_;
   /// CPU engine on which operation will be executed
@@ -1540,6 +1619,10 @@ class MklDnnData {
         static_cast<const void*>(tensor->flat<T>().data()));
   }
 
+  void SetIs3DData(bool bIs3D_) { bIs3D = bIs3D_; }
+
+  bool GetIs3D() { return bIs3D; }
+
   /// Set user memory primitive using specified dimensions, memory format and
   /// data_buffer. Function automatically uses element data type by using
   /// input type T used for creating call object.
diff --git a/tensorflow/core/util/padding.h b/tensorflow/core/util/padding.h
index a4278ff2b4..76f9b4dd9a 100644
--- a/tensorflow/core/util/padding.h
+++ b/tensorflow/core/util/padding.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_PADDING_H_
-#define TENSORFLOW_UTIL_PADDING_H_
+#ifndef TENSORFLOW_CORE_UTIL_PADDING_H_
+#define TENSORFLOW_CORE_UTIL_PADDING_H_
 
 // This file contains helper routines to deal with padding in various ops and
 // kernels.
@@ -50,4 +50,4 @@ Status GetNodeAttr(const NodeDef& node_def, StringPiece attr_name,
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_PADDING_H_
+#endif  // TENSORFLOW_CORE_UTIL_PADDING_H_
diff --git a/tensorflow/core/util/port.h b/tensorflow/core/util/port.h
index 981def9d22..e9b9cb1cd2 100644
--- a/tensorflow/core/util/port.h
+++ b/tensorflow/core/util/port.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_PORT_H_
-#define TENSORFLOW_UTIL_PORT_H_
+#ifndef TENSORFLOW_CORE_UTIL_PORT_H_
+#define TENSORFLOW_CORE_UTIL_PORT_H_
 
 namespace tensorflow {
 
@@ -30,4 +30,4 @@ bool IsMklEnabled();
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_PORT_H_
+#endif  // TENSORFLOW_CORE_UTIL_PORT_H_
diff --git a/tensorflow/core/util/saved_tensor_slice_util.h b/tensorflow/core/util/saved_tensor_slice_util.h
index 90672a10a8..7c9cfa35f7 100644
--- a/tensorflow/core/util/saved_tensor_slice_util.h
+++ b/tensorflow/core/util/saved_tensor_slice_util.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Utilities for saving/restoring tensor slice checkpoints.
 
-#ifndef TENSORFLOW_UTIL_SAVED_TENSOR_SLICE_UTIL_H_
-#define TENSORFLOW_UTIL_SAVED_TENSOR_SLICE_UTIL_H_
+#ifndef TENSORFLOW_CORE_UTIL_SAVED_TENSOR_SLICE_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_SAVED_TENSOR_SLICE_UTIL_H_
 
 #include <string>  // for string
 #include "tensorflow/core/framework/tensor.pb.h"
@@ -210,4 +210,4 @@ inline void Fill(const string* data, size_t n, TensorProto* t) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_SAVED_TENSOR_SLICE_UTIL_H_
+#endif  // TENSORFLOW_CORE_UTIL_SAVED_TENSOR_SLICE_UTIL_H_
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index aca60b942d..ad8a44a518 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -326,7 +326,7 @@ Status ValidateStridedSliceOp(
       // Even if we don't have values for begin or end, we do know that this
       // dimension covers the whole interval. If we have shape information for
       // this dimension, that tells us the interval length.
-      if (dim_i > 0) {
+      if (dim_i >= 0) {
         if (stride_i < 0) {
           interval_length = -dim_i;
         } else {
diff --git a/tensorflow/core/util/tensor_bundle/naming.h b/tensorflow/core/util/tensor_bundle/naming.h
index 3d21570c74..6539d565e2 100644
--- a/tensorflow/core/util/tensor_bundle/naming.h
+++ b/tensorflow/core/util/tensor_bundle/naming.h
@@ -31,8 +31,8 @@ limitations under the License.
 //
 // Regexp can also be used: e.g. R"<prefix>.data-\d{5}-of-\d{5}" for data files.
 
-#ifndef TENSORFLOW_UTIL_TENSOR_BUNDLE_NAMING_H_
-#define TENSORFLOW_UTIL_TENSOR_BUNDLE_NAMING_H_
+#ifndef TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_NAMING_H_
+#define TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_NAMING_H_
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 
@@ -43,4 +43,4 @@ string DataFilename(StringPiece prefix, int32 shard_id, int32 num_shards);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_TENSOR_BUNDLE_NAMING_H_
+#endif  // TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_NAMING_H_
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
index d30ce3f0cf..3a2ffbb495 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.h
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
@@ -58,8 +58,8 @@ limitations under the License.
 //       "/fs/model/train/ckpt-step/ckpt" /* merged prefix */);
 //
 
-#ifndef TENSORFLOW_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
-#define TENSORFLOW_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
+#ifndef TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
+#define TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
 
 #include "tensorflow/core/protobuf/tensor_bundle.pb.h"
 
@@ -346,4 +346,4 @@ class FileOutputBuffer {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
+#endif  // TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_TENSOR_BUNDLE_H_
diff --git a/tensorflow/core/util/tensor_format.cc b/tensorflow/core/util/tensor_format.cc
index a5f7ecf0d1..f331973f5c 100644
--- a/tensorflow/core/util/tensor_format.cc
+++ b/tensorflow/core/util/tensor_format.cc
@@ -25,6 +25,10 @@ string GetConvnet3dDataFormatAttrString() {
   return "data_format: { 'NDHWC', 'NCDHW' } = 'NDHWC' ";
 }
 
+string GetConvnetDataFormat2D3DAttrString() {
+  return "data_format: { 'NHWC', 'NCHW', 'NDHWC', 'NCDHW' } = 'NHWC' ";
+}
+
 string GetConvnetFilterFormatAttrString() {
   return "filter_format: { 'HWIO', 'OIHW' } = 'HWIO' ";
 }
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index 918835e1fb..b0c349dd90 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -483,6 +483,7 @@ string GetConvnet3dDataFormatAttrString();
 // Return the string that specifies the filter format for convnet operations.
 string GetConvnetFilterFormatAttrString();
 string GetConvnet3dFilterFormatAttrString();
+string GetConvnetDataFormat2D3DAttrString();
 
 // Returns a tensor shape for the specified format and dimension sizes.
 // Works for both 2D and 3D operations. The output shapes are as follows:
diff --git a/tensorflow/core/util/tensor_slice_reader.h b/tensorflow/core/util/tensor_slice_reader.h
index 263f56c7fc..4aa9a4708e 100644
--- a/tensorflow/core/util/tensor_slice_reader.h
+++ b/tensorflow/core/util/tensor_slice_reader.h
@@ -16,8 +16,8 @@ limitations under the License.
 // The utility to read checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
 
-#ifndef TENSORFLOW_UTIL_TENSOR_SLICE_READER_H_
-#define TENSORFLOW_UTIL_TENSOR_SLICE_READER_H_
+#ifndef TENSORFLOW_CORE_UTIL_TENSOR_SLICE_READER_H_
+#define TENSORFLOW_CORE_UTIL_TENSOR_SLICE_READER_H_
 
 #include <unordered_map>
 
@@ -192,4 +192,4 @@ bool TensorSliceReader::CopySliceData(const string& name,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_TENSOR_SLICE_READER_H_
+#endif  // TENSORFLOW_CORE_UTIL_TENSOR_SLICE_READER_H_
diff --git a/tensorflow/core/util/tensor_slice_reader_cache.h b/tensorflow/core/util/tensor_slice_reader_cache.h
index 63a8d0b068..9f1919df4e 100644
--- a/tensorflow/core/util/tensor_slice_reader_cache.h
+++ b/tensorflow/core/util/tensor_slice_reader_cache.h
@@ -16,8 +16,8 @@ limitations under the License.
 // The utility to read checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
 
-#ifndef TENSORFLOW_UTIL_TENSOR_SLICE_READER_CACHE_H_
-#define TENSORFLOW_UTIL_TENSOR_SLICE_READER_CACHE_H_
+#ifndef TENSORFLOW_CORE_UTIL_TENSOR_SLICE_READER_CACHE_H_
+#define TENSORFLOW_CORE_UTIL_TENSOR_SLICE_READER_CACHE_H_
 
 #include <unordered_map>
 
@@ -85,4 +85,4 @@ class TensorSliceReaderCache {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_TENSOR_SLICE_READER_CACHE_H_
+#endif  // TENSORFLOW_CORE_UTIL_TENSOR_SLICE_READER_CACHE_H_
diff --git a/tensorflow/core/util/tensor_slice_writer.h b/tensorflow/core/util/tensor_slice_writer.h
index 2888c66d10..0db2fb4804 100644
--- a/tensorflow/core/util/tensor_slice_writer.h
+++ b/tensorflow/core/util/tensor_slice_writer.h
@@ -16,8 +16,8 @@ limitations under the License.
 // The utility to write checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
 
-#ifndef TENSORFLOW_UTIL_TENSOR_SLICE_WRITER_H_
-#define TENSORFLOW_UTIL_TENSOR_SLICE_WRITER_H_
+#ifndef TENSORFLOW_CORE_UTIL_TENSOR_SLICE_WRITER_H_
+#define TENSORFLOW_CORE_UTIL_TENSOR_SLICE_WRITER_H_
 
 #include <unordered_map>
 
@@ -192,4 +192,4 @@ Status CreateTableTensorSliceBuilder(const string& filename,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_TENSOR_SLICE_WRITER_H_
+#endif  // TENSORFLOW_CORE_UTIL_TENSOR_SLICE_WRITER_H_
diff --git a/tensorflow/core/util/util.h b/tensorflow/core/util/util.h
index 4adf2f14dc..93dfd51ab5 100644
--- a/tensorflow/core/util/util.h
+++ b/tensorflow/core/util/util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_UTIL_H_
-#define TENSORFLOW_UTIL_UTIL_H_
+#ifndef TENSORFLOW_CORE_UTIL_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_UTIL_H_
 
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -58,4 +58,4 @@ string SliceDebugString(const TensorShape& shape, const int64 flat);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_UTIL_H_
+#endif  // TENSORFLOW_CORE_UTIL_UTIL_H_
diff --git a/tensorflow/core/util/work_sharder.h b/tensorflow/core/util/work_sharder.h
index 72ce493c1b..b12c31c1ae 100644
--- a/tensorflow/core/util/work_sharder.h
+++ b/tensorflow/core/util/work_sharder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_WORK_SHARDER_H_
-#define TENSORFLOW_UTIL_WORK_SHARDER_H_
+#ifndef TENSORFLOW_CORE_UTIL_WORK_SHARDER_H_
+#define TENSORFLOW_CORE_UTIL_WORK_SHARDER_H_
 
 #include <functional>
 
@@ -95,4 +95,4 @@ class Sharder {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_WORK_SHARDER_H_
+#endif  // TENSORFLOW_CORE_UTIL_WORK_SHARDER_H_
diff --git a/tensorflow/docs_src/about/index.md b/tensorflow/docs_src/about/index.md
index dc1e9af876..c3c13ff329 100644
--- a/tensorflow/docs_src/about/index.md
+++ b/tensorflow/docs_src/about/index.md
@@ -3,9 +3,9 @@
 This section provides a few documents about TensorFlow itself,
 including the following:
 
-  * @{$uses$TensorFlow in Use}, which provides a link to our model zoo and
+  * [TensorFlow in Use](../about/uses.md), which provides a link to our model zoo and
     lists some popular ways that TensorFlow is being used.
-  * @{$bib$TensorFlow White Papers}, which provides abstracts of white papers
+  * [TensorFlow White Papers](../about/bib.md), which provides abstracts of white papers
     about TensorFlow.
-  * @{$attribution$Attribution}, which specifies how to attribute and refer
+  * [Attribution](../about/attribution.md), which specifies how to attribute and refer
     to TensorFlow.
diff --git a/tensorflow/docs_src/api_guides/python/client.md b/tensorflow/docs_src/api_guides/python/client.md
index 56367e6671..fdd48e66dc 100644
--- a/tensorflow/docs_src/api_guides/python/client.md
+++ b/tensorflow/docs_src/api_guides/python/client.md
@@ -3,7 +3,7 @@
 
 This library contains classes for launching graphs and executing operations.
 
-@{$guide/low_level_intro$This guide} has examples of how a graph
+[This guide](../../guide/low_level_intro.md) has examples of how a graph
 is launched in a `tf.Session`.
 
 ## Session management
diff --git a/tensorflow/docs_src/api_guides/python/constant_op.md b/tensorflow/docs_src/api_guides/python/constant_op.md
index 498ec3db5d..9ba95b0f55 100644
--- a/tensorflow/docs_src/api_guides/python/constant_op.md
+++ b/tensorflow/docs_src/api_guides/python/constant_op.md
@@ -64,7 +64,7 @@ print(sess.run(norm))
 ```
 
 Another common use of random values is the initialization of variables. Also see
-the @{$variables$Variables How To}.
+the [Variables How To](../../guide/variables.md).
 
 ```python
 # Use random uniform values in [0, 1) as the initializer for a variable of shape
diff --git a/tensorflow/docs_src/api_guides/python/input_dataset.md b/tensorflow/docs_src/api_guides/python/input_dataset.md
index ab572e53d4..911a76c2df 100644
--- a/tensorflow/docs_src/api_guides/python/input_dataset.md
+++ b/tensorflow/docs_src/api_guides/python/input_dataset.md
@@ -2,7 +2,7 @@
 [TOC]
 
 `tf.data.Dataset` allows you to build complex input pipelines. See the
-@{$guide/datasets} for an in-depth explanation of how to use this API.
+[Importing Data](../../guide/datasets.md) for an in-depth explanation of how to use this API.
 
 ## Reader classes
 
diff --git a/tensorflow/docs_src/api_guides/python/io_ops.md b/tensorflow/docs_src/api_guides/python/io_ops.md
index ab3c70daa0..d7ce6fdfde 100644
--- a/tensorflow/docs_src/api_guides/python/io_ops.md
+++ b/tensorflow/docs_src/api_guides/python/io_ops.md
@@ -8,7 +8,7 @@ Note: Functions taking `Tensor` arguments can also take anything accepted by
 ## Placeholders
 
 TensorFlow provides a placeholder operation that must be fed with data
-on execution.  For more info, see the section on @{$reading_data#Feeding$Feeding data}.
+on execution.  For more info, see the section on [Feeding data](../../api_guides/python/reading_data.md#Feeding).
 
 *   `tf.placeholder`
 *   `tf.placeholder_with_default`
@@ -21,7 +21,7 @@ there is a convenience function:
 ## Readers
 
 TensorFlow provides a set of Reader classes for reading data formats.
-For more information on inputs and readers, see @{$reading_data$Reading data}.
+For more information on inputs and readers, see [Reading data](../../api_guides/python/reading_data.md).
 
 *   `tf.ReaderBase`
 *   `tf.TextLineReader`
@@ -42,7 +42,7 @@ formats into tensors.
 
 ### Example protocol buffer
 
-TensorFlow's @{$reading_data#standard_tensorflow_format$recommended format for training examples}
+TensorFlow's [recommended format for training examples](../../api_guides/python/reading_data.md#standard_tensorflow_format)
 is serialized `Example` protocol buffers, [described
 here](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 They contain `Features`, [described
@@ -62,7 +62,7 @@ here](https://www.tensorflow.org/code/tensorflow/core/example/feature.proto).
 TensorFlow provides several implementations of 'Queues', which are
 structures within the TensorFlow computation graph to stage pipelines
 of tensors together. The following describe the basic Queue interface
-and some implementations.  To see an example use, see @{$threading_and_queues$Threading and Queues}.
+and some implementations.  To see an example use, see [Threading and Queues](../../api_guides/python/threading_and_queues.md).
 
 *   `tf.QueueBase`
 *   `tf.FIFOQueue`
@@ -85,7 +85,7 @@ and some implementations.  To see an example use, see @{$threading_and_queues$Th
 ## Input pipeline
 
 TensorFlow functions for setting up an input-prefetching pipeline.
-Please see the @{$reading_data$reading data how-to}
+Please see the [reading data how-to](../../api_guides/python/reading_data.md)
 for context.
 
 ### Beginning of an input pipeline
diff --git a/tensorflow/docs_src/api_guides/python/math_ops.md b/tensorflow/docs_src/api_guides/python/math_ops.md
index e738161e49..6ec18f48ef 100644
--- a/tensorflow/docs_src/api_guides/python/math_ops.md
+++ b/tensorflow/docs_src/api_guides/python/math_ops.md
@@ -24,6 +24,7 @@ operators to your graph.
 *   `tf.realdiv`
 *   `tf.truncatediv`
 *   `tf.floor_div`
+*   `tf.div_no_nan`
 *   `tf.truncatemod`
 *   `tf.floormod`
 *   `tf.mod`
diff --git a/tensorflow/docs_src/api_guides/python/meta_graph.md b/tensorflow/docs_src/api_guides/python/meta_graph.md
index 7dbd9a56f4..5e8a8b4d0f 100644
--- a/tensorflow/docs_src/api_guides/python/meta_graph.md
+++ b/tensorflow/docs_src/api_guides/python/meta_graph.md
@@ -23,7 +23,7 @@ protocol buffer. It contains the following fields:
 * [`SaverDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/saver.proto) for the saver.
 * [`CollectionDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto)
 map that further describes additional components of the model such as
-@{$python/state_ops$`Variables`},
+[`Variables`](../../api_guides/python/state_ops.md),
 `tf.train.QueueRunner`, etc.
 
 In order for a Python object to be serialized
diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md
index 78c36d965c..9f555ee85d 100644
--- a/tensorflow/docs_src/api_guides/python/reading_data.md
+++ b/tensorflow/docs_src/api_guides/python/reading_data.md
@@ -1,7 +1,7 @@
 # Reading data
 
 Note: The preferred way to feed data into a tensorflow program is using the
-@{$datasets$`tf.data` API}.
+[`tf.data` API](../../guide/datasets.md).
 
 There are four methods of getting data into a TensorFlow program:
 
@@ -16,7 +16,7 @@ There are four methods of getting data into a TensorFlow program:
 
 ## `tf.data` API
 
-See the @{$guide/datasets} for an in-depth explanation of `tf.data.Dataset`.
+See the [Importing Data](../../guide/datasets.md) for an in-depth explanation of `tf.data.Dataset`.
 The `tf.data` API enables you to extract and preprocess data
 from different input/file formats, and apply transformations such as batching,
 shuffling, and mapping functions over the dataset. This is an improved version
@@ -56,8 +56,8 @@ in
 ## `QueueRunner`
 
 Warning: This section discusses implementing input pipelines using the
-queue-based APIs which can be cleanly replaced by the @{$datasets$`tf.data`
-API}.
+queue-based APIs which can be cleanly replaced by the [`tf.data`
+API](../../guide/datasets.md).
 
 A typical queue-based pipeline for reading records from files has the following stages:
 
@@ -154,14 +154,14 @@ a uint8 tensor, standard operations can slice out each piece and reformat as
 needed. For CIFAR-10, you can see how to do the reading and decoding in
 [`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py)
 and described in
-@{$deep_cnn#prepare-the-data$this tutorial}.
+[this tutorial](../../tutorials/images/deep_cnn.md#prepare-the-data).
 
 #### Standard TensorFlow format
 
 Another approach is to convert whatever data you have into a supported format.
 This approach makes it easier to mix and match data sets and network
 architectures. The recommended format for TensorFlow is a
-@{$python/python_io#tfrecords_format_details$TFRecords file}
+[TFRecords file](../../api_guides/python/python_io.md#tfrecords_format_details)
 containing
 [`tf.train.Example` protocol buffers](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
 (which contain
@@ -279,7 +279,7 @@ This can be important:
 How many threads do you need? the `tf.train.shuffle_batch*` functions add a
 summary to the graph that indicates how full the example queue is. If you have
 enough reading threads, that summary will stay above zero.  You can
-@{$summaries_and_tensorboard$view your summaries as training progresses using TensorBoard}.
+[view your summaries as training progresses using TensorBoard](../../guide/summaries_and_tensorboard.md).
 
 ### Creating threads to prefetch using `QueueRunner` objects
 
@@ -368,7 +368,7 @@ threads got an error when running some operation (or an ordinary Python
 exception).
 
 For more about threading, queues, QueueRunners, and Coordinators
-@{$threading_and_queues$see here}.
+[see here](../../api_guides/python/threading_and_queues.md).
 
 #### Aside: How clean shut-down when limiting epochs works
 
@@ -501,18 +501,18 @@ sessions, maybe in separate processes:
   model that reads validation input data.
 
 This is what is done `tf.estimator` and manually in
-@{$deep_cnn#save-and-restore-checkpoints$the example CIFAR-10 model}.
+[the example CIFAR-10 model](../../tutorials/images/deep_cnn.md#save-and-restore-checkpoints).
 This has a couple of benefits:
 
 * The eval is performed on a single snapshot of the trained variables.
 * You can perform the eval even after training has completed and exited.
 
 You can have the train and eval in the same graph in the same process, and share
-their trained variables or layers. See @{$variables$the shared variables tutorial}.
+their trained variables or layers. See [the shared variables tutorial](../../guide/variables.md).
 
 To support the single-graph approach
-@{$guide/datasets$`tf.data`} also supplies
-@{$guide/datasets#creating_an_iterator$advanced iterator types} that
+[`tf.data`](../../guide/datasets.md) also supplies
+[advanced iterator types](../../guide/datasets.md#creating_an_iterator) that
 that allow the user to change the input pipeline without rebuilding the graph or
 session.
 
diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md
index f8abbf0f97..d67f38f57a 100644
--- a/tensorflow/docs_src/api_guides/python/regression_examples.md
+++ b/tensorflow/docs_src/api_guides/python/regression_examples.md
@@ -66,7 +66,7 @@ watch the following video:
 <a name="running"></a>
 ## Running the examples
 
-You must @{$install$install TensorFlow} prior to running these examples.
+You must [install TensorFlow](../../install/index.md) prior to running these examples.
 Depending on the way you've installed TensorFlow, you might also
 need to activate your TensorFlow environment.  Then, do the following:
 
diff --git a/tensorflow/docs_src/api_guides/python/summary.md b/tensorflow/docs_src/api_guides/python/summary.md
index e290703b7d..fc45e7b4c3 100644
--- a/tensorflow/docs_src/api_guides/python/summary.md
+++ b/tensorflow/docs_src/api_guides/python/summary.md
@@ -2,7 +2,7 @@
 [TOC]
 
 Summaries provide a way to export condensed information about a model, which is
-then accessible in tools such as @{$summaries_and_tensorboard$TensorBoard}.
+then accessible in tools such as [TensorBoard](../../guide/summaries_and_tensorboard.md).
 
 ## Generation of Summaries
 
diff --git a/tensorflow/docs_src/api_guides/python/threading_and_queues.md b/tensorflow/docs_src/api_guides/python/threading_and_queues.md
index 48f0778b73..e00f17f955 100644
--- a/tensorflow/docs_src/api_guides/python/threading_and_queues.md
+++ b/tensorflow/docs_src/api_guides/python/threading_and_queues.md
@@ -3,7 +3,7 @@
 Note: In versions of TensorFlow before 1.2, we recommended using multi-threaded,
 queue-based input pipelines for performance. Beginning with TensorFlow 1.4,
 however, we recommend using the `tf.data` module instead. (See
-@{$datasets$Datasets} for details. In TensorFlow 1.2 and 1.3, the module was
+[Datasets](../../guide/datasets.md) for details. In TensorFlow 1.2 and 1.3, the module was
 called `tf.contrib.data`.) The `tf.data` module offers an easier-to-use
 interface for constructing efficient input pipelines. Furthermore, we've stopped
 developing the old multi-threaded, queue-based input pipelines.  We've retained
diff --git a/tensorflow/docs_src/api_guides/python/train.md b/tensorflow/docs_src/api_guides/python/train.md
index a118123665..4b4c6a4fe3 100644
--- a/tensorflow/docs_src/api_guides/python/train.md
+++ b/tensorflow/docs_src/api_guides/python/train.md
@@ -74,9 +74,9 @@ moving averages for evaluations often improve results significantly.
 
 ## Coordinator and QueueRunner
 
-See @{$threading_and_queues$Threading and Queues}
+See [Threading and Queues](../../api_guides/python/threading_and_queues.md)
 for how to use threads and queues.  For documentation on the Queue API,
-see @{$python/io_ops#queues$Queues}.
+see [Queues](../../api_guides/python/io_ops.md#queues).
 
 
 *   `tf.train.Coordinator`
@@ -87,7 +87,7 @@ see @{$python/io_ops#queues$Queues}.
 
 ## Distributed execution
 
-See @{$distributed$Distributed TensorFlow} for
+See [Distributed TensorFlow](../../deploy/distributed.md) for
 more information about how to configure a distributed TensorFlow program.
 
 *   `tf.train.Server`
@@ -105,7 +105,7 @@ more information about how to configure a distributed TensorFlow program.
 
 ## Reading Summaries from Event Files
 
-See @{$summaries_and_tensorboard$Summaries and TensorBoard} for an
+See [Summaries and TensorBoard](../../guide/summaries_and_tensorboard.md) for an
 overview of summaries, event files, and visualization in TensorBoard.
 
 *   `tf.train.summary_iterator`
diff --git a/tensorflow/docs_src/community/contributing.md b/tensorflow/docs_src/community/contributing.md
index afbb8bbdd0..ece4a7c70b 100644
--- a/tensorflow/docs_src/community/contributing.md
+++ b/tensorflow/docs_src/community/contributing.md
@@ -25,12 +25,12 @@ guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md
 [developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/d/forum/developers)
 mailing list, to coordinate and discuss with others contributing to TensorFlow.
 
-* For coding style conventions, read the @{$style_guide$TensorFlow Style Guide}.
+* For coding style conventions, read the [TensorFlow Style Guide](../community/style_guide.md).
 
-* Finally, review @{$documentation$Writing TensorFlow Documentation}, which
+* Finally, review [Writing TensorFlow Documentation](../community/documentation.md), which
   explains documentation conventions.
 
-You may also wish to review our guide to @{$benchmarks$defining and running benchmarks}.
+You may also wish to review our guide to [defining and running benchmarks](../community/benchmarks.md).
 
 ## Special Interest Groups
 
diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md
index 0aa8e7612a..1a30be32a5 100644
--- a/tensorflow/docs_src/community/index.md
+++ b/tensorflow/docs_src/community/index.md
@@ -25,10 +25,10 @@ the appropriate repository for the project. Major repositories include:
   
 ### Security
 
-Before using TensorFlow, please take a look at our security model, list of
-recent security announcements, and ways you can report security issues to the
-TensorFlow team at the
-[Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub.
+Before using TensorFlow, please take a look at our [security model](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md#tensorflow-models-are-programs),
+[list of recent security advisories and announcements](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/index.md),
+and [ways you can report security issues](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md#reporting-vulnerabilities)
+to the TensorFlow team at the [Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub.
 
 ## Stay Informed
 
@@ -40,7 +40,7 @@ We recommend that you join this list if you depend on TensorFlow in any way.
 
 ### Development Roadmap
 
-The @{$roadmap$Roadmap} summarizes plans for upcoming additions to TensorFlow.
+The [Roadmap](../community/roadmap.md) summarizes plans for upcoming additions to TensorFlow.
 
 ### Social Media
 
@@ -70,12 +70,12 @@ the [TensorFlow discuss mailing
 list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
 
 A number of other mailing lists exist, focused on different project areas, which
-can be found at @{$lists$TensorFlow Mailing Lists}.
+can be found at [TensorFlow Mailing Lists](../community/lists.md).
 
 ### User Groups
 
 To meet with like-minded people local to you, check out the many
-@{$groups$TensorFlow user groups} around the world.
+[TensorFlow user groups](../community/groups.md) around the world.
 
 
 ## Contributing To TensorFlow
diff --git a/tensorflow/docs_src/community/roadmap.md b/tensorflow/docs_src/community/roadmap.md
index 0463ca05fe..d11b6ed467 100644
--- a/tensorflow/docs_src/community/roadmap.md
+++ b/tensorflow/docs_src/community/roadmap.md
@@ -58,10 +58,12 @@ across image recognition, speech, object detection, and
 * Increase support for devices beyond Android and iOS (eg. RPi, Cortex-M)
 
 #### TensorFlow.js:
-* Release package for Node.js bindings to the TensorFlow C API through the TensorFlow.js backend interface
-* Expand support for importing TensorFlow SavedModels and Keras models into browser with unified APIs supporting retraining in browser
-* Improve Layers API and allow model exporting/saving
+* Continue to expand support for importing TensorFlow SavedModels and Keras models into browser with unified APIs supporting retraining in browser
+* Improve inference and training performance in both browser and Node.js environments
+* Widen the collection of pre-built models in [tfjs-models](https://github.com/tensorflow/tfjs-models),
+  including but not limited to audio- and speech-oriented models
 * Release tfjs-data API for efficient data input pipelines
+* Integration with [TF-Hub](https://www.tensorflow.org/hub/)
 
 #### TensorFlow with Swift:
 * Establish open source project including documentation, open design, and code availability.
diff --git a/tensorflow/docs_src/community/style_guide.md b/tensorflow/docs_src/community/style_guide.md
index daf0d2fdc0..c78da20edd 100644
--- a/tensorflow/docs_src/community/style_guide.md
+++ b/tensorflow/docs_src/community/style_guide.md
@@ -88,7 +88,7 @@ creates a part of the graph and returns output tensors.
 * Operations should contain an extensive Python comment with Args and Returns
  declarations that explain both the type and meaning of each value. Possible
  shapes, dtypes, or ranks should be specified in the description.
- @{$documentation$See documentation details}
+ [See documentation details](../community/documentation.md)
 
 * For increased usability include an example of usage with inputs / outputs
  of the op in Example section.
diff --git a/tensorflow/docs_src/deploy/distributed.md b/tensorflow/docs_src/deploy/distributed.md
index 6a760f53c8..2fba36cfa7 100644
--- a/tensorflow/docs_src/deploy/distributed.md
+++ b/tensorflow/docs_src/deploy/distributed.md
@@ -2,7 +2,7 @@
 
 This document shows how to create a cluster of TensorFlow servers, and how to
 distribute a computation graph across that cluster. We assume that you are
-familiar with the @{$guide/low_level_intro$basic concepts} of
+familiar with the [basic concepts](../guide/low_level_intro.md) of
 writing low level TensorFlow programs.
 
 ## Hello distributed TensorFlow!
diff --git a/tensorflow/docs_src/deploy/hadoop.md b/tensorflow/docs_src/deploy/hadoop.md
index c4471562b9..b0d416df2e 100644
--- a/tensorflow/docs_src/deploy/hadoop.md
+++ b/tensorflow/docs_src/deploy/hadoop.md
@@ -6,7 +6,7 @@ at the moment.
 
 ## HDFS
 
-We assume that you are familiar with @{$reading_data$reading data}.
+We assume that you are familiar with [reading data](../api_guides/python/reading_data.md).
 
 To use HDFS with TensorFlow, change the file paths you use to read and write
 data to an HDFS path. For example:
@@ -61,5 +61,5 @@ be set:
     export KRB5CCNAME=/tmp/krb5cc_10002
     ```
 
-If you are running @{$distributed$Distributed TensorFlow}, then all
+If you are running [Distributed TensorFlow](../deploy/distributed.md), then all
 workers must have the environment variables set and Hadoop installed.
diff --git a/tensorflow/docs_src/deploy/index.md b/tensorflow/docs_src/deploy/index.md
index 3322004189..08b28de639 100644
--- a/tensorflow/docs_src/deploy/index.md
+++ b/tensorflow/docs_src/deploy/index.md
@@ -3,11 +3,11 @@
 This section focuses on deploying real-world models.  It contains
 the following documents:
 
-  * @{$distributed$Distributed TensorFlow}, which explains how to create
+  * [Distributed TensorFlow](../deploy/distributed.md), which explains how to create
     a cluster of TensorFlow servers.
-  * @{$hadoop$How to run TensorFlow on Hadoop}, which has a highly
+  * [How to run TensorFlow on Hadoop](../deploy/hadoop.md), which has a highly
     self-explanatory title.
-  * @{$s3$How to run TensorFlow with the S3 filesystem}, which explains how
+  * [How to run TensorFlow with the S3 filesystem](../deploy/s3.md), which explains how
     to run TensorFlow with the S3 file system.
   * The entire document set for [TensorFlow serving](/serving), an open-source,
     flexible, high-performance serving system for machine-learned models
diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
index 079c796aa7..b4a759d687 100644
--- a/tensorflow/docs_src/deploy/s3.md
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -64,7 +64,7 @@ You should see output similar to this:
 
 ### Reading Data
 
-When @{$reading_data$reading data}, change the file paths you use to read and write
+When [reading data](../api_guides/python/reading_data.md), change the file paths you use to read and write
 data to an S3 path. For example:
 
 ```python
diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md
index bc0f662f0c..5f8ac64d25 100644
--- a/tensorflow/docs_src/extend/add_filesys.md
+++ b/tensorflow/docs_src/extend/add_filesys.md
@@ -225,7 +225,7 @@ it will use the `FooBarFileSystem` implementation.
 Next, you must build a shared object containing this implementation. An example
 of doing so using bazel's `cc_binary` rule can be found
 [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/BUILD#L244),
-but you may use any build system to do so. See the section on @{$adding_an_op#build_the_op_library$building the op library} for similar
+but you may use any build system to do so. See the section on [building the op library](../extend/adding_an_op.md#build_the_op_library) for similar
 instructions.
 
 The result of building this target is a `.so` shared object file.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index fbf5c0b90d..cc25ab9b45 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -56,8 +56,8 @@ PREREQUISITES:
 
 *   Some familiarity with C++.
 *   Must have installed the
-    @{$install$TensorFlow binary}, or must have
-    @{$install_sources$downloaded TensorFlow source},
+    [TensorFlow binary](../install/index.md), or must have
+    [downloaded TensorFlow source](../install/install_sources.md),
     and be able to build it.
 
 [TOC]
@@ -1140,7 +1140,7 @@ In general, changes to existing, checked-in specifications must be
 backwards-compatible: changing the specification of an op must not break prior
 serialized `GraphDef` protocol buffers constructed from older specifications.
 The details of `GraphDef` compatibility are
-@{$version_compat#compatibility_of_graphs_and_checkpoints$described here}.
+[described here](../guide/version_compat.md#compatibility_of_graphs_and_checkpoints).
 
 There are several ways to preserve backwards-compatibility.
 
@@ -1190,7 +1190,7 @@ callers.  The Python API may be kept compatible by careful changes in a
 hand-written Python wrapper, by keeping the old signature except possibly adding
 new optional arguments to the end.  Generally incompatible changes may only be
 made when TensorFlow's changes major versions, and must conform to the
-@{$version_compat#compatibility_of_graphs_and_checkpoints$`GraphDef` version semantics}.
+[`GraphDef` version semantics](../guide/version_compat.md#compatibility_of_graphs_and_checkpoints).
 
 ### GPU Support
 
@@ -1262,7 +1262,7 @@ For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in
 Given a graph of ops, TensorFlow uses automatic differentiation
 (backpropagation) to add new ops representing gradients with respect to the
 existing ops (see
-@{$python/train#gradient_computation$Gradient Computation}).
+[Gradient Computation](../api_guides/python/train.md#gradient_computation)).
 To make automatic differentiation work for new ops, you must register a gradient
 function which computes gradients with respect to the ops' inputs given
 gradients with respect to the ops' outputs.
diff --git a/tensorflow/docs_src/extend/architecture.md b/tensorflow/docs_src/extend/architecture.md
index 83d70c9468..eb33336bee 100644
--- a/tensorflow/docs_src/extend/architecture.md
+++ b/tensorflow/docs_src/extend/architecture.md
@@ -7,8 +7,8 @@ learning models and system-level optimizations.
 This document describes the system architecture that makes this
 combination of scale and flexibility possible. It assumes that you have basic familiarity
 with TensorFlow programming concepts such as the computation graph, operations,
-and sessions. See @{$guide/low_level_intro$this document} for an introduction to
-these topics. Some familiarity with @{$distributed$distributed TensorFlow}
+and sessions. See [this document](../guide/low_level_intro.md) for an introduction to
+these topics. Some familiarity with [distributed TensorFlow](../deploy/distributed.md)
 will also be helpful.
 
 This document is for developers who want to extend TensorFlow in some way not
@@ -199,7 +199,7 @@ Many of the operation kernels are implemented using Eigen::Tensor, which uses
 C++ templates to generate efficient parallel code for multicore CPUs and GPUs;
 however, we liberally use libraries like cuDNN where a more efficient kernel
 implementation is possible. We have also implemented
-@{$quantization$quantization}, which enables
+[quantization](../performance/quantization.md), which enables
 faster inference in environments such as mobile devices and high-throughput
 datacenter applications, and use the
 [gemmlowp](https://github.com/google/gemmlowp) low-precision matrix library to
@@ -209,7 +209,7 @@ If it is difficult or inefficient to represent a subcomputation as a composition
 of operations, users can register additional kernels that provide an efficient
 implementation written in C++. For example, we recommend registering your own
 fused kernels for some performance critical operations, such as the ReLU and
-Sigmoid activation functions and their corresponding gradients. The @{$xla$XLA Compiler} has an
+Sigmoid activation functions and their corresponding gradients. The [XLA Compiler](../performance/xla/index.md) has an
 experimental implementation of automatic kernel fusion.
 
 ### Code
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index 0e4bfd1dc4..bbf4a8139b 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -3,16 +3,16 @@
 This section explains how developers can add functionality to TensorFlow's
 capabilities. Begin by reading the following architectural overview:
 
-  * @{$architecture$TensorFlow Architecture}
+  * [TensorFlow Architecture](../extend/architecture.md)
 
 The following guides explain how to extend particular aspects of
 TensorFlow:
 
-  * @{$adding_an_op$Adding a New Op}, which explains how to create your own
+  * [Adding a New Op](../extend/adding_an_op.md), which explains how to create your own
     operations.
-  * @{$add_filesys$Adding a Custom Filesystem Plugin}, which explains how to
+  * [Adding a Custom Filesystem Plugin](../extend/add_filesys.md), which explains how to
     add support for your own shared or distributed filesystem.
-  * @{$new_data_formats$Custom Data Readers}, which details how to add support
+  * [Custom Data Readers](../extend/new_data_formats.md), which details how to add support
     for your own file and record formats.
 
 Python is currently the only language supported by TensorFlow's API stability
@@ -24,11 +24,11 @@ plus community support for [Haskell](https://github.com/tensorflow/haskell) and
 develop TensorFlow features in a language other than these languages, read the
 following guide:
 
-  * @{$language_bindings$TensorFlow in Other Languages}
+  * [TensorFlow in Other Languages](../extend/language_bindings.md)
 
 To create tools compatible with TensorFlow's model format, read the following
 guide:
 
-  * @{$tool_developers$A Tool Developer's Guide to TensorFlow Model Files}
+  * [A Tool Developer's Guide to TensorFlow Model Files](../extend/tool_developers/index.md)
 
 
diff --git a/tensorflow/docs_src/extend/language_bindings.md b/tensorflow/docs_src/extend/language_bindings.md
index 9a968d365b..4727eabdc1 100644
--- a/tensorflow/docs_src/extend/language_bindings.md
+++ b/tensorflow/docs_src/extend/language_bindings.md
@@ -125,7 +125,7 @@ The `OpDef` specifies the following:
     instead of CamelCase for the op's function name.
 -   A list of inputs and outputs. The types for these may be polymorphic by
     referencing attributes, as described in the inputs and outputs section of
-    @{$adding_an_op$Adding an     op}.
+    [Adding an     op](../extend/adding_an_op.md).
 -   A list of attributes, along with their default values (if any). Note that
     some of these will be inferred (if they are determined by an input), some
     will be optional (if they have a default), and some will be required (no
diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index 47a8344b70..7ca50c9c76 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -4,7 +4,7 @@ PREREQUISITES:
 
 *   Some familiarity with C++.
 *   Must have
-    @{$install_sources$downloaded TensorFlow source}, and be
+    [downloaded TensorFlow source](../install/install_sources.md), and be
     able to build it.
 
 We divide the task of supporting a file format into two pieces:
@@ -67,7 +67,7 @@ need to:
 
 You can put all the C++ code in a single file, such as
 `my_reader_dataset_op.cc`. It will help if you are
-familiar with @{$adding_an_op$the adding an op how-to}. The following skeleton
+familiar with [the adding an op how-to](../extend/adding_an_op.md). The following skeleton
 can be used as a starting point for your implementation:
 
 ```c++
@@ -227,8 +227,8 @@ REGISTER_KERNEL_BUILDER(Name("MyReaderDataset").Device(tensorflow::DEVICE_CPU),
 ```
 
 The last step is to build the C++ code and add a Python wrapper. The easiest way
-to do this is by @{$adding_an_op#build_the_op_library$compiling a dynamic
-library} (e.g. called `"my_reader_dataset_op.so"`), and adding a Python class
+to do this is by [compiling a dynamic
+library](../extend/adding_an_op.md#build_the_op_library) (e.g. called `"my_reader_dataset_op.so"`), and adding a Python class
 that subclasses `tf.data.Dataset` to wrap it. An example Python program is
 given here:
 
@@ -285,7 +285,7 @@ You can see some examples of `Dataset` wrapper classes in
 ## Writing an Op for a record format
 
 Generally this is an ordinary op that takes a scalar string record as input, and
-so follow @{$adding_an_op$the instructions to add an Op}.
+so follow [the instructions to add an Op](../extend/adding_an_op.md).
 You may optionally take a scalar string key as input, and include that in error
 messages reporting improperly formatted data.  That way users can more easily
 track down where the bad data came from.
diff --git a/tensorflow/docs_src/guide/checkpoints.md b/tensorflow/docs_src/guide/checkpoints.md
index e1add29852..3c92cbbd40 100644
--- a/tensorflow/docs_src/guide/checkpoints.md
+++ b/tensorflow/docs_src/guide/checkpoints.md
@@ -9,13 +9,13 @@ Estimators. TensorFlow provides two model formats:
     the model.
 
 This document focuses on checkpoints. For details on `SavedModel`, see the
-@{$saved_model$Saving and Restoring} guide.
+[Saving and Restoring](../guide/saved_model.md) guide.
 
 
 ## Sample code
 
 This document relies on the same
-[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in @{$premade_estimators$Getting Started with TensorFlow}.
+[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in [Getting Started with TensorFlow](../guide/premade_estimators.md).
 To download and access the example, invoke the following two commands:
 
 ```shell
@@ -160,7 +160,7 @@ checkpoint to the `model_dir`. Each subsequent call to the Estimator's
 1.  The Estimator builds the model's
     [graph](https://developers.google.com/machine-learning/glossary/#graph)
     by running the `model_fn()`.  (For details on the `model_fn()`, see
-    @{$custom_estimators$Creating Custom Estimators.})
+    [Creating Custom Estimators.](../guide/custom_estimators.md))
 2.  The Estimator initializes the weights of the new model from the data
     stored in the most recent checkpoint.
 
@@ -231,7 +231,7 @@ This separation will keep your checkpoints recoverable.
 Checkpoints provide an easy automatic mechanism for saving and restoring
 models created by Estimators.
 
-See the @{$saved_model$Saving and Restoring} guide for details about:
+See the [Saving and Restoring](../guide/saved_model.md) guide for details about:
 
 *   Saving and restoring models using low-level TensorFlow APIs.
 *   Exporting and importing models in the SavedModel format, which is a
diff --git a/tensorflow/docs_src/guide/custom_estimators.md b/tensorflow/docs_src/guide/custom_estimators.md
index 199a0e93de..913a35920f 100644
--- a/tensorflow/docs_src/guide/custom_estimators.md
+++ b/tensorflow/docs_src/guide/custom_estimators.md
@@ -5,7 +5,7 @@ This document introduces custom Estimators. In particular, this document
 demonstrates how to create a custom `tf.estimator.Estimator` that
 mimics the behavior of the pre-made Estimator
 `tf.estimator.DNNClassifier` in solving the Iris problem. See
-the @{$premade_estimators$Pre-Made Estimators chapter} for details
+the [Pre-Made Estimators chapter](../guide/premade_estimators.md) for details
 on the Iris problem.
 
 To download and access the example code invoke the following two commands:
@@ -84,7 +84,7 @@ and a logits output layer.
 ## Write an Input function
 
 Our custom Estimator implementation uses the same input function as our
-@{$premade_estimators$pre-made Estimator implementation}, from
+[pre-made Estimator implementation](../guide/premade_estimators.md), from
 [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py).
 Namely:
 
@@ -106,8 +106,8 @@ This input function builds an input pipeline that yields batches of
 
 ## Create feature columns
 
-As detailed in the @{$premade_estimators$Premade Estimators} and
-@{$feature_columns$Feature Columns} chapters, you must define
+As detailed in the [Premade Estimators](../guide/premade_estimators.md) and
+[Feature Columns](../guide/feature_columns.md) chapters, you must define
 your model's feature columns to specify how the model should use each feature.
 Whether working with pre-made Estimators or custom Estimators, you define
 feature columns in the same fashion.
@@ -145,7 +145,7 @@ to the constructor are in turn passed on to the `model_fn`. In
 [`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py)
 the following lines create the estimator and set the params to configure the
 model. This configuration step is similar to how we configured the `tf.estimator.DNNClassifier` in
-@{$premade_estimators}.
+[Premade Estimators](../guide/premade_estimators.md).
 
 ```python
 classifier = tf.estimator.Estimator(
@@ -489,7 +489,7 @@ configure your Estimator without modifying the code in the `model_fn`.
 
 The rest of the code to train, evaluate, and generate predictions using our
 Estimator is the same as in the
-@{$premade_estimators$Premade Estimators} chapter. For
+[Premade Estimators](../guide/premade_estimators.md) chapter. For
 example, the following line will train the model:
 
 ```python
@@ -597,6 +597,6 @@ For more details, be sure to check out:
   which contains more curated examples using custom estimators.
 * This [TensorBoard video](https://youtu.be/eBbEDRsCmv4), which introduces
   TensorBoard.
-* The @{$low_level_intro$Low Level Introduction}, which demonstrates
+* The [Low Level Introduction](../guide/low_level_intro.md), which demonstrates
   how to experiment directly with TensorFlow's low level APIs, making debugging
   easier.
diff --git a/tensorflow/docs_src/guide/datasets.md b/tensorflow/docs_src/guide/datasets.md
index bb18e8b79c..60de181b21 100644
--- a/tensorflow/docs_src/guide/datasets.md
+++ b/tensorflow/docs_src/guide/datasets.md
@@ -335,7 +335,7 @@ restore the current state of the iterator (and, effectively, the whole input
 pipeline). A saveable object thus created can be added to `tf.train.Saver`
 variables list or the `tf.GraphKeys.SAVEABLE_OBJECTS` collection for saving and
 restoring in the same manner as a `tf.Variable`. Refer to
-@{$saved_model$Saving and Restoring} for details on how to save and restore
+[Saving and Restoring](../guide/saved_model.md) for details on how to save and restore
 variables.
 
 ```python
@@ -782,8 +782,9 @@ with tf.train.MonitoredTrainingSession(...) as sess:
     sess.run(training_op)
 ```
 
-To use a `Dataset` in the `input_fn` of a `tf.estimator.Estimator`, we also
-recommend using `Dataset.make_one_shot_iterator()`. For example:
+To use a `Dataset` in the `input_fn` of a `tf.estimator.Estimator`, simply
+return the `Dataset` and the framework will take care of creating an iterator
+and initializing it for you. For example:
 
 ```python
 def dataset_input_fn():
@@ -814,10 +815,9 @@ def dataset_input_fn():
   dataset = dataset.shuffle(buffer_size=10000)
   dataset = dataset.batch(32)
   dataset = dataset.repeat(num_epochs)
-  iterator = dataset.make_one_shot_iterator()
 
-  # `features` is a dictionary in which each value is a batch of values for
-  # that feature; `labels` is a batch of labels.
-  features, labels = iterator.get_next()
-  return features, labels
+  # Each element of `dataset` is tuple containing a dictionary of features
+  # (in which each value is a batch of values for that feature), and a batch of
+  # labels.
+  return dataset
 ```
diff --git a/tensorflow/docs_src/guide/datasets_for_estimators.md b/tensorflow/docs_src/guide/datasets_for_estimators.md
index 969ea579f7..09a3830ca9 100644
--- a/tensorflow/docs_src/guide/datasets_for_estimators.md
+++ b/tensorflow/docs_src/guide/datasets_for_estimators.md
@@ -14,7 +14,7 @@ introduces the API by walking through two simple examples:
 
 Taking slices from an array is the simplest way to get started with `tf.data`.
 
-The @{$premade_estimators$Premade Estimators} chapter describes
+The [Premade Estimators](../guide/premade_estimators.md) chapter describes
 the following `train_input_fn`, from
 [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py),
 to pipe the data into the Estimator:
@@ -91,8 +91,8 @@ print(mnist_ds)
 ```
 
 This will print the following line, showing the
-@{$guide/tensors#shapes$shapes} and
-@{$guide/tensors#data_types$types} of the items in
+[shapes](../guide/tensors.md#shapes) and
+[types](../guide/tensors.md#data_types) of the items in
 the dataset. Note that a `Dataset` does not know how many items it contains.
 
 ``` None
@@ -128,7 +128,7 @@ print(dataset)
 
 Here we see that when a `Dataset` contains structured elements, the `shapes`
 and `types` of the `Dataset` take on the same structure. This dataset contains
-dictionaries of @{$guide/tensors#rank$scalars}, all of type
+dictionaries of [scalars](../guide/tensors.md#rank), all of type
 `tf.float64`.
 
 The first line of the iris `train_input_fn` uses the same functionality, but
@@ -377,11 +377,11 @@ Now you have the basic idea of how to efficiently load data into an
 Estimator. Consider the following documents next:
 
 
-* @{$custom_estimators}, which demonstrates how to build your own
+* [Creating Custom Estimators](../guide/custom_estimators.md), which demonstrates how to build your own
   custom `Estimator` model.
-* The @{$low_level_intro#datasets$Low Level Introduction}, which demonstrates
+* The [Low Level Introduction](../guide/low_level_intro.md#datasets), which demonstrates
   how to experiment directly with `tf.data.Datasets` using TensorFlow's low
   level APIs.
-* @{$guide/datasets} which goes into great detail about additional
+* [Importing Data](../guide/datasets.md) which goes into great detail about additional
   functionality of `Datasets`.
 
diff --git a/tensorflow/docs_src/guide/debugger.md b/tensorflow/docs_src/guide/debugger.md
index 4c4a04a88a..5af27471a2 100644
--- a/tensorflow/docs_src/guide/debugger.md
+++ b/tensorflow/docs_src/guide/debugger.md
@@ -95,7 +95,7 @@ intermediate tensors (tensors that are neither inputs or outputs of the
 `Session.run()` call, but are in the path leading from the inputs to the
 outputs). This filter is for `nan`s and `inf`s is a common enough use case that
 we ship it with the
-@{$python/tfdbg#Classes_for_debug_dump_data_and_directories$`debug_data`}
+[`debug_data`](../api_guides/python/tfdbg.md#Classes_for_debug_dump_data_and_directories)
 module.
 
 Note: You can also write your own custom filters. See `tfdbg.DebugDumpDir.find`
diff --git a/tensorflow/docs_src/guide/eager.md b/tensorflow/docs_src/guide/eager.md
index 24f6e4ee95..3b5797a638 100644
--- a/tensorflow/docs_src/guide/eager.md
+++ b/tensorflow/docs_src/guide/eager.md
@@ -558,7 +558,7 @@ m.result()  # => 5.5
 
 #### Summaries and TensorBoard
 
-@{$summaries_and_tensorboard$TensorBoard} is a visualization tool for
+[TensorBoard](../guide/summaries_and_tensorboard.md) is a visualization tool for
 understanding, debugging and optimizing the model training process. It uses
 summary events that are written while executing the program.
 
@@ -568,9 +568,8 @@ inserted during model construction. For example, to record summaries once every
 100 global steps:
 
 ```py
+global_step = tf.train.get_or_create_global_step()
 writer = tf.contrib.summary.create_file_writer(logdir)
-global_step=tf.train.get_or_create_global_step()  # return global step var
-
 writer.set_as_default()
 
 for _ in range(iterations):
diff --git a/tensorflow/docs_src/guide/embedding.md b/tensorflow/docs_src/guide/embedding.md
index 8a98367dfb..6007e6847b 100644
--- a/tensorflow/docs_src/guide/embedding.md
+++ b/tensorflow/docs_src/guide/embedding.md
@@ -78,7 +78,7 @@ Embeddings can be trained in many network types, and with various loss
 functions and data sets. For example, one could use a recurrent neural network
 to predict the next word from the previous one given a large corpus of
 sentences, or one could train two networks to do multi-lingual translation.
-These methods are described in the @{$word2vec$Vector Representations of Words}
+These methods are described in the [Vector Representations of Words](../tutorials/representation/word2vec.md)
 tutorial.
 
 ## Visualizing Embeddings
diff --git a/tensorflow/docs_src/guide/estimators.md b/tensorflow/docs_src/guide/estimators.md
index 7b54e3de29..3903bfd126 100644
--- a/tensorflow/docs_src/guide/estimators.md
+++ b/tensorflow/docs_src/guide/estimators.md
@@ -84,7 +84,7 @@ of the following four steps:
            ...  # manipulate dataset, extracting the feature dict and the label
            return feature_dict, label
 
-    (See @{$guide/datasets} for full details.)
+    (See [Importing Data](../guide/datasets.md) for full details.)
 
 2.  **Define the feature columns.** Each `tf.feature_column`
     identifies a feature name, its type, and any input pre-processing.
@@ -136,7 +136,7 @@ The heart of every Estimator--whether pre-made or custom--is its
 evaluation, and prediction. When you are using a pre-made Estimator,
 someone else has already implemented the model function. When relying
 on a custom Estimator, you must write the model function yourself. A
-@{$custom_estimators$companion document}
+[companion document](../guide/custom_estimators.md)
 explains how to write the model function.
 
 
diff --git a/tensorflow/docs_src/guide/faq.md b/tensorflow/docs_src/guide/faq.md
index 8370097560..a02635ebba 100644
--- a/tensorflow/docs_src/guide/faq.md
+++ b/tensorflow/docs_src/guide/faq.md
@@ -2,7 +2,7 @@
 
 This document provides answers to some of the frequently asked questions about
 TensorFlow. If you have a question that is not covered here, you might find an
-answer on one of the TensorFlow @{$about$community resources}.
+answer on one of the TensorFlow [community resources](../about/index.md).
 
 [TOC]
 
@@ -11,7 +11,7 @@ answer on one of the TensorFlow @{$about$community resources}.
 #### Can I run distributed training on multiple computers?
 
 Yes! TensorFlow gained
-@{$distributed$support for distributed computation} in
+[support for distributed computation](../deploy/distributed.md) in
 version 0.8. TensorFlow now supports multiple devices (CPUs and GPUs) in one or
 more computers.
 
@@ -23,7 +23,7 @@ As of the 0.6.0 release timeframe (Early December 2015), we do support Python
 ## Building a TensorFlow graph
 
 See also the
-@{$python/framework$API documentation on building graphs}.
+[API documentation on building graphs](../api_guides/python/framework.md).
 
 #### Why does `c = tf.matmul(a, b)` not execute the matrix multiplication immediately?
 
@@ -48,16 +48,16 @@ device, and `"/device:GPU:i"` (or `"/gpu:i"`) for the *i*th GPU device.
 To place a group of operations on a device, create them within a
 `tf.device` context.  See
 the how-to documentation on
-@{$using_gpu$using GPUs with TensorFlow} for details of how
+[using GPUs with TensorFlow](../guide/using_gpu.md) for details of how
 TensorFlow assigns operations to devices, and the
-@{$deep_cnn$CIFAR-10 tutorial} for an example model that
+[CIFAR-10 tutorial](../tutorials/images/deep_cnn.md) for an example model that
 uses multiple GPUs.
 
 
 ## Running a TensorFlow computation
 
 See also the
-@{$python/client$API documentation on running graphs}.
+[API documentation on running graphs](../api_guides/python/client.md).
 
 #### What's the deal with feeding and placeholders?
 
@@ -106,7 +106,7 @@ a significant amount of memory, and can be released when the session is closed b
 `tf.Session.close`.
 
 The intermediate tensors that are created as part of a call to
-@{$python/client$`Session.run()`} will be freed at or before the
+[`Session.run()`](../api_guides/python/client.md) will be freed at or before the
 end of the call.
 
 #### Does the runtime parallelize parts of graph execution?
@@ -118,7 +118,7 @@ dimensions:
   CPU, or multiple threads in a GPU.
 * Independent nodes in a TensorFlow graph can run in parallel on multiple
   devices, which makes it possible to speed up
-  @{$deep_cnn$CIFAR-10 training using multiple GPUs}.
+  [CIFAR-10 training using multiple GPUs](../tutorials/images/deep_cnn.md).
 * The Session API allows multiple concurrent steps (i.e. calls to
   `tf.Session.run` in parallel). This
   enables the runtime to get higher throughput, if a single step does not use
@@ -141,9 +141,9 @@ Bindings for various other languages (such as [C#](https://github.com/migueldeic
 #### Does TensorFlow make use of all the devices (GPUs and CPUs) available on my machine?
 
 TensorFlow supports multiple GPUs and CPUs. See the how-to documentation on
-@{$using_gpu$using GPUs with TensorFlow} for details of how
+[using GPUs with TensorFlow](../guide/using_gpu.md) for details of how
 TensorFlow assigns operations to devices, and the
-@{$deep_cnn$CIFAR-10 tutorial} for an example model that
+[CIFAR-10 tutorial](../tutorials/images/deep_cnn.md) for an example model that
 uses multiple GPUs.
 
 Note that TensorFlow only uses GPU devices with a compute capability greater
@@ -155,16 +155,16 @@ The `tf.ReaderBase` and
 `tf.QueueBase` classes provide special operations that
 can *block* until input (or free space in a bounded queue) becomes
 available. These operations allow you to build sophisticated
-@{$reading_data$input pipelines}, at the cost of making the
+[input pipelines](../api_guides/python/reading_data.md), at the cost of making the
 TensorFlow computation somewhat more complicated. See the how-to documentation
 for
-@{$reading_data#creating_threads_to_prefetch_using_queuerunner_objects$using `QueueRunner` objects to drive queues and readers}
+[using `QueueRunner` objects to drive queues and readers](../api_guides/python/reading_data.md#creating_threads_to_prefetch_using_queuerunner_objects)
 for more information on how to use them.
 
 ## Variables
 
-See also the how-to documentation on @{$variables$variables} and
-@{$python/state_ops$the API documentation for variables}.
+See also the how-to documentation on [variables](../guide/variables.md) and
+[the API documentation for variables](../api_guides/python/state_ops.md).
 
 #### What is the lifetime of a variable?
 
@@ -231,7 +231,7 @@ to encode the batch size as a Python constant, but instead to use a symbolic
 
 #### How can I visualize a TensorFlow graph?
 
-See the @{$graph_viz$graph visualization tutorial}.
+See the [graph visualization tutorial](../guide/graph_viz.md).
 
 #### What is the simplest way to send data to TensorBoard?
 
@@ -241,7 +241,7 @@ these summaries to a log directory.  Then, start TensorBoard using
     python tensorflow/tensorboard/tensorboard.py --logdir=path/to/log-directory
 
 For more details, see the
-@{$summaries_and_tensorboard$Summaries and TensorBoard tutorial}.
+[Summaries and TensorBoard tutorial](../guide/summaries_and_tensorboard.md).
 
 #### Every time I launch TensorBoard, I get a network security popup!
 
@@ -251,7 +251,7 @@ the flag --host=localhost. This should quiet any security warnings.
 ## Extending TensorFlow
 
 See the how-to documentation for
-@{$adding_an_op$adding a new operation to TensorFlow}.
+[adding a new operation to TensorFlow](../extend/adding_an_op.md).
 
 #### My data is in a custom format. How do I read it using TensorFlow?
 
@@ -273,8 +273,8 @@ consider converting it, offline, to a format that is easily parsable, such
 as `tf.python_io.TFRecordWriter` format.
 
 The most efficient method to customize the parsing behavior is to
-@{$adding_an_op$add a new op written in C++} that parses your
-data format. The @{$new_data_formats$guide to handling new data formats} has
+[add a new op written in C++](../extend/adding_an_op.md) that parses your
+data format. The [guide to handling new data formats](../extend/new_data_formats.md) has
 more information about the steps for doing this.
 
 
diff --git a/tensorflow/docs_src/guide/feature_columns.md b/tensorflow/docs_src/guide/feature_columns.md
index b189c4334e..3ad41855e4 100644
--- a/tensorflow/docs_src/guide/feature_columns.md
+++ b/tensorflow/docs_src/guide/feature_columns.md
@@ -5,7 +5,7 @@ intermediaries between raw data and Estimators. Feature columns are very rich,
 enabling you to transform a diverse range of raw data into formats that
 Estimators can use, allowing easy experimentation.
 
-In @{$premade_estimators$Premade Estimators}, we used the premade
+In [Premade Estimators](../guide/premade_estimators.md), we used the premade
 Estimator, `tf.estimator.DNNClassifier` to train a model to
 predict different types of Iris flowers from four input features. That example
 created only numerical feature columns (of type
@@ -534,7 +534,7 @@ embedding_column = tf.feature_column.embedding_column(
     dimension=embedding_dimensions)
 ```
 
-@{$guide/embedding$Embeddings} is a significant topic within machine
+[Embeddings](../guide/embedding.md) is a significant topic within machine
 learning. This information was just to get you started using them as feature
 columns.
 
@@ -559,7 +559,7 @@ As the following list indicates, not all Estimators permit all types of
 
 For more examples on feature columns, view the following:
 
-* The @{$low_level_intro#feature_columns$Low Level Introduction} demonstrates how
+* The [Low Level Introduction](../guide/low_level_intro.md#feature_columns) demonstrates how
   experiment directly with `feature_columns` using TensorFlow's low level APIs.
 * The [Estimator wide and deep learning tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep)
   solves a binary classification problem using `feature_columns` on a variety of
diff --git a/tensorflow/docs_src/guide/graph_viz.md b/tensorflow/docs_src/guide/graph_viz.md
index 97b0e2d4de..23f722bbe7 100644
--- a/tensorflow/docs_src/guide/graph_viz.md
+++ b/tensorflow/docs_src/guide/graph_viz.md
@@ -5,7 +5,7 @@ TensorFlow computation graphs are powerful but complicated. The graph visualizat
 ![Visualization of a TensorFlow graph](https://www.tensorflow.org/images/graph_vis_animation.gif "Visualization of a TensorFlow graph")
 *Visualization of a TensorFlow graph.*
 
-To see your own graph, run TensorBoard pointing it to the log directory of the job, click on the graph tab on the top pane and select the appropriate run using the menu at the upper left corner. For in depth information on how to run TensorBoard and make sure you are logging all the necessary information, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning}.
+To see your own graph, run TensorBoard pointing it to the log directory of the job, click on the graph tab on the top pane and select the appropriate run using the menu at the upper left corner. For in depth information on how to run TensorBoard and make sure you are logging all the necessary information, see [TensorBoard: Visualizing Learning](../guide/summaries_and_tensorboard.md).
 
 ## Name scoping and nodes
 
@@ -251,7 +251,7 @@ is a snippet from the train and test section of a modification of the
 [Estimators MNIST tutorial](../tutorials/estimators/cnn.md), in which we have
 recorded summaries and
 runtime statistics. See the
-@{$summaries_and_tensorboard#serializing-the-data$Summaries Tutorial}
+[Summaries Tutorial](../guide/summaries_and_tensorboard.md#serializing-the-data)
 for details on how to record summaries.
 Full source is [here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py).
 
diff --git a/tensorflow/docs_src/guide/graphs.md b/tensorflow/docs_src/guide/graphs.md
index 2bb44fbb32..c70479dba2 100644
--- a/tensorflow/docs_src/guide/graphs.md
+++ b/tensorflow/docs_src/guide/graphs.md
@@ -38,13 +38,13 @@ programs:
   machines. TensorFlow inserts the necessary communication and coordination
   between devices.
 
-* **Compilation.** TensorFlow's @{$performance/xla$XLA compiler} can
+* **Compilation.** TensorFlow's [XLA compiler](../performance/xla/index.md) can
   use the information in your dataflow graph to generate faster code, for
   example, by fusing together adjacent operations.
 
 * **Portability.** The dataflow graph is a language-independent representation
   of the code in your model. You can build a dataflow graph in Python, store it
-  in a @{$saved_model$SavedModel}, and restore it in a C++ program for
+  in a [SavedModel](../guide/saved_model.md), and restore it in a C++ program for
   low-latency inference.
 
 
@@ -93,7 +93,7 @@ to all API functions in the same context.  For example:
   stored value. The `tf.Variable` object also has methods such as
   `tf.Variable.assign` and `tf.Variable.assign_add` that
   create `tf.Operation` objects that, when executed, update the stored value.
-  (See @{$guide/variables} for more information about variables.)
+  (See [Variables](../guide/variables.md) for more information about variables.)
 
 * Calling `tf.train.Optimizer.minimize` will add operations and tensors to the
   default graph that calculates gradients, and return a `tf.Operation` that,
@@ -210,7 +210,7 @@ with tf.device("/device:GPU:0"):
   # Operations created in this context will be pinned to the GPU.
   result = tf.matmul(weights, img)
 ```
-If you are deploying TensorFlow in a @{$distributed$typical distributed configuration},
+If you are deploying TensorFlow in a [typical distributed configuration](../deploy/distributed.md),
 you might specify the job name and task ID to place variables on
 a task in the parameter server job (`"/job:ps"`), and the other operations on
 task in the worker job (`"/job:worker"`):
diff --git a/tensorflow/docs_src/guide/index.md b/tensorflow/docs_src/guide/index.md
index 1c920e7d70..50499582cc 100644
--- a/tensorflow/docs_src/guide/index.md
+++ b/tensorflow/docs_src/guide/index.md
@@ -5,38 +5,38 @@ works. The units are as follows:
 
 ## High Level APIs
 
-  * @{$guide/keras}, TensorFlow's high-level API for building and
+  * [Keras](../guide/keras.md), TensorFlow's high-level API for building and
     training deep learning models.
-  * @{$guide/eager}, an API for writing TensorFlow code
+  * [Eager Execution](../guide/eager.md), an API for writing TensorFlow code
     imperatively, like you would use Numpy.
-  * @{$guide/datasets}, easy input pipelines to bring your data into
+  * [Importing Data](../guide/datasets.md), easy input pipelines to bring your data into
     your TensorFlow program.
-  * @{$guide/estimators}, a high-level API that provides
+  * [Estimators](../guide/estimators.md), a high-level API that provides
     fully-packaged models ready for large-scale training and production.
 
 ## Estimators
 
-* @{$premade_estimators}, the basics of premade Estimators.
-* @{$checkpoints}, save training progress and resume where you left off.
-* @{$feature_columns}, handle a variety of input data types without changes to the model.
-* @{$datasets_for_estimators}, use `tf.data` to input data.
-* @{$custom_estimators}, write your own Estimator.
+* [Premade Estimators](../guide/premade_estimators.md), the basics of premade Estimators.
+* [Checkpoints](../guide/checkpoints.md), save training progress and resume where you left off.
+* [Feature Columns](../guide/feature_columns.md), handle a variety of input data types without changes to the model.
+* [Datasets for Estimators](../guide/datasets_for_estimators.md), use `tf.data` to input data.
+* [Creating Custom Estimators](../guide/custom_estimators.md), write your own Estimator.
 
 ## Accelerators
 
-  * @{$using_gpu} explains how TensorFlow assigns operations to
+  * [Using GPUs](../guide/using_gpu.md) explains how TensorFlow assigns operations to
     devices and how you can change the arrangement manually.
-  * @{$using_tpu} explains how to modify `Estimator` programs to run on a TPU.
+  * [Using TPUs](../guide/using_tpu.md) explains how to modify `Estimator` programs to run on a TPU.
 
 ## Low Level APIs
 
-  * @{$guide/low_level_intro}, which introduces the
+  * [Introduction](../guide/low_level_intro.md), which introduces the
     basics of how you can use TensorFlow outside of the high Level APIs.
-  * @{$guide/tensors}, which explains how to create,
+  * [Tensors](../guide/tensors.md), which explains how to create,
     manipulate, and access Tensors--the fundamental object in TensorFlow.
-  * @{$guide/variables}, which details how
+  * [Variables](../guide/variables.md), which details how
     to represent shared, persistent state in your program.
-  * @{$guide/graphs}, which explains:
+  * [Graphs and Sessions](../guide/graphs.md), which explains:
       * dataflow graphs, which are TensorFlow's representation of computations
         as dependencies between operations.
       * sessions, which are TensorFlow's mechanism for running dataflow graphs
@@ -46,19 +46,19 @@ works. The units are as follows:
     such as Estimators or Keras, the high-level API creates and manages
     graphs and sessions for you, but understanding graphs and sessions
     can still be helpful.
-  * @{$guide/saved_model}, which
+  * [Save and Restore](../guide/saved_model.md), which
     explains how to save and restore variables and models.
 
 ## ML Concepts
 
-  * @{$guide/embedding}, which introduces the concept
+  * [Embeddings](../guide/embedding.md), which introduces the concept
     of embeddings, provides a simple example of training an embedding in
     TensorFlow, and explains how to view embeddings with the TensorBoard
     Embedding Projector.
 
 ## Debugging
 
-  * @{$guide/debugger}, which
+  * [TensorFlow Debugger](../guide/debugger.md), which
     explains how to use the TensorFlow debugger (tfdbg).
 
 ## TensorBoard
@@ -66,17 +66,17 @@ works. The units are as follows:
 TensorBoard is a utility to visualize different aspects of machine learning.
 The following guides explain how to use TensorBoard:
 
-  * @{$guide/summaries_and_tensorboard},
+  * [TensorBoard: Visualizing Learning](../guide/summaries_and_tensorboard.md),
     which introduces TensorBoard.
-  * @{$guide/graph_viz}, which
+  * [TensorBoard: Graph Visualization](../guide/graph_viz.md), which
     explains how to visualize the computational graph.
-  * @{$guide/tensorboard_histograms} which demonstrates the how to
+  * [TensorBoard Histogram Dashboard](../guide/tensorboard_histograms.md) which demonstrates the how to
     use TensorBoard's histogram dashboard.
 
 
 ## Misc
 
-  * @{$guide/version_compat},
+  * [TensorFlow Version Compatibility](../guide/version_compat.md),
     which explains backward compatibility guarantees and non-guarantees.
-  * @{$guide/faq}, which contains frequently asked
+  * [Frequently Asked Questions](../guide/faq.md), which contains frequently asked
     questions about TensorFlow.
diff --git a/tensorflow/docs_src/guide/low_level_intro.md b/tensorflow/docs_src/guide/low_level_intro.md
index dc6cb9ee0d..d002f8af0b 100644
--- a/tensorflow/docs_src/guide/low_level_intro.md
+++ b/tensorflow/docs_src/guide/low_level_intro.md
@@ -9,7 +9,7 @@ This guide gets you started programming in the low-level TensorFlow APIs
   * Use high level components ([datasets](#datasets), [layers](#layers), and
     [feature_columns](#feature_columns)) in this low level environment.
   * Build your own training loop, instead of using the one
-    @{$premade_estimators$provided by Estimators}.
+    [provided by Estimators](../guide/premade_estimators.md).
 
 We recommend using the higher level APIs to build models when possible.
 Knowing TensorFlow Core is valuable for the following reasons:
@@ -21,7 +21,7 @@ Knowing TensorFlow Core is valuable for the following reasons:
 
 ## Setup
 
-Before using this guide, @{$install$install TensorFlow}.
+Before using this guide, [install TensorFlow](../install/index.md).
 
 To get the most out of this guide, you should know the following:
 
@@ -145,7 +145,7 @@ browser, and you should see a graph similar to the following:
 
 ![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_add.png)
 
-For more about TensorBoard's graph visualization tools see @{$graph_viz}.
+For more about TensorBoard's graph visualization tools see [TensorBoard: Graph Visualization](../guide/graph_viz.md).
 
 ### Session
 
@@ -303,7 +303,7 @@ while True:
     break
 ```
 
-For more details on Datasets and Iterators see: @{$guide/datasets}.
+For more details on Datasets and Iterators see: [Importing Data](../guide/datasets.md).
 
 ## Layers
 
@@ -398,7 +398,7 @@ and layer reuse impossible.
 
 The easiest way to experiment with feature columns is using the
 `tf.feature_column.input_layer` function. This function only accepts
-@{$feature_columns$dense columns} as inputs, so to view the result
+[dense columns](../guide/feature_columns.md) as inputs, so to view the result
 of a categorical column you must wrap it in an
 `tf.feature_column.indicator_column`. For example:
 
@@ -589,7 +589,7 @@ print(sess.run(y_pred))
 
 To learn more about building models with TensorFlow consider the following:
 
-* @{$custom_estimators$Custom Estimators}, to learn how to build
+* [Custom Estimators](../guide/custom_estimators.md), to learn how to build
   customized models with TensorFlow. Your knowledge of TensorFlow Core will
   help you understand and debug your own models.
 
@@ -597,8 +597,8 @@ If you want to learn more about the inner workings of TensorFlow consider the
 following documents, which go into more depth on many of the topics discussed
 here:
 
-* @{$graphs}
-* @{$tensors}
-* @{$variables}
+* [Graphs and Sessions](../guide/graphs.md)
+* [Tensors](../guide/tensors.md)
+* [Variables](../guide/variables.md)
 
 
diff --git a/tensorflow/docs_src/guide/premade_estimators.md b/tensorflow/docs_src/guide/premade_estimators.md
index dc38f0c1d3..9b64d51b98 100644
--- a/tensorflow/docs_src/guide/premade_estimators.md
+++ b/tensorflow/docs_src/guide/premade_estimators.md
@@ -8,7 +8,7 @@ how to solve the Iris classification problem in TensorFlow.
 Prior to using the sample code in this document, you'll need to do the
 following:
 
-* @{$install$Install TensorFlow}.
+* [Install TensorFlow](../install/index.md).
 * If you installed TensorFlow with virtualenv or Anaconda, activate your
   TensorFlow environment.
 * Install or upgrade pandas by issuing the following command:
@@ -78,10 +78,10 @@ provides a programming stack consisting of multiple API layers:
 
 We strongly recommend writing TensorFlow programs with the following APIs:
 
-* @{$guide/estimators$Estimators}, which represent a complete model.
+* [Estimators](../guide/estimators.md), which represent a complete model.
   The Estimator API provides methods to train the model, to judge the model's
   accuracy, and to generate predictions.
-* @{$guide/datasets_for_estimators}, which build a data input
+* [Datasets for Estimators](../guide/datasets_for_estimators.md), which build a data input
   pipeline. The Dataset API has methods to load and manipulate data, and feed
   it into your model. The Dataset API meshes well with the Estimators API.
 
@@ -173,14 +173,14 @@ example is an Iris Versicolor.
 An Estimator is TensorFlow's high-level representation of a complete model. It
 handles the details of initialization, logging, saving and restoring, and many
 other features so you can concentrate on your model. For more details see
-@{$guide/estimators}.
+[Estimators](../guide/estimators.md).
 
 An Estimator is any class derived from `tf.estimator.Estimator`. TensorFlow
 provides a collection of
 `tf.estimator`
 (for example, `LinearRegressor`) to implement common ML algorithms. Beyond
 those, you may write your own
-@{$custom_estimators$custom Estimators}.
+[custom Estimators](../guide/custom_estimators.md).
 We recommend using pre-made Estimators when just getting started.
 
 To write a TensorFlow program based on pre-made Estimators, you must perform the
@@ -287,7 +287,7 @@ for key in train_x.keys():
 ```
 
 Feature columns can be far more sophisticated than those we're showing here.  We
-detail feature columns @{$feature_columns$later on} in our Getting
+detail feature columns [later on](../guide/feature_columns.md) in our Getting
 Started guide.
 
 Now that we have the description of how we want the model to represent the raw
@@ -366,6 +366,8 @@ Running this code yields the following output (or something similar):
 Test set accuracy: 0.967
 ```
 
+The `eval_result` dictionary also contains the `average_loss` (mean loss per sample), the `loss` (mean loss per mini-batch) and the value of the estimator's `global_step` (the number of training iterations it underwent).
+
 ### Making predictions (inferring) from the trained model
 
 We now have a trained model that produces good evaluation results.
@@ -423,8 +425,8 @@ Pre-made Estimators are an effective way to quickly create standard models.
 Now that you've gotten started writing TensorFlow programs, consider the
 following material:
 
-* @{$checkpoints$Checkpoints} to learn how to save and restore models.
-* @{$guide/datasets_for_estimators} to learn more about importing
+* [Checkpoints](../guide/checkpoints.md) to learn how to save and restore models.
+* [Datasets for Estimators](../guide/datasets_for_estimators.md) to learn more about importing
   data into your model.
-* @{$custom_estimators$Creating Custom Estimators} to learn how to
+* [Creating Custom Estimators](../guide/custom_estimators.md) to learn how to
   write your own Estimator, customized for a particular problem.
diff --git a/tensorflow/docs_src/guide/saved_model.md b/tensorflow/docs_src/guide/saved_model.md
index c260da7966..33ab891861 100644
--- a/tensorflow/docs_src/guide/saved_model.md
+++ b/tensorflow/docs_src/guide/saved_model.md
@@ -2,12 +2,12 @@
 
 The `tf.train.Saver` class provides methods to save and restore models. The
 `tf.saved_model.simple_save` function is an easy way to build a
-`tf.saved_model` suitable for serving. [Estimators](./estimators)
+`tf.saved_model` suitable for serving. [Estimators](../guide/estimators.md)
 automatically save and restore variables in the `model_dir`.
 
 ## Save and restore variables
 
-TensorFlow @{$variables} are the best way to represent shared, persistent state
+TensorFlow [Variables](../guide/variables.md) are the best way to represent shared, persistent state
 manipulated by your program. The `tf.train.Saver` constructor adds `save` and
 `restore` ops to the graph for all, or a specified list, of the variables in the
 graph.  The `Saver` object provides methods to run these ops, specifying paths
@@ -274,7 +274,7 @@ Ops has not changed.
 
 The `tf.saved_model.builder.SavedModelBuilder` class allows
 users to control whether default-valued attributes must be stripped from the
-@{$extend/tool_developers#nodes$`NodeDefs`}
+[`NodeDefs`](../extend/tool_developers/index.md#nodes)
 while adding a meta graph to the SavedModel bundle. Both
 `tf.saved_model.builder.SavedModelBuilder.add_meta_graph_and_variables`
 and `tf.saved_model.builder.SavedModelBuilder.add_meta_graph`
@@ -413,7 +413,7 @@ SavedModel format. This section explains how to:
 
 ### Prepare serving inputs
 
-During training, an @{$premade_estimators#input_fn$`input_fn()`} ingests data
+During training, an [`input_fn()`](../guide/premade_estimators.md#input_fn) ingests data
 and prepares it for use by the model.  At serving time, similarly, a
 `serving_input_receiver_fn()` accepts inference requests and prepares them for
 the model.  This function has the following purposes:
@@ -616,7 +616,7 @@ result = stub.Classify(request, 10.0)  # 10 secs timeout
 The returned result in this example is a `ClassificationResponse` protocol
 buffer.
 
-This is a skeletal example; please see the @{$deploy$Tensorflow Serving}
+This is a skeletal example; please see the [Tensorflow Serving](../deploy/index.md)
 documentation and [examples](https://github.com/tensorflow/serving/tree/master/tensorflow_serving/example)
 for more details.
 
@@ -647,7 +647,7 @@ You can use the SavedModel Command Line Interface (CLI) to inspect and
 execute a SavedModel.
 For example, you can use the CLI to inspect the model's `SignatureDef`s.
 The CLI enables you to quickly confirm that the input
-@{$tensors$Tensor dtype and shape} match the model. Moreover, if you
+[Tensor dtype and shape](../guide/tensors.md) match the model. Moreover, if you
 want to test your model, you can use the CLI to do a sanity check by
 passing in sample inputs in various formats (for example, Python
 expressions) and then fetching the output.
diff --git a/tensorflow/docs_src/guide/summaries_and_tensorboard.md b/tensorflow/docs_src/guide/summaries_and_tensorboard.md
index 6177c3393b..788c556b9d 100644
--- a/tensorflow/docs_src/guide/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/guide/summaries_and_tensorboard.md
@@ -36,7 +36,7 @@ lifecycle for summary data within TensorBoard.
 
 First, create the TensorFlow graph that you'd like to collect summary
 data from, and decide which nodes you would like to annotate with
-@{$python/summary$summary operations}.
+[summary operations](../api_guides/python/summary.md).
 
 For example, suppose you are training a convolutional neural network for
 recognizing MNIST digits. You'd like to record how the learning rate
@@ -53,7 +53,7 @@ this data by attaching
 the gradient outputs and to the variable that holds your weights, respectively.
 
 For details on all of the summary operations available, check out the docs on
-@{$python/summary$summary operations}.
+[summary operations](../api_guides/python/summary.md).
 
 Operations in TensorFlow don't do anything until you run them, or an op that
 depends on their output. And the summary nodes that we've just created are
@@ -74,7 +74,7 @@ Also, the `FileWriter` can optionally take a `Graph` in its constructor.
 If it receives a `Graph` object, then TensorBoard will visualize your graph
 along with tensor shape information. This will give you a much better sense of
 what flows through the graph: see
-@{$graph_viz#tensor-shape-information$Tensor shape information}.
+[Tensor shape information](../guide/graph_viz.md#tensor-shape-information).
 
 Now that you've modified your graph and have a `FileWriter`, you're ready to
 start running your network! If you want, you could run the merged summary op
@@ -219,7 +219,7 @@ When looking at TensorBoard, you will see the navigation tabs in the top right
 corner. Each tab represents a set of serialized data that can be visualized.
 
 For in depth information on how to use the *graph* tab to visualize your graph,
-see @{$graph_viz$TensorBoard: Graph Visualization}.
+see [TensorBoard: Graph Visualization](../guide/graph_viz.md).
 
 For more usage information on TensorBoard in general, see the
 [TensorBoard GitHub](https://github.com/tensorflow/tensorboard).
diff --git a/tensorflow/docs_src/guide/tensors.md b/tensorflow/docs_src/guide/tensors.md
index 6b5a110a1c..4f0ddb21b5 100644
--- a/tensorflow/docs_src/guide/tensors.md
+++ b/tensorflow/docs_src/guide/tensors.md
@@ -298,7 +298,7 @@ to call `tf.train.start_queue_runners` before evaluating any `tf.Tensor`s.
 ## Printing Tensors
 
 For debugging purposes you might want to print the value of a `tf.Tensor`. While
- @{$debugger$tfdbg} provides advanced debugging support, TensorFlow also has an
+ [tfdbg](../guide/debugger.md) provides advanced debugging support, TensorFlow also has an
  operation to directly print the value of a `tf.Tensor`.
 
 Note that you rarely want to use the following pattern when printing a
diff --git a/tensorflow/docs_src/guide/using_gpu.md b/tensorflow/docs_src/guide/using_gpu.md
index c0218fd12e..8cb9b354c7 100644
--- a/tensorflow/docs_src/guide/using_gpu.md
+++ b/tensorflow/docs_src/guide/using_gpu.md
@@ -211,5 +211,5 @@ AddN: /job:localhost/replica:0/task:0/cpu:0
  [  98.  128.]]
 ```
 
-The @{$deep_cnn$cifar10 tutorial} is a good example
+The [cifar10 tutorial](../tutorials/images/deep_cnn.md) is a good example
 demonstrating how to do training with multiple GPUs.
diff --git a/tensorflow/docs_src/guide/using_tpu.md b/tensorflow/docs_src/guide/using_tpu.md
index 90a663b75e..59b34e19e0 100644
--- a/tensorflow/docs_src/guide/using_tpu.md
+++ b/tensorflow/docs_src/guide/using_tpu.md
@@ -22,8 +22,8 @@ Standard `Estimators` can drive models on CPU and GPUs. You must use
 `tf.contrib.tpu.TPUEstimator` to drive a model on TPUs.
 
 Refer to TensorFlow's Getting Started section for an introduction to the basics
-of using a @{$premade_estimators$pre-made `Estimator`}, and
-@{$custom_estimators$custom `Estimator`s}.
+of using a [pre-made `Estimator`](../guide/premade_estimators.md), and
+[custom `Estimator`s](../guide/custom_estimators.md).
 
 The `TPUEstimator` class differs somewhat from the `Estimator` class.
 
@@ -171,9 +171,9 @@ This section details the changes you must make to the model function
 During regular usage TensorFlow attempts to determine the shapes of each
 `tf.Tensor` during graph construction. During execution any unknown shape
 dimensions are determined dynamically,
-see @{$guide/tensors#shape$Tensor Shapes} for more details.
+see [Tensor Shapes](../guide/tensors.md#shape) for more details.
 
-To run on Cloud TPUs TensorFlow models are compiled using @{$xla$XLA}.
+To run on Cloud TPUs TensorFlow models are compiled using [XLA](../performance/xla/index.md).
 XLA uses a similar system for determining shapes at compile time. XLA requires
 that all tensor dimensions be statically defined at compile time. All shapes
 must evaluate to a constant, and not depend on external data, or stateful
@@ -184,7 +184,7 @@ operations like variables or a random number generator.
 
 Remove any use of `tf.summary` from your model.
 
-@{$summaries_and_tensorboard$TensorBoard summaries} are a great way see inside
+[TensorBoard summaries](../guide/summaries_and_tensorboard.md) are a great way see inside
 your model. A minimal set of basic summaries are automatically recorded by the
 `TPUEstimator`, to `event` files in the `model_dir`. Custom summaries, however,
 are currently unsupported when training on a Cloud TPU. So while the
@@ -343,7 +343,7 @@ weight when creating your `tf.metrics`.
 
 Efficient use of the `tf.data.Dataset` API is critical when using a Cloud
 TPU, as it is impossible to use the Cloud TPU's unless you can feed it data
-quickly enough. See @{$datasets_performance} for details on dataset performance.
+quickly enough. See [Input Pipeline Performance Guide](../performance/datasets_performance.md) for details on dataset performance.
 
 For all but the simplest experimentation (using
 `tf.data.Dataset.from_tensor_slices` or other in-graph data) you will need to
@@ -361,7 +361,7 @@ Small datasets can be loaded entirely into memory using
 `tf.data.Dataset.cache`.
 
 Regardless of the data format used, it is strongly recommended that you
-@{$performance_guide#use_large_files$use large files}, on the order of
+[use large files](../performance/performance_guide.md#use_large_files), on the order of
 100MB. This is especially important in this networked setting as the overhead
 of opening a file is significantly higher.
 
@@ -391,5 +391,5 @@ to make a Cloud TPU compatible model are the example models published in:
 
 For more information about tuning TensorFlow code for performance see:
 
- * The @{$performance$Performance Section.}
+ * The [Performance Section.](../performance/index.md)
 
diff --git a/tensorflow/docs_src/guide/version_compat.md b/tensorflow/docs_src/guide/version_compat.md
index 29ac066e6f..de93d225e3 100644
--- a/tensorflow/docs_src/guide/version_compat.md
+++ b/tensorflow/docs_src/guide/version_compat.md
@@ -38,6 +38,9 @@ patch versions.  The public APIs consist of
   `tensorflow` module and its submodules, except for
     * functions and classes in `tf.contrib`
     * functions and classes whose names start with `_` (as these are private)
+    * functions, arguments, properties and classes whose name starts with
+      `experimental`, or whose fully qualified name includes a module called
+      `experimental`
   Note that the code in the `examples/` and `tools/` directories is not
   reachable through the `tensorflow` Python module and is thus not covered by
   the compatibility guarantee.
@@ -75,7 +78,7 @@ backward incompatible ways between minor releases. These include:
 *   **Other languages**: TensorFlow APIs in languages other than Python and C,
     such as:
 
-  - @{$cc/guide$C++} (exposed through header files in
+  - [C++](../api_guides/cc/guide.md) (exposed through header files in
     [`tensorflow/cc`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/cc)).
   - [Java](../api_docs/java/reference/org/tensorflow/package-summary),
   - [Go](https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)
@@ -98,7 +101,7 @@ backward incompatible ways between minor releases. These include:
     accuracy for the overall system.
 
 *   **Random numbers:** The specific random numbers computed by the
-    @{$python/constant_op#Random_Tensors$random ops} may change at any time.
+    [random ops](../api_guides/python/constant_op.md#Random_Tensors) may change at any time.
     Users should rely only on approximately correct distributions and
     statistical strength, not the specific bits computed. However, we will make
     changes to random bits rarely (or perhaps never) for patch releases.  We
@@ -175,6 +178,8 @@ This section is relevant only when making incompatible changes to the `GraphDef`
 format, such as when adding ops, removing ops, or changing the functionality
 of existing ops.  The previous section should suffice for most users.
 
+<a id="backward_forward"/>
+
 ### Backward and partial forward compatibility
 
 Our versioning scheme has three requirements:
diff --git a/tensorflow/docs_src/install/index.md b/tensorflow/docs_src/install/index.md
index 55481cc400..76e590e1e1 100644
--- a/tensorflow/docs_src/install/index.md
+++ b/tensorflow/docs_src/install/index.md
@@ -17,23 +17,23 @@ systems listed above.
 The following guides explain how to install a version of TensorFlow
 that enables you to write applications in Python:
 
-  * @{$install_linux$Install TensorFlow on Ubuntu}
-  * @{$install_mac$Install TensorFlow on macOS}
-  * @{$install_windows$Install TensorFlow on Windows}
-  * @{$install_raspbian$Install TensorFlow on a Raspberry Pi}
-  * @{$install_sources$Install TensorFlow from source code}
+  * [Install TensorFlow on Ubuntu](../install/install_linux.md)
+  * [Install TensorFlow on macOS](../install/install_mac.md)
+  * [Install TensorFlow on Windows](../install/install_windows.md)
+  * [Install TensorFlow on a Raspberry Pi](../install/install_raspbian.md)
+  * [Install TensorFlow from source code](../install/install_sources.md)
 
 Many aspects of the Python TensorFlow API changed from version 0.n to 1.0.
 The following guide explains how to migrate older TensorFlow applications
 to Version 1.0:
 
-  * @{$migration$Transition to TensorFlow 1.0}
+  * [Transition to TensorFlow 1.0](../install/migration.md)
 
 The following guides explain how to install TensorFlow libraries for use in
 other programming languages. These APIs are aimed at deploying TensorFlow
 models in applications and are not as extensive as the Python APIs.
 
-  * @{$install_java$Install TensorFlow for Java}
-  * @{$install_c$Install TensorFlow for C}
-  * @{$install_go$Install TensorFlow for Go}
+  * [Install TensorFlow for Java](../install/install_java.md)
+  * [Install TensorFlow for C](../install/install_c.md)
+  * [Install TensorFlow for Go](../install/install_go.md)
 
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 4a63f11fca..084634bc9c 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -28,8 +28,8 @@ enable TensorFlow for C:
      entitled "Determine which TensorFlow to install" in one of the
      following guides:
 
-       * @{$install_linux#determine_which_tensorflow_to_install$Installing TensorFlow on Linux}
-       * @{$install_mac#determine_which_tensorflow_to_install$Installing TensorFlow on macOS}
+       * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install)
+       * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install)
 
   2. Download and extract the TensorFlow C library into `/usr/local/lib` by
      invoking the following shell commands:
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index f0f8436777..0c604d7713 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -29,8 +29,8 @@ steps to install this library and enable TensorFlow for Go:
      the help of GPU(s). To help you decide, read the section entitled
      "Determine which TensorFlow to install" in one of the following guides:
 
-     * @{$install_linux#determine_which_tensorflow_to_install$Installing TensorFlow on Linux}
-     * @{$install_mac#determine_which_tensorflow_to_install$Installing TensorFlow on macOS}
+     * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install)
+     * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install)
 
   2. Download and extract the TensorFlow C library into `/usr/local/lib` by
      invoking the following shell commands:
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index c131a2ea76..c411cb78fe 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -135,7 +135,7 @@ instead:
 
 GPU acceleration is available via Maven only for Linux and only if your system
 meets the
-@{$install_linux#determine_which_tensorflow_to_install$requirements for GPU}.
+[requirements for GPU](../install/install_linux.md#determine_which_tensorflow_to_install).
 
 ## Using TensorFlow with JDK
 
@@ -155,8 +155,8 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
      the help of GPU(s). To help you decide, read the section entitled
      "Determine which TensorFlow to install" in one of the following guides:
 
-     * @{$install_linux#determine_which_tensorflow_to_install$Installing TensorFlow on Linux}
-     * @{$install_mac#determine_which_tensorflow_to_install$Installing TensorFlow on macOS}
+     * [Installing TensorFlow on Linux](../install/install_linux.md#determine_which_tensorflow_to_install)
+     * [Installing TensorFlow on macOS](../install/install_mac.md#determine_which_tensorflow_to_install)
 
   3. Download and extract the appropriate Java Native Interface (JNI)
      file for your operating system and processor support by running the
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 0febdee99f..5fcfa4b988 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -520,7 +520,7 @@ The following NVIDIA® <i>software</i> must be installed on your system:
 
 To use a GPU with CUDA Compute Capability 3.0, or different versions of the
 preceding NVIDIA libraries see
-@{$install_sources$installing TensorFlow from Sources}. If using Ubuntu 16.04
+[installing TensorFlow from Sources](../install/install_sources.md). If using Ubuntu 16.04
 and possibly other Debian based linux distros, `apt-get` can be used with the
 NVIDIA repository to simplify installation.
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index dfd9fbce4b..44ea18fa7b 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -180,16 +180,16 @@ If you follow these instructions, you will not need to disable SIP.
 
 After installing pip, invoke the following commands:
 
-<pre> $ <b>sudo pip install six numpy wheel mock h5py</b>
- $ <b>sudo pip install keras_applications==1.0.4 --no-deps</b>
- $ <b>sudo pip install keras_preprocessing==1.0.2 --no-deps</b>
+<pre> $ <b>pip install six numpy wheel mock h5py</b>
+ $ <b>pip install keras_applications==1.0.5 --no-deps</b>
+ $ <b>pip install keras_preprocessing==1.0.3 --no-deps</b>
 </pre>
 
 Note: These are just the minimum requirements to _build_ tensorflow. Installing
 the pip package will download additional packages required to _run_ it. If you
 plan on executing tasks directly with `bazel` , without the pip installation,
 you may need to install additional python packages. For example, you should `pip
-install mock enum34` before running TensorFlow's tests with bazel.
+install enum34` before running TensorFlow's tests with bazel.
 
 <a name="ConfigureInstallation"></a>
 
@@ -364,6 +364,8 @@ continue to work against your built package.
 If RAM is an issue on your system, you may limit RAM usage by specifying
 <code>--local_resources 2048,.5,1.0</code> while invoking `bazel`.
 
+### Run the build_pip_package script
+
 The <code>bazel build</code> command builds a script named `build_pip_package`.
 Running this script as follows will build a `.whl` file within the
 `/tmp/tensorflow_pkg` directory:
diff --git a/tensorflow/docs_src/install/install_sources_windows.md b/tensorflow/docs_src/install/install_sources_windows.md
index a1da122317..40dce106d6 100644
--- a/tensorflow/docs_src/install/install_sources_windows.md
+++ b/tensorflow/docs_src/install/install_sources_windows.md
@@ -94,8 +94,8 @@ Assume you already have `pip3` in `%PATH%`, issue the following command:
 
 <pre>
 C:\> <b>pip3 install six numpy wheel</b>
-C:\> <b>pip3 install keras_applications==1.0.4 --no-deps</b>
-C:\> <b>pip3 install keras_preprocessing==1.0.2 --no-deps</b>
+C:\> <b>pip3 install keras_applications==1.0.5 --no-deps</b>
+C:\> <b>pip3 install keras_preprocessing==1.0.3 --no-deps</b>
 </pre>
 
 <a name="InstallCUDA"></a>
diff --git a/tensorflow/docs_src/performance/index.md b/tensorflow/docs_src/performance/index.md
index 131d28fa3e..a0f26a8c3a 100644
--- a/tensorflow/docs_src/performance/index.md
+++ b/tensorflow/docs_src/performance/index.md
@@ -7,18 +7,18 @@ details on the high level APIs to use along with best practices to build
 and train high performance models, and quantize models for the least latency
 and highest throughput for inference.
 
-  * @{$performance_guide$Performance Guide} contains a collection of best
+  * [Performance Guide](../performance/performance_guide.md) contains a collection of best
     practices for optimizing your TensorFlow code.
 
-  * @{$datasets_performance$Data input pipeline guide} describes the tf.data
+  * [Data input pipeline guide](../performance/datasets_performance.md) describes the tf.data
     API for building efficient data input pipelines for TensorFlow.
 
-  * @{$performance/benchmarks$Benchmarks} contains a collection of
+  * [Benchmarks](../performance/benchmarks.md) contains a collection of
     benchmark results for a variety of hardware configurations.
 
   * For improving inference efficiency on mobile and
     embedded hardware, see
-    @{$quantization$How to Quantize Neural Networks with TensorFlow}, which
+    [How to Quantize Neural Networks with TensorFlow](../performance/quantization.md), which
     explains how to use quantization to reduce model size, both in storage
     and at runtime.
 
@@ -31,20 +31,20 @@ XLA (Accelerated Linear Algebra) is an experimental compiler for linear
 algebra that optimizes TensorFlow computations. The following guides explore
 XLA:
 
-  * @{$xla$XLA Overview}, which introduces XLA.
-  * @{$broadcasting$Broadcasting Semantics}, which describes XLA's
+  * [XLA Overview](../performance/xla/index.md), which introduces XLA.
+  * [Broadcasting Semantics](../performance/xla/broadcasting.md), which describes XLA's
     broadcasting semantics.
-  * @{$developing_new_backend$Developing a new back end for XLA}, which
+  * [Developing a new back end for XLA](../performance/xla/developing_new_backend.md), which
     explains how to re-target TensorFlow in order to optimize the performance
     of the computational graph for particular hardware.
-  * @{$jit$Using JIT Compilation}, which describes the XLA JIT compiler that
+  * [Using JIT Compilation](../performance/xla/jit.md), which describes the XLA JIT compiler that
     compiles and runs parts of TensorFlow graphs via XLA in order to optimize
     performance.
-  * @{$operation_semantics$Operation Semantics}, which is a reference manual
+  * [Operation Semantics](../performance/xla/operation_semantics.md), which is a reference manual
     describing the semantics of operations in the `ComputationBuilder`
     interface.
-  * @{$shapes$Shapes and Layout}, which details the `Shape` protocol buffer.
-  * @{$tfcompile$Using AOT compilation}, which explains `tfcompile`, a
+  * [Shapes and Layout](../performance/xla/shapes.md), which details the `Shape` protocol buffer.
+  * [Using AOT compilation](../performance/xla/tfcompile.md), which explains `tfcompile`, a
     standalone tool that compiles TensorFlow graphs into executable code in
     order to optimize performance.
 
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index df70309568..9ea1d6a705 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -41,7 +41,7 @@ approaches to identifying issues:
     utilization is not approaching 80-100%, then the input pipeline may be the
     bottleneck.
 *   Generate a timeline and look for large blocks of white space (waiting). An
-    example of generating a timeline exists as part of the @{$jit$XLA JIT}
+    example of generating a timeline exists as part of the [XLA JIT](../performance/xla/jit.md)
     tutorial.
 *   Check CPU usage. It is possible to have an optimized input pipeline and lack
     the CPU cycles to process the pipeline.
@@ -68,7 +68,7 @@ the CPU.
 
 #### Using the tf.data API
 
-The @{$datasets$tf.data API} is replacing `queue_runner` as the recommended API
+The [tf.data API](../guide/datasets.md) is replacing `queue_runner` as the recommended API
 for building input pipelines. This
 [ResNet example](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator/cifar10_main.py)
 ([arXiv:1512.03385](https://arxiv.org/abs/1512.03385))
@@ -78,7 +78,7 @@ training CIFAR-10 illustrates the use of the `tf.data` API along with
 The `tf.data` API utilizes C++ multi-threading and has a much lower overhead
 than the Python-based `queue_runner` that is limited by Python's multi-threading
 performance. A detailed performance guide for the `tf.data` API can be found
-@{$datasets_performance$here}.
+[here](../performance/datasets_performance.md).
 
 While feeding data using a `feed_dict` offers a high level of flexibility, in
 general `feed_dict` does not provide a scalable solution. If only a single GPU
@@ -174,7 +174,7 @@ faster using `NHWC` than the normally most efficient `NCHW`.
 ### Common fused Ops
 
 Fused Ops combine multiple operations into a single kernel for improved
-performance. There are many fused Ops within TensorFlow and @{$xla$XLA} will
+performance. There are many fused Ops within TensorFlow and [XLA](../performance/xla/index.md) will
 create fused Ops when possible to automatically improve performance. Collected
 below are select fused Ops that can greatly improve performance and may be
 overlooked.
@@ -257,7 +257,7 @@ the CPU in use. Speedups for training and inference on CPU are documented below
 in [Comparing compiler optimizations](#comparing-compiler-optimizations).
 
 To install the most optimized version of TensorFlow,
-@{$install_sources$build and install} from source. If there is a need to build
+[build and install](../install/install_sources.md) from source. If there is a need to build
 TensorFlow on a platform that has different hardware than the target, then
 cross-compile with the highest optimizations for the target platform. The
 following command is an example of using `bazel` to compile for a specific
@@ -298,7 +298,7 @@ each of the towers. How each tower gets the updated variables and how the
 gradients are applied has an impact on the performance, scaling, and convergence
 of the model.  The rest of this section provides an overview of variable
 placement and the towering of a model on multiple GPUs.
-@{$performance_models$High-Performance Models} gets into more details regarding
+[High-Performance Models](../performance/performance_models.md) gets into more details regarding
 more complex methods that can be used to share and update variables between
 towers.
 
@@ -307,7 +307,7 @@ and even how the hardware has been configured. An example of this, is that two
 systems can be built with NVIDIA Tesla P100s but one may be using PCIe and the
 other [NVLink](http://www.nvidia.com/object/nvlink.html). In that scenario, the
 optimal solution for each system may be different. For real world examples, read
-the @{$performance/benchmarks$benchmark} page which details the settings that
+the [benchmark](../performance/benchmarks.md) page which details the settings that
 were optimal for a variety of platforms. Below is a summary of what was learned
 from benchmarking various platforms and configurations:
 
@@ -433,7 +433,7 @@ scenarios.
 ## Optimizing for CPU
 
 CPUs, which includes Intel® Xeon Phi™, achieve optimal performance when
-TensorFlow is @{$install_sources$built from source} with all of the instructions
+TensorFlow is [built from source](../install/install_sources.md) with all of the instructions
 supported by the target CPU.
 
 Beyond using the latest instruction sets, Intel® has added support for the
diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
index 66bf684d5b..151c0b2946 100644
--- a/tensorflow/docs_src/performance/performance_models.md
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -9,7 +9,7 @@ incorporated into high-level APIs.
 
 ## Input Pipeline
 
-The @{$performance_guide$Performance Guide} explains how to identify possible
+The [Performance Guide](../performance/performance_guide.md) explains how to identify possible
 input pipeline issues and best practices. We found that using `tf.FIFOQueue`
 and `tf.train.queue_runner` could not saturate multiple current generation GPUs
 when using large inputs and processing with higher samples per second, such
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 4499f5715c..3326d82964 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -80,7 +80,7 @@ need for a separate calibration step.
 TensorFlow can train models with quantization in the loop. Because training
 requires small gradient adjustments, floating point values are still used. To
 keep models as floating point while adding the quantization error in the training
-loop, @{$array_ops#Fake_quantization$fake quantization} nodes simulate the
+loop, [fake quantization](../api_guides/python/array_ops.md#Fake_quantization) nodes simulate the
 effect of quantization in the forward and backward passes.
 
 Since it's difficult to add these fake quantization operations to all the
diff --git a/tensorflow/docs_src/performance/xla/index.md b/tensorflow/docs_src/performance/xla/index.md
index 8f5de83ea6..770737c34c 100644
--- a/tensorflow/docs_src/performance/xla/index.md
+++ b/tensorflow/docs_src/performance/xla/index.md
@@ -14,7 +14,7 @@ XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear
 algebra that optimizes TensorFlow computations. The results are improvements in
 speed, memory usage, and portability on server and mobile platforms. Initially,
 most users will not see large benefits from XLA, but are welcome to experiment
-by using XLA via @{$jit$just-in-time (JIT) compilation} or @{$tfcompile$ahead-of-time (AOT) compilation}. Developers targeting new hardware accelerators are
+by using XLA via [just-in-time (JIT) compilation](../../performance/xla/jit.md) or [ahead-of-time (AOT) compilation](../../performance/xla/tfcompile.md). Developers targeting new hardware accelerators are
 especially encouraged to try out XLA.
 
 The XLA framework is experimental and in active development. In particular,
@@ -54,13 +54,13 @@ We had several objectives for XLA to work with TensorFlow:
 
 The input language to XLA is called "HLO IR", or just HLO (High Level
 Optimizer). The semantics of HLO are described on the
-@{$operation_semantics$Operation Semantics} page. It
+[Operation Semantics](../../performance/xla/operation_semantics.md) page. It
 is most convenient to think of HLO as a [compiler
 IR](https://en.wikipedia.org/wiki/Intermediate_representation).
 
 XLA takes graphs ("computations") defined in HLO and compiles them into machine
 instructions for various architectures. XLA is modular in the sense that it is
-easy to slot in an alternative backend to @{$developing_new_backend$target some novel HW architecture}. The CPU backend for x64 and ARM64 as
+easy to slot in an alternative backend to [target some novel HW architecture](../../performance/xla/developing_new_backend.md). The CPU backend for x64 and ARM64 as
 well as the NVIDIA GPU backend are in the TensorFlow source tree.
 
 The following diagram shows the compilation process in XLA:
@@ -94,5 +94,5 @@ CPU backend supports multiple CPU ISAs.
 
 ## Supported Platforms
 
-XLA currently supports @{$jit$JIT compilation} on x86-64 and NVIDIA GPUs; and
-@{$tfcompile$AOT compilation} for x86-64 and ARM.
+XLA currently supports [JIT compilation](../../performance/xla/jit.md) on x86-64 and NVIDIA GPUs; and
+[AOT compilation](../../performance/xla/tfcompile.md) for x86-64 and ARM.
diff --git a/tensorflow/docs_src/performance/xla/jit.md b/tensorflow/docs_src/performance/xla/jit.md
index 7202ef47f7..83b3e71566 100644
--- a/tensorflow/docs_src/performance/xla/jit.md
+++ b/tensorflow/docs_src/performance/xla/jit.md
@@ -133,7 +133,7 @@ Execute the python script to train the model with XLA and turn on a debugging
 feature of XLA via an environmental variable that outputs the XLA graph.
 
 ```shell
-TF_XLA_FLAGS=--xla_generate_hlo_graph=.* python mnist_softmax_xla.py
+TF_XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py
 ```
 
 Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index e24a7cda73..c23a7ad9e2 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -505,16 +505,17 @@ Computes a convolution of the kind used in neural networks. Here, a convolution
 can be thought of as a n-dimensional window moving across a n-dimensional base
 area and a computation is performed for each possible position of the window.
 
-| Arguments        | Type                    | Semantics                     |
-| ---------------- | ----------------------- | ----------------------------- |
-| `lhs`            | `XlaOp`                 | rank n+2 array of inputs      |
-| `rhs`            | `XlaOp`                 | rank n+2 array of kernel      |
-:                  :                         : weights                       :
-| `window_strides` | `ArraySlice<int64>`     | n-d array of kernel strides   |
-| `padding`        | `ArraySlice<pair<int64, | n-d array of (low, high)      |
-:                  : int64>>`                : padding                       :
-| `lhs_dilation`   | `ArraySlice<int64>`     | n-d lhs dilation factor array |
-| `rhs_dilation`   | `ArraySlice<int64>`     | n-d rhs dilation factor array |
+| Arguments             | Type                 | Semantics                     |
+| --------------------- | -------------------- | ----------------------------- |
+| `lhs`                 | `XlaOp`              | rank n+2 array of inputs      |
+| `rhs`                 | `XlaOp`              | rank n+2 array of kernel      |
+:                       :                      : weights                       :
+| `window_strides`      | `ArraySlice<int64>`  | n-d array of kernel strides   |
+| `padding`             | `ArraySlice<         | n-d array of (low, high)      |
+:                       : pair<int64, int64>>` : padding                       :
+| `lhs_dilation`        | `ArraySlice<int64>`  | n-d lhs dilation factor array |
+| `rhs_dilation`        | `ArraySlice<int64>`  | n-d rhs dilation factor array |
+| `feature_group_count` | int64                | the number of feature groups  |
 
 Let n be the number of spatial dimensions. The `lhs` argument is a rank n+2
 array describing the base area. This is called the input, even though of course
@@ -532,8 +533,8 @@ The `rhs` argument is a rank n+2 array describing the convolutional
 filter/kernel/window. The dimensions are, in this order:
 
 *   `output-z`: The `z` dimension of the output.
-*   `input-z`: The size of this dimension should equal the size of the `z`
-    dimension in lhs.
+*   `input-z`: The size of this dimension times `feature_group_count` should
+    equal the size of the `z` dimension in lhs.
 *   `spatial_dims`: Describes the `n` spatial dimensions that define the n-d
     window that moves across the base area.
 
@@ -566,6 +567,24 @@ Dilation of the rhs is also called atrous convolution. For more details, see
 `tf.nn.atrous_conv2d`. Dilation of the lhs is also called transposed
 convolution. For more details, see `tf.nn.conv2d_transpose`.
 
+The `feature_group_count` argument (default value 1) can be used for grouped
+convolutions. `feature_group_count` needs to be a divisor of both the input and
+the output feature dimension. If `feature_group_count` is greater than 1, it
+means that conceptually the input and output feature dimension and the `rhs`
+output feature dimension are split evenly into `feature_group_count` many
+groups, each group consisting of a consecutive subsequence of features. The
+input feature dimension of `rhs` needs to be equal to the `lhs` input feature
+dimension divided by `feature_group_count` (so it already has the size of a
+group of input features). The i-th groups are used together to compute
+`feature_group_count` many separate convolutions. The results of these
+convolutions are concatenated together in the output feature dimension.
+
+For depthwise convolution the `feature_group_count` argument would be set to the
+input feature dimension, and the filter would be reshaped from
+`[filter_height, filter_width, in_channels, channel_multiplier]` to
+`[filter_height, filter_width, 1, in_channels * channel_multiplier]`. For more
+details, see `tf.nn.depthwise_conv2d`.
+
 The output shape has these dimensions, in this order:
 
 *   `batch`: Same size as `batch` on the input (`lhs`).
@@ -1009,7 +1028,7 @@ Arguments | Type    | Semantics
 `rhs`     | `XlaOp` | right-hand-side operand: array of type T
 
 The arguments' shapes have to be either similar or compatible. See the
-@{$broadcasting$broadcasting} documentation about what it means for shapes to
+[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
 be compatible. The result of an operation has a shape which is the result of
 broadcasting the two input arrays. In this variant, operations between arrays of
 different ranks are *not* supported, unless one of the operands is a scalar.
@@ -1017,6 +1036,10 @@ different ranks are *not* supported, unless one of the operands is a scalar.
 When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
 absolute value of the result is always less than the divisor's absolute value.
 
+Integer division overflow (signed/unsigned division/remainder by zero or signed
+divison/remainder of `INT_SMIN` with `-1`) produces an implementation defined
+value.
+
 An alternative variant with different-rank broadcasting support exists for these
 operations:
 
@@ -1033,7 +1056,7 @@ the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
 shape are filled with dimensions of size one. Degenerate-dimension broadcasting
 then broadcasts the shapes along these degenerate dimensions to equalize the
 shapes of both operands. The semantics are described in detail on the
-@{$broadcasting$broadcasting page}.
+[broadcasting page](../../performance/xla/broadcasting.md).
 
 ## Element-wise comparison operations
 
@@ -1056,7 +1079,7 @@ Arguments | Type    | Semantics
 `rhs`     | `XlaOp` | right-hand-side operand: array of type T
 
 The arguments' shapes have to be either similar or compatible. See the
-@{$broadcasting$broadcasting} documentation about what it means for shapes to
+[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
 be compatible. The result of an operation has a shape which is the result of
 broadcasting the two input arrays with the element type `PRED`. In this variant,
 operations between arrays of different ranks are *not* supported, unless one of
@@ -1073,7 +1096,7 @@ matrix to a vector).
 
 The additional `broadcast_dimensions` operand is a slice of integers specifying
 the dimensions to use for broadcasting the operands. The semantics are described
-in detail on the @{$broadcasting$broadcasting page}.
+in detail on the [broadcasting page](../../performance/xla/broadcasting.md).
 
 ## Element-wise unary functions
 
@@ -1119,7 +1142,7 @@ array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
 ## Gather
 
 The XLA gather operation stitches together several slices (each slice at a
-potentially different runtime offset) of an input tensor into an output tensor.
+potentially different runtime offset) of an input array.
 
 ### General Semantics
 
@@ -1127,151 +1150,141 @@ See also
 [`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 For a more intuitive description, see the "Informal Description" section below.
 
-<b> `gather(operand, gather_indices, output_window_dims, elided_window_dims, window_bounds, gather_dims_to_operand_dims)` </b>
+<b> `gather(operand, start_indices, offset_dims, collapsed_slice_dims, slice_sizes, start_index_map)` </b>
 
 |Arguments         | Type                    | Semantics                       |
 |----------------- | ----------------------- | --------------------------------|
-|`operand`         | `XlaOp`                 | The tensor we’re gathering      |
+|`operand`         | `XlaOp`                 | The array we’re gathering       |
 :                  :                         : from.                           :
-|`gather_indices`  | `XlaOp`                 | Tensor containing the starting  |
-:                  :                         : indices of the slices we're     :
-:                  :                         : stitching together into the     :
-:                  :                         : output tensor.                  :
-|`index_vector_dim`  | `int64`               | The dimension in                |
-:                  :                         : `gather_indices` that contains  :
-:                  :                         : the starting indices.           :
-|`output_window_dims` | `ArraySlice<int64>`  | The set of dimensions in the    |
-:                  :                         : output shape that are _window   :
-:                  :                         : dimensions_ (defined below).    :
-:                  :                         : Not all window dimensions may   :
-:                  :                         : be present in the output shape. :
-|`elided_window_dims` | `ArraySlice<int64>`  | The set of _window dimensions_  |
-:                  :            : that are not present in the output shape.    :
-:                  :            : `window_bounds[i]` must be `1` for all `i`   :
-:                  :            : in `elided_window_dims`.                     :
-|`window_bounds`   | `ArraySlice<int64>`    | `window_bounds[i]` is the bounds |
-:                  :            : for  window dimension `i`. This includes     :
-:                  :            : both the window dimensions that are          :
-:                  :            : explicitly part of the output shape (via     :
-:                  :            : `output_window_dims`) and the window         :
-:                  :            : dimensions that are elided (via              :
-:                  :            : `elided_window_dims`).                       :
-|`gather_dims_to_operand_dims` | `ArraySlice<int64>` | A dimension map (the    |
-:                  :            : array is interpreted as mapping `i` to       :
-:                  :            : `gather_dims_to_operand_dims[i]`)  from      :
-:                  :            : the gather indices in `gather_indices` to    :
-:                  :            : the operand index space.  It has to be       :
-:                  :            : one-to-one and total.                        :
-
-For every index `Out` in the output tensor, we compute two things (more
-precisely described later):
-
-  - An index into `gather_indices.rank` - `1` dimensions of `gather_indices`,
-    which gives us a starting index of a slice, _operand slice_, in the operand
-    tensor.  These `gather_indices.rank` - `1` dimensions are all the dimensions
-    in `gather_indices` except `index_vector_dim`.
-
-  - A _window index_ that has the same rank as the operand.  This index is
-    composed of the values in `Out` at dimensions `output_window_dims`, embedded
-    with zeroes according to `elided_window_dims`.
-
-The _window index_ is the relative index of the element in _operand slice_ that
-should be present in the output at index `Out`.
-
-The output is a tensor of rank `output_window_dims.size` + `gather_indices.rank`
-- `1`.  Additionally, as a shorthand, we define `output_gather_dims` of type
-`ArraySlice<int64>` as the set of dimensions in the output shape but not in
-`output_window_dims`, in ascending order.  E.g. if the output tensor has rank
-`5`, `output_window_dims` is {`2`, `4`} then `output_gather_dims` is {`0`, `1`,
-`3`}
-
-If `index_vector_dim` is equal to `gather_indices.rank` we implicitly
-consider `gather_indices` to have a trailing `1` dimension (i.e. if
-`gather_indices` was of shape `[6,7]` and `index_vector_dim` is `2` then
-we implicitly consider the shape of `gather_indices` to be `[6,7,1]`).
-
-The bounds for the output tensor along dimension `i` is computed as follows:
-
-  1. If `i` is present in `output_gather_dims` (i.e. is equal to
-     `output_gather_dims[k]` for some `k`) then we pick the corresponding
-     dimension bounds out of `gather_indices.shape`, skipping
-     `index_vector_dim` (i.e. pick `gather_indices.shape.dims`[`k`] if `k`
-     < `index_vector_dim` and `gather_indices.shape.dims`[`k`+`1`]
-     otherwise).
-  2. If `i` is present in `output_window_dims` (i.e. equal to
-     `output_window_dims`[`k`] for some `k`) then we pick the corresponding
-     bound out of `window_bounds` after accounting for `elided_window_dims`
-     (i.e. we pick `adjusted_window_bounds`[`k`] where `adjusted_window_bounds`
-     is `window_bounds` with the bounds at indices `elided_window_dims`
-     removed).
-
-The operand index `In` corresponding to an output index `Out` is computed as
-follows:
-
-  1. Let `G` = { `Out`[`k`] for `k` in `output_gather_dims` }.  Use `G` to slice
-     out vector `S` such that `S`[`i`] = `gather_indices`[Combine(`G`, `i`)]
-     where Combine(A, b) inserts b at position `index_vector_dim` into A.
-     Note that this is well defined even if `G` is empty -- if `G` is empty then
-     `S` = `gather_indices`.
-  2. Create an index, `S`<sub>`in`</sub>, into `operand` using `S` by
-     scattering `S` using the `gather_dims_to_operand_dims` map
-     (`S`<sub>`in`</sub> is the starting indices for _operand slice_ mentioned
-     above).  More precisely:
-       1. `S`<sub>`in`</sub>[`gather_dims_to_operand_dims`[`k`]] = `S`[`k`] if `k` <
-          `gather_dims_to_operand_dims.size`.
+|`start_indices`   | `XlaOp`                 | Array containing the starting  |
+:                  :                         : indices of the slices we gather.:
+|`index_vector_dim` | `int64`                | The dimension in                |
+:                  :                         : `start_indices` that "contains" :
+:                  :                         : the starting indices.  See      :
+:                  :                         : below for a detailed            :
+:                  :                         : description.                    :
+|`offset_dims`     | `ArraySlice<int64>`     | The set of dimensions in  the   :
+:                  :                         : output shape that offset into a :
+:                  :                         : array sliced from operand.     :
+|`slice_sizes`     | `ArraySlice<int64>`      | `slice_sizes[i]` is the bounds |
+:                  :                          : for the slice on dimension `i`.:
+|`collapsed_slice_dims` | `ArraySlice<int64>` | The set of dimensions in each  :
+|                  :                          | slice that are collapsed away. :
+|                  :                          | These dimensions must have size:
+|                  :                          | 1.                             |
+|`start_index_map` | `ArraySlice<int64>`      | A map that describes how to map|
+:                  :                          : indices in `start_indices` to  :
+:                  :                          : to legal indices into operand. :
+
+For convenience, we label dimensions in the output array not in `offset_dims`
+as `batch_dims`.
+
+The output is an array of rank `batch_dims.size` + `operand.rank` -
+`collapsed_slice_dims`.size.
+
+If `index_vector_dim` is equal to `start_indices.rank` we implicitly consider
+`start_indices` to have a trailing `1` dimension (i.e. if `start_indices` was of
+shape `[6,7]` and `index_vector_dim` is `2` then we implicitly consider the
+shape of `start_indices` to be `[6,7,1]`).
+
+The bounds for the output array along dimension `i` is computed as follows:
+
+  1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for
+     some `k`) then we pick the corresponding dimension bounds out of
+     `start_indices.shape`, skipping `index_vector_dim` (i.e. pick
+     `start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and
+     `start_indices.shape.dims`[`k`+`1`] otherwise).
+
+  2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for
+     some `k`) then we pick the corresponding bound out of `slice_sizes` after
+     accounting for `collapsed_slice_dims` (i.e. we pick
+     `adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes`
+     with the bounds at indices `collapsed_slice_dims` removed).
+
+Formally, the operand index `In` corresponding to an output index `Out` is
+computed as follows:
+
+  1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }.  Use `G` to slice out
+     vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where
+     Combine(A, b) inserts b at position `index_vector_dim` into A.  Note that
+     this is well defined even if `G` is empty -- if `G` is empty then `S` =
+     `start_indices`.
+
+  2. Create a starting index, `S`<sub>`in`</sub>, into `operand` using `S` by
+     scattering `S` using `start_index_map`.  More precisely:
+       1. `S`<sub>`in`</sub>[`start_index_map`[`k`]] = `S`[`k`] if `k` <
+          `start_index_map.size`.
        2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
-  3. Create an index `W`<sub>`in`</sub> into `operand` by scattering the indices
-     at the output window dimensions in `Out` according to
-     the `elided_window_dims` set (`W`<sub>`in`</sub> is the _window index_
-     mentioned above).  More precisely:
-       1. `W`<sub>`in`</sub>[`window_dims_to_operand_dims`(`k`)] = `Out`[`k`] if
-          `k` < `output_window_dims.size` (`window_dims_to_operand_dims` is
-          defined below).
-       2. `W`<sub>`in`</sub>[`_`] = `0` otherwise.
-  4. `In` is `W`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
+
+  3. Create an index `O`<sub>`in`</sub> into `operand` by scattering the indices
+     at the offset dimensions in `Out` according to the `collapsed_slice_dims`
+     set.  More precisely:
+       1. `O`<sub>`in`</sub>[`expand_offset_dims`(`k`)] =
+          `Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size`
+          (`expand_offset_dims` is defined below).
+       2. `O`<sub>`in`</sub>[`_`] = `0` otherwise.
+  4. `In` is `O`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
      addition.
 
-`window_dims_to_operand_dims` is the monotonic function with domain [`0`,
-`output_window_dims.size`) and range [`0`, `operand.rank`) \
-`elided_window_dims`.  So if, e.g., `output_window_dims.size` is `4`,
-`operand.rank` is `6` and `elided_window_dims` is {`0`, `2`} then
-`window_dims_to_operand_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}.
+`expand_offset_dims` is the monotonic function with domain [`0`, `offset.size`)
+and range [`0`, `operand.rank`) \ `collapsed_slice_dims`.  So if, e.g.,
+`offset.size` is `4`, `operand.rank` is `6` and `collapsed_slice_dims` is {`0`,
+`2`} then `expand_offset_dims` is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}.
 
 ### Informal Description and Examples
 
-`index_vector_dim` is set to `gather_indices.rank` - `1` in all of the
-examples that follow.  More interesting values for `index_vector_dim`
-does not change the operation fundamentally, but makes the visual representation
-more cumbersome.
+Informally, every index `Out` in the output array corresponds to an element `E`
+in the operand array, computed as follows:
+
+  - We use the batch dimensions in `Out` to look up a starting index from
+    `start_indices`.
+
+  - We use `start_index_map` to map the starting index (which may have size less
+    than operand.rank) to a "full" starting index into operand.
+
+  - We dynamic-slice out a slice with size `slice_sizes` using the full starting
+    index.
+
+  - We reshape the slice by collapsing the `collapsed_slice_dims` dimensions.
+    Since all collapsed slice dimensions have to have bound 1 this reshape is
+    always legal.
+
+  - We use the offset dimensions in `Out` to index into this slice to get the
+    input element, `E`, corresponding to output index `Out`.
+
+`index_vector_dim` is set to `start_indices.rank` - `1` in all of the
+examples that follow.  More interesting values for `index_vector_dim` does not
+change the operation fundamentally, but makes the visual representation more
+cumbersome.
 
 To get an intuition on how all of the above fits together, let's look at an
-example that gathers 5 slices of shape `[8,6]` from a `[16,11]` tensor.  The
-position of a slice into the `[16,11]` tensor can be represented as an index
+example that gathers 5 slices of shape `[8,6]` from a `[16,11]` array.  The
+position of a slice into the `[16,11]` array can be represented as an index
 vector of shape `S64[2]`, so the set of 5 positions can be represented as a
-`S64[5,2]` tensor.
+`S64[5,2]` array.
 
 The behavior of the gather operation can then be depicted as an index
-transformation that takes [`G`,`W`<sub>`0`</sub>,`W`<sub>`1`</sub>], an index in
-the output shape, and maps it to an element in the input tensor in the following
+transformation that takes [`G`,`O`<sub>`0`</sub>,`O`<sub>`1`</sub>], an index in
+the output shape, and maps it to an element in the input array in the following
 way:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
   <img style="width:100%" src="../../images/ops_xla_gather_0.svg">
 </div>
 
-We first select an (`X`,`Y`) vector from the gather indices tensor using `G`.
-The element in the output tensor at index
-[`G`,`W`<sub>`0`</sub>,`W`<sub>`1`</sub>] is then the element in the input
-tensor at index [`X`+`W`<sub>`0`</sub>,`Y`+`W`<sub>`1`</sub>].
+We first select an (`X`,`Y`) vector from the gather indices array using `G`.
+The element in the output array at index
+[`G`,`O`<sub>`0`</sub>,`O`<sub>`1`</sub>] is then the element in the input
+array at index [`X`+`O`<sub>`0`</sub>,`Y`+`O`<sub>`1`</sub>].
 
-`window_bounds` is `[8,6]`, which decides the range of W<sub>`0`</sub> and
+`slice_sizes` is `[8,6]`, which decides the range of W<sub>`0`</sub> and
 W<sub>`1`</sub>, and this in turn decides the bounds of the slice.
 
 This gather operation acts as a batch dynamic slice with `G` as the batch
 dimension.
 
 The gather indices may be multidimensional.  For instance, a more general
-version of the example above using a "gather indices" tensor of shape `[4,5,2]`
+version of the example above using a "gather indices" array of shape `[4,5,2]`
 would translate indices like this:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
@@ -1279,25 +1292,25 @@ would translate indices like this:
 </div>
 
 Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
-`G`<sub>`1`</sub> as the batch dimensions.  The window bounds are still `[8,6]`.
+`G`<sub>`1`</sub> as the batch dimensions.  The slice size is still `[8,6]`.
 
 The gather operation in XLA generalizes the informal semantics outlined above in
 the following ways:
 
- 1. We can configure which dimensions in the output shape are the window
-    dimensions (dimensions containing `W`<sub>`0`</sub>, `W`<sub>`1`</sub> in
-    the last example).  The output gather dimensions (dimensions containing
+ 1. We can configure which dimensions in the output shape are the offset
+    dimensions (dimensions containing `O`<sub>`0`</sub>, `O`<sub>`1`</sub> in
+    the last example).  The output batch dimensions (dimensions containing
     `G`<sub>`0`</sub>, `G`<sub>`1`</sub> in the last example) are defined to be
-    the output dimensions that are not window dimensions.
+    the output dimensions that are not offset dimensions.
 
- 2. The number of output window dimensions explicitly present in the output
+ 2. The number of output offset dimensions explicitly present in the output
     shape may be smaller than the input rank.  These "missing" dimensions, which
-    are listed explicitly as `elided_window_dims`, must have a window bound of
-    `1`.  Since they have a window bound of `1` the only valid index for them is
+    are listed explicitly as `collapsed_slice_dims`, must have a slice size of
+    `1`.  Since they have a slice size of `1` the only valid index for them is
     `0` and eliding them does not introduce ambiguity.
 
- 3. The slice extracted from the "Gather Indices" tensor ((`X`, `Y`) in the last
-    example) may have fewer elements than the input tensor rank, and an explicit
+ 3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last
+    example) may have fewer elements than the input array rank, and an explicit
     mapping dictates how the index should be expanded to have the same rank as
     the input.
 
@@ -1308,20 +1321,19 @@ As a final example, we use (2) and (3) to implement `tf.gather_nd`:
 </div>
 
 `G`<sub>`0`</sub> and `G`<sub>`1`</sub> are used to slice out a starting index
-from the gather indices tensor as usual, except the starting index has only one
-element, `X`.  Similarly, there is only one output window index with the value
-`W`<sub>`0`</sub>.  However, before being used as indices into the input tensor,
-these are expanded in accordance to "Gather Index Mapping"
-(`gather_dims_to_operand_dims` in the formal description) and "Window Mapping"
-(`window_dims_to_operand_dims` in the formal description) into
-[`0`,`W`<sub>`0`</sub>] and [`X`,`0`] respectively, adding up to
-[`X`,`W`<sub>`0`</sub>].  In other words, the output index
-[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`W`<sub>`0`</sub>] maps to the input index
+from the gather indices array as usual, except the starting index has only one
+element, `X`.  Similarly, there is only one output offset index with the value
+`O`<sub>`0`</sub>.  However, before being used as indices into the input array,
+these are expanded in accordance to "Gather Index Mapping" (`start_index_map` in
+the formal description) and "Offset Mapping" (`expand_offset_dims` in the formal
+description) into [`0`,`O`<sub>`0`</sub>] and [`X`,`0`] respectively, adding up
+to [`X`,`O`<sub>`0`</sub>].  In other words, the output index
+[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`O`<sub>`0`</sub>] maps to the input index
 [`GatherIndices`[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`0`],`X`] which gives us
 the semantics for `tf.gather_nd`.
 
-`window_bounds` for this case is `[1,11]`.  Intuitively this means that every
-index `X` in the gather indices tensor picks an entire row and the result is the
+`slice_sizes` for this case is `[1,11]`.  Intuitively this means that every
+index `X` in the gather indices array picks an entire row and the result is the
 concatenation of all these rows.
 
 ## GetTupleElement
diff --git a/tensorflow/docs_src/performance/xla/tfcompile.md b/tensorflow/docs_src/performance/xla/tfcompile.md
index e4b803164f..2e0f3774c4 100644
--- a/tensorflow/docs_src/performance/xla/tfcompile.md
+++ b/tensorflow/docs_src/performance/xla/tfcompile.md
@@ -17,7 +17,7 @@ kernels that are actually used in the computation.
 The compiler is built on top of the XLA framework. The code bridging TensorFlow
 to the XLA framework resides under
 [tensorflow/compiler](https://www.tensorflow.org/code/tensorflow/compiler/),
-which also includes support for @{$jit$just-in-time (JIT) compilation} of
+which also includes support for [just-in-time (JIT) compilation](../../performance/xla/jit.md) of
 TensorFlow graphs.
 
 ## What does tfcompile do?
@@ -116,7 +116,7 @@ tf_library(
 > [make_test_graphs.py]("https://www.tensorflow.org/code/tensorflow/compiler/aot/tests/make_test_graphs.py")
 > and specify the output location with the --out_dir flag.
 
-Typical graphs contain @{$python/state_ops$`Variables`}
+Typical graphs contain [`Variables`](../../api_guides/python/state_ops.md)
 representing the weights that are learned via training, but `tfcompile` cannot
 compile a subgraph that contain `Variables`. The
 [freeze_graph.py](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py)
diff --git a/tensorflow/docs_src/tutorials/_toc.yaml b/tensorflow/docs_src/tutorials/_toc.yaml
index 0e25208a00..c0b85497e0 100644
--- a/tensorflow/docs_src/tutorials/_toc.yaml
+++ b/tensorflow/docs_src/tutorials/_toc.yaml
@@ -37,6 +37,26 @@ toc:
     status: external
   - title: "Custom training: walkthrough"
     path: /tutorials/eager/custom_training_walkthrough
+
+- title: ML at production scale
+  style: accordion
+  section:
+  - title: Linear model with Estimators
+    path: /tutorials/estimators/linear
+  - title: Wide and deep learning
+    path: https://github.com/tensorflow/models/tree/master/official/wide_deep
+    status: external
+  - title: Boosted trees
+    path: https://github.com/tensorflow/models/tree/master/official/boosted_trees
+    status: external
+  - title: Text classifier with TF-Hub
+    path: /hub/tutorials/text_classification_with_tf_hub
+  - title: Build a CNN using Estimators
+    path: /tutorials/estimators/cnn
+
+- title: Generative models
+  style: accordion
+  section:
   - title: Text generation
     path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
     status: external
@@ -46,41 +66,25 @@ toc:
   - title: Image captioning
     path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
     status: external
-  - title: Neural Style Transfer
-    path: https://github.com/tensorflow/models/blob/master/research/nst_blogpost/4_Neural_Style_Transfer_with_Eager_Execution.ipynb
-    status: external
   - title: DCGAN
     path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
     status: external
   - title: VAE
     path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
     status: external
+
+- title: Images
+  style: accordion
+  section:
   - title: Pix2Pix
     path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb
     status: external
+  - title: Neural Style Transfer
+    path: https://github.com/tensorflow/models/blob/master/research/nst_blogpost/4_Neural_Style_Transfer_with_Eager_Execution.ipynb
+    status: external
   - title: Image Segmentation
     path: https://github.com/tensorflow/models/blob/master/samples/outreach/blogs/segmentation_blogpost/image_segmentation.ipynb
     status: external
-
-- title: ML at production scale
-  style: accordion
-  section:
-  - title: Linear model with Estimators
-    path: /tutorials/estimators/linear
-  - title: Wide and deep learning
-    path: https://github.com/tensorflow/models/tree/master/official/wide_deep
-    status: external
-  - title: Boosted trees
-    path: https://github.com/tensorflow/models/tree/master/official/boosted_trees
-    status: external
-  - title: Text classifier with TF-Hub
-    path: /hub/tutorials/text_classification_with_tf_hub
-  - title: Build a CNN using Estimators
-    path: /tutorials/estimators/cnn
-
-- title: Images
-  style: accordion
-  section:
   - title: Image recognition
     path: /tutorials/images/image_recognition
   - title: Image retraining
diff --git a/tensorflow/docs_src/tutorials/eager/index.md b/tensorflow/docs_src/tutorials/eager/index.md
index a13b396094..887c820b85 100644
--- a/tensorflow/docs_src/tutorials/eager/index.md
+++ b/tensorflow/docs_src/tutorials/eager/index.md
@@ -10,4 +10,3 @@ auto&nbsp;differentiation. Start with these notebooks, then read the
 3. <span>[Custom training: basics](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/custom_training.ipynb){:.external}</span>
 4. <span>[Custom layers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/custom_layers.ipynb){:.external}</span>
 5. [Custom training: walkthrough](/tutorials/eager/custom_training_walkthrough)
-6. <span>[Advanced example: Neural machine translation with attention](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb){:.external}</span>
diff --git a/tensorflow/docs_src/tutorials/estimators/cnn.md b/tensorflow/docs_src/tutorials/estimators/cnn.md
index 100f501cc2..2fd69f50a0 100644
--- a/tensorflow/docs_src/tutorials/estimators/cnn.md
+++ b/tensorflow/docs_src/tutorials/estimators/cnn.md
@@ -190,7 +190,7 @@ def cnn_model_fn(features, labels, mode):
 The following sections (with headings corresponding to each code block above)
 dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
-you're already experienced with CNNs and @{$custom_estimators$TensorFlow `Estimator`s},
+you're already experienced with CNNs and [TensorFlow `Estimator`s](../../guide/custom_estimators.md),
 and find the above code intuitive, you may want to skim these sections or just
 skip ahead to ["Training and Evaluating the CNN MNIST Classifier"](#train_eval_mnist).
 
@@ -501,8 +501,8 @@ if mode == tf.estimator.ModeKeys.TRAIN:
 ```
 
 > Note: For a more in-depth look at configuring training ops for Estimator model
-> functions, see @{$custom_estimators#defining-the-training-op-for-the-model$"Defining the training op for the model"}
-> in the @{$custom_estimators$"Creating Estimations in tf.estimator"} tutorial.
+> functions, see ["Defining the training op for the model"](../../guide/custom_estimators.md#defining-the-training-op-for-the-model)
+> in the ["Creating Estimations in tf.estimator"](../../guide/custom_estimators.md) tutorial.
 
 
 ### Add evaluation metrics
@@ -567,7 +567,7 @@ be saved (here, we specify the temp directory `/tmp/mnist_convnet_model`, but
 feel free to change to another directory of your choice).
 
 > Note: For an in-depth walkthrough of the TensorFlow `Estimator` API, see the
-> tutorial @{$custom_estimators$"Creating Estimators in tf.estimator."}
+> tutorial ["Creating Estimators in tf.estimator."](../../guide/custom_estimators.md)
 
 ### Set Up a Logging Hook {#set_up_a_logging_hook}
 
@@ -593,8 +593,8 @@ operation earlier when we generated the probabilities in `cnn_model_fn`.
 > Note: If you don't explicitly assign a name to an operation via the `name`
 > argument, TensorFlow will assign a default name. A couple easy ways to
 > discover the names applied to operations are to visualize your graph on
-> @{$graph_viz$TensorBoard}) or to enable the
-> @{$guide/debugger$TensorFlow Debugger (tfdbg)}.
+> [TensorBoard](../../guide/graph_viz.md)) or to enable the
+> [TensorFlow Debugger (tfdbg)](../../guide/debugger.md).
 
 Next, we create the `LoggingTensorHook`, passing `tensors_to_log` to the
 `tensors` argument. We set `every_n_iter=50`, which specifies that probabilities
@@ -686,9 +686,9 @@ Here, we've achieved an accuracy of 97.3% on our test data set.
 To learn more about TensorFlow Estimators and CNNs in TensorFlow, see the
 following resources:
 
-*   @{$custom_estimators$Creating Estimators in tf.estimator}
+*   [Creating Estimators in tf.estimator](../../guide/custom_estimators.md)
     provides an introduction to the TensorFlow Estimator API. It walks through
     configuring an Estimator, writing a model function, calculating loss, and
     defining a training op.
-*   @{$deep_cnn} walks through how to build a MNIST CNN classification model
+*   [Advanced Convolutional Neural Networks](../../tutorials/images/deep_cnn.md) walks through how to build a MNIST CNN classification model
     *without estimators* using lower-level TensorFlow operations.
diff --git a/tensorflow/docs_src/tutorials/images/deep_cnn.md b/tensorflow/docs_src/tutorials/images/deep_cnn.md
index 42ad484bbf..00996b82e6 100644
--- a/tensorflow/docs_src/tutorials/images/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/images/deep_cnn.md
@@ -40,7 +40,7 @@ designing larger and more sophisticated models in TensorFlow:
 and `tf.nn.local_response_normalization`
 (Chapter 3.3 in
 [AlexNet paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf)).
-* @{$summaries_and_tensorboard$Visualization}
+* [Visualization](../../guide/summaries_and_tensorboard.md)
 of network activities during training, including input images,
 losses and distributions of activations and gradients.
 * Routines for calculating the
@@ -114,7 +114,7 @@ The input part of the model is built by the functions `inputs()` and
 `distorted_inputs()` which read images from the CIFAR-10 binary data files.
 These files contain fixed byte length records, so we use
 `tf.FixedLengthRecordReader`.
-See @{$reading_data#reading-from-files$Reading Data} to
+See [Reading Data](../../api_guides/python/reading_data.md#reading-from-files) to
 learn more about how the `Reader` class works.
 
 The images are processed as follows:
@@ -131,10 +131,10 @@ artificially increase the data set size:
 * Randomly distort the `tf.image.random_brightness`.
 * Randomly distort the `tf.image.random_contrast`.
 
-Please see the @{$python/image$Images} page for the list of
+Please see the [Images](../../api_guides/python/image.md) page for the list of
 available distortions. We also attach an
 `tf.summary.image` to the images
-so that we may visualize them in @{$summaries_and_tensorboard$TensorBoard}.
+so that we may visualize them in [TensorBoard](../../guide/summaries_and_tensorboard.md).
 This is a good practice to verify that inputs are built correctly.
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
@@ -160,8 +160,8 @@ Layer Name | Description
 `conv2` | `tf.nn.conv2d` and `tf.nn.relu` activation.
 `norm2` | `tf.nn.local_response_normalization`.
 `pool2` | `tf.nn.max_pool`.
-`local3` | @{$python/nn$fully connected layer with rectified linear activation}.
-`local4` | @{$python/nn$fully connected layer with rectified linear activation}.
+`local3` | [fully connected layer with rectified linear activation](../../api_guides/python/nn.md).
+`local4` | [fully connected layer with rectified linear activation](../../api_guides/python/nn.md).
 `softmax_linear` | linear transformation to produce logits.
 
 Here is a graph generated from TensorBoard describing the inference operation:
@@ -205,7 +205,7 @@ We visualize it in TensorBoard with a `tf.summary.scalar`:
 
 We train the model using standard
 [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent)
-algorithm (see @{$python/train$Training} for other methods)
+algorithm (see [Training](../../api_guides/python/train.md) for other methods)
 with a learning rate that
 `tf.train.exponential_decay`
 over time.
@@ -265,7 +265,7 @@ in `cifar10_input.py`.
 
 `cifar10_train.py` periodically uses a  `tf.train.Saver` to save
 all model parameters in
-@{$guide/saved_model$checkpoint files}
+[checkpoint files](../../guide/saved_model.md)
 but it does *not* evaluate the model. The checkpoint file
 will be used by `cifar10_eval.py` to measure the predictive
 performance (see [Evaluating a Model](#evaluating-a-model) below).
@@ -282,7 +282,7 @@ how the model is training. We want more insight into the model during training:
 * Are the gradients, activations and weights reasonable?
 * What is the learning rate currently at?
 
-@{$summaries_and_tensorboard$TensorBoard} provides this
+[TensorBoard](../../guide/summaries_and_tensorboard.md) provides this
 functionality, displaying data exported periodically from `cifar10_train.py` via
 a
 `tf.summary.FileWriter`.
@@ -413,7 +413,7 @@ scope indicating that they should be run on the first GPU.
 All variables are pinned to the CPU and accessed via
 `tf.get_variable`
 in order to share them in a multi-GPU version.
-See how-to on @{$variables$Sharing Variables}.
+See how-to on [Sharing Variables](../../guide/variables.md).
 
 ### Launching and Training the Model on Multiple GPU cards
 
diff --git a/tensorflow/docs_src/tutorials/images/image_recognition.md b/tensorflow/docs_src/tutorials/images/image_recognition.md
index 83a8d97cf0..52913b2082 100644
--- a/tensorflow/docs_src/tutorials/images/image_recognition.md
+++ b/tensorflow/docs_src/tutorials/images/image_recognition.md
@@ -106,7 +106,7 @@ curl -L "https://storage.googleapis.com/download.tensorflow.org/models/inception
 
 Next, we need to compile the C++ binary that includes the code to load and run the graph.
 If you've followed
-@{$install_sources$the instructions to download the source installation of TensorFlow}
+[the instructions to download the source installation of TensorFlow](../../install/install_sources.md)
 for your platform, you should be able to build the example by
 running this command from your shell terminal:
 
@@ -448,7 +448,7 @@ and Michael Nielsen's book has a
 covering them.
 
 To find out more about implementing convolutional neural networks, you can jump
-to the TensorFlow @{$deep_cnn$deep convolutional networks tutorial},
+to the TensorFlow [deep convolutional networks tutorial](../../tutorials/images/deep_cnn.md),
 or start a bit more gently with our [Estimator MNIST tutorial](../estimators/cnn.md).
 Finally, if you want to get up to speed on research in this area, you can
 read the recent work of all the papers referenced in this tutorial.
diff --git a/tensorflow/docs_src/tutorials/representation/kernel_methods.md b/tensorflow/docs_src/tutorials/representation/kernel_methods.md
index 71e87f4d3e..67adc4951c 100644
--- a/tensorflow/docs_src/tutorials/representation/kernel_methods.md
+++ b/tensorflow/docs_src/tutorials/representation/kernel_methods.md
@@ -2,7 +2,7 @@
 
 Note: This document uses a deprecated version of `tf.estimator`,
 `tf.contrib.learn.Estimator`, which has a different interface. It also uses
-other `contrib` methods whose @{$version_compat#not_covered$API may not be stable}.
+other `contrib` methods whose [API may not be stable](../../guide/version_compat.md#not_covered).
 
 In this tutorial, we demonstrate how combining (explicit) kernel methods with
 linear models can drastically increase the latters' quality of predictions
@@ -52,7 +52,7 @@ In order to feed data to a `tf.contrib.learn Estimator`, it is helpful to conver
 it to Tensors. For this, we will use an `input function` which adds Ops to the
 TensorFlow graph that, when executed, create mini-batches of Tensors to be used
 downstream. For more background on input functions, check
-@{$premade_estimators#create_input_functions$this section on input functions}.
+[this section on input functions](../../guide/premade_estimators.md#create_input_functions).
 In this example, we will use the `tf.train.shuffle_batch` Op which, besides
 converting numpy arrays to Tensors, allows us to specify the batch_size and
 whether to randomize the input every time the input_fn Ops are executed
diff --git a/tensorflow/docs_src/tutorials/representation/linear.md b/tensorflow/docs_src/tutorials/representation/linear.md
index 014409c617..4f0e67f08e 100644
--- a/tensorflow/docs_src/tutorials/representation/linear.md
+++ b/tensorflow/docs_src/tutorials/representation/linear.md
@@ -18,7 +18,7 @@ tutorial walks through the code in greater detail.
 
 To understand this overview it will help to have some familiarity
 with basic machine learning concepts, and also with
-@{$premade_estimators$Estimators}.
+[Estimators](../../guide/premade_estimators.md).
 
 [TOC]
 
@@ -175,7 +175,7 @@ the data itself. You provide the data through an input function.
 The input function must return a dictionary of tensors. Each key corresponds to
 the name of a `FeatureColumn`. Each key's value is a tensor containing the
 values of that feature for all data instances. See
-@{$premade_estimators#input_fn} for a
+[Premade Estimators](../../guide/premade_estimators.md#input_fn) for a
 more comprehensive look at input functions, and `input_fn` in the
 [wide and deep learning tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep)
 for an example implementation of an input function.
diff --git a/tensorflow/docs_src/tutorials/representation/word2vec.md b/tensorflow/docs_src/tutorials/representation/word2vec.md
index 7964650e19..df0d3176b6 100644
--- a/tensorflow/docs_src/tutorials/representation/word2vec.md
+++ b/tensorflow/docs_src/tutorials/representation/word2vec.md
@@ -383,13 +383,13 @@ compromised speed because we use Python for reading and feeding data items --
 each of which require very little work on the TensorFlow back-end.  If you find
 your model is seriously bottlenecked on input data, you may want to implement a
 custom data reader for your problem, as described in
-@{$new_data_formats$New Data Formats}.  For the case of Skip-Gram
+[New Data Formats](../../extend/new_data_formats.md).  For the case of Skip-Gram
 modeling, we've actually already done this for you as an example in
 [models/tutorials/embedding/word2vec.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec.py).
 
 If your model is no longer I/O bound but you want still more performance, you
 can take things further by writing your own TensorFlow Ops, as described in
-@{$adding_an_op$Adding a New Op}.  Again we've provided an
+[Adding a New Op](../../extend/adding_an_op.md).  Again we've provided an
 example of this for the Skip-Gram case
 [models/tutorials/embedding/word2vec_optimized.py](https://github.com/tensorflow/models/tree/master/tutorials/embedding/word2vec_optimized.py).
 Feel free to benchmark these against each other to measure performance
diff --git a/tensorflow/docs_src/tutorials/sequences/recurrent.md b/tensorflow/docs_src/tutorials/sequences/recurrent.md
index 10d60f7966..39ad441381 100644
--- a/tensorflow/docs_src/tutorials/sequences/recurrent.md
+++ b/tensorflow/docs_src/tutorials/sequences/recurrent.md
@@ -138,7 +138,7 @@ for current_batch_of_words in words_in_dataset:
 ### Inputs
 
 The word IDs will be embedded into a dense representation (see the
-@{$word2vec$Vector Representations Tutorial}) before feeding to
+[Vector Representations Tutorial](../../tutorials/representation/word2vec.md)) before feeding to
 the LSTM. This allows the model to efficiently represent the knowledge about
 particular words. It is also easy to write:
 
diff --git a/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md b/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md
index 37bce5b76d..657fab8a53 100644
--- a/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md
+++ b/tensorflow/docs_src/tutorials/sequences/recurrent_quickdraw.md
@@ -32,7 +32,7 @@ drawings in 345 categories.
 
 To try the code for this tutorial:
 
-1.  @{$install$Install TensorFlow} if you haven't already.
+1.  [Install TensorFlow](../../install/index.md) if you haven't already.
 1.  Download the [tutorial code]
 (https://github.com/tensorflow/models/tree/master/tutorials/rnn/quickdraw/train_model.py).
 1.  [Download the data](#download-the-data) in `TFRecord` format from
@@ -58,8 +58,7 @@ To try the code for this tutorial:
 
 We make the data that we use in this tutorial available as `TFRecord` files
 containing `TFExamples`. You can download the data from here:
-
-http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz
+<a rel="nofollow" href="http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz">http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz</a> (~1GB).
 
 Alternatively you can download the original data in `ndjson` format from the
 Google cloud and convert it to the `TFRecord` files containing `TFExamples`
@@ -108,7 +107,7 @@ This download will take a while and download a bit more than 23GB of data.
 ### Optional: Converting the data
 
 To convert the `ndjson` files to
-@{$python/python_io#TFRecords_Format_Details$TFRecord} files containing
+[TFRecord](../../api_guides/python/python_io.md#TFRecords_Format_Details) files containing
 [`tf.train.Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
 protos run the following command.
 
@@ -118,7 +117,7 @@ protos run the following command.
 ```
 
 This will store the data in 10 shards of
-@{$python/python_io#TFRecords_Format_Details$TFRecord} files with 10000 items
+[TFRecord](../../api_guides/python/python_io.md#TFRecords_Format_Details) files with 10000 items
 per class for the training data and 1000 items per class as eval data.
 
 This conversion process is described in more detail in the following.
@@ -220,7 +219,7 @@ length 2.
 ### Defining the model
 
 To define the model we create a new `Estimator`. If you want to read more about
-estimators, we recommend @{$custom_estimators$this tutorial}.
+estimators, we recommend [this tutorial](../../guide/custom_estimators.md).
 
 To build the model, we:
 
diff --git a/tensorflow/examples/adding_an_op/cuda_op_test.py b/tensorflow/examples/adding_an_op/cuda_op_test.py
index 07390bc3bf..a9aaa81e3f 100644
--- a/tensorflow/examples/adding_an_op/cuda_op_test.py
+++ b/tensorflow/examples/adding_an_op/cuda_op_test.py
@@ -26,7 +26,7 @@ class AddOneTest(tf.test.TestCase):
 
   def test(self):
     if tf.test.is_built_with_cuda():
-      with self.test_session():
+      with self.cached_session():
         result = cuda_op.add_one([5, 4, 3, 2, 1])
         self.assertAllEqual(result.eval(), [6, 5, 4, 3, 2])
 
diff --git a/tensorflow/examples/adding_an_op/fact_test.py b/tensorflow/examples/adding_an_op/fact_test.py
index f7f17e5180..11163e7ba5 100644
--- a/tensorflow/examples/adding_an_op/fact_test.py
+++ b/tensorflow/examples/adding_an_op/fact_test.py
@@ -24,7 +24,7 @@ import tensorflow as tf
 class FactTest(tf.test.TestCase):
 
   def test(self):
-    with self.test_session():
+    with self.cached_session():
       print(tf.user_ops.my_fact().eval())
 
 
diff --git a/tensorflow/examples/adding_an_op/zero_out_1_test.py b/tensorflow/examples/adding_an_op/zero_out_1_test.py
index fac486100d..342d3a020c 100644
--- a/tensorflow/examples/adding_an_op/zero_out_1_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_1_test.py
@@ -28,7 +28,7 @@ from tensorflow.examples.adding_an_op import zero_out_op_1
 class ZeroOut1Test(tf.test.TestCase):
 
   def test(self):
-    with self.test_session():
+    with self.cached_session():
       result = zero_out_op_1.zero_out([5, 4, 3, 2, 1])
       self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
 
diff --git a/tensorflow/examples/adding_an_op/zero_out_2_test.py b/tensorflow/examples/adding_an_op/zero_out_2_test.py
index 217bbbcffa..4504597817 100644
--- a/tensorflow/examples/adding_an_op/zero_out_2_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_2_test.py
@@ -29,17 +29,17 @@ from tensorflow.examples.adding_an_op import zero_out_op_2
 class ZeroOut2Test(tf.test.TestCase):
 
   def test(self):
-    with self.test_session():
+    with self.cached_session():
       result = zero_out_op_2.zero_out([5, 4, 3, 2, 1])
       self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
 
   def test_2d(self):
-    with self.test_session():
+    with self.cached_session():
       result = zero_out_op_2.zero_out([[6, 5, 4], [3, 2, 1]])
       self.assertAllEqual(result.eval(), [[6, 0, 0], [0, 0, 0]])
 
   def test_grad(self):
-    with self.test_session():
+    with self.cached_session():
       shape = (5,)
       x = tf.constant([5, 4, 3, 2, 1], dtype=tf.float32)
       y = zero_out_op_2.zero_out(x)
@@ -47,7 +47,7 @@ class ZeroOut2Test(tf.test.TestCase):
       self.assertLess(err, 1e-4)
 
   def test_grad_2d(self):
-    with self.test_session():
+    with self.cached_session():
       shape = (2, 3)
       x = tf.constant([[6, 5, 4], [3, 2, 1]], dtype=tf.float32)
       y = zero_out_op_2.zero_out(x)
diff --git a/tensorflow/examples/adding_an_op/zero_out_3_test.py b/tensorflow/examples/adding_an_op/zero_out_3_test.py
index 01280caf49..15d62495aa 100644
--- a/tensorflow/examples/adding_an_op/zero_out_3_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_3_test.py
@@ -26,23 +26,23 @@ from tensorflow.examples.adding_an_op import zero_out_op_3
 class ZeroOut3Test(tf.test.TestCase):
 
   def test(self):
-    with self.test_session():
+    with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1])
       self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
 
   def testAttr(self):
-    with self.test_session():
+    with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=3)
       self.assertAllEqual(result.eval(), [0, 0, 0, 2, 0])
 
   def testNegative(self):
-    with self.test_session():
+    with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=-1)
       with self.assertRaisesOpError("Need preserve_index >= 0, got -1"):
         result.eval()
 
   def testLarge(self):
-    with self.test_session():
+    with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=17)
       with self.assertRaisesOpError("preserve_index out of range"):
         result.eval()
diff --git a/tensorflow/examples/android/jni/object_tracking/jni_utils.h b/tensorflow/examples/android/jni/object_tracking/jni_utils.h
index b81d9e0c12..06048ecfd3 100644
--- a/tensorflow/examples/android/jni/object_tracking/jni_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/jni_utils.h
@@ -60,4 +60,4 @@ class JniLongField {
   jfieldID field_ID_;
 };
 
-#endif
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/logging.h b/tensorflow/examples/android/jni/object_tracking/logging.h
index 852a749399..24d05e3398 100644
--- a/tensorflow/examples/android/jni/object_tracking/logging.h
+++ b/tensorflow/examples/android/jni/object_tracking/logging.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
-#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOGGING_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOGGING_H_
 
 #include <android/log.h>
 #include <string.h>
@@ -118,4 +118,4 @@ void LogPrintF(const int severity, const char* format, ...);
 
 #endif
 
-#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOGGING_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/object_model.h b/tensorflow/examples/android/jni/object_tracking/object_model.h
index 5e81c49080..4bc4d5bc9e 100644
--- a/tensorflow/examples/android/jni/object_tracking/object_model.h
+++ b/tensorflow/examples/android/jni/object_tracking/object_model.h
@@ -19,8 +19,8 @@ limitations under the License.
 
 // Contains ObjectModelBase declaration.
 
-#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
-#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_MODEL_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_MODEL_H_
 
 #ifdef __RENDER_OPENGL__
 #include <GLES/gl.h>
@@ -99,4 +99,4 @@ class ObjectModel : public ObjectModelBase {
 
 }  // namespace tf_tracking
 
-#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_MODEL_H_
diff --git a/tensorflow/examples/android/jni/rgb2yuv.h b/tensorflow/examples/android/jni/rgb2yuv.h
index 13ac4148f3..ff720fda7d 100755
--- a/tensorflow/examples/android/jni/rgb2yuv.h
+++ b/tensorflow/examples/android/jni/rgb2yuv.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef ORG_TENSORFLOW_JNI_IMAGEUTILS_RGB2YUV_H_
-#define ORG_TENSORFLOW_JNI_IMAGEUTILS_RGB2YUV_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_RGB2YUV_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_RGB2YUV_H_
 
 #include <stdint.h>
 
@@ -32,4 +32,4 @@ void ConvertRGB565ToYUV420SP(const uint16_t* const input, uint8_t* const output,
 }
 #endif
 
-#endif  // ORG_TENSORFLOW_JNI_IMAGEUTILS_RGB2YUV_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_RGB2YUV_H_
diff --git a/tensorflow/examples/android/jni/yuv2rgb.h b/tensorflow/examples/android/jni/yuv2rgb.h
index 7d2b8ab7f4..fab462f0e1 100644
--- a/tensorflow/examples/android/jni/yuv2rgb.h
+++ b/tensorflow/examples/android/jni/yuv2rgb.h
@@ -16,8 +16,8 @@ limitations under the License.
 // This is a collection of routines which converts various YUV image formats
 // to (A)RGB.
 
-#ifndef ORG_TENSORFLOW_JNI_IMAGEUTILS_YUV2RGB_H_
-#define ORG_TENSORFLOW_JNI_IMAGEUTILS_YUV2RGB_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_YUV2RGB_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_YUV2RGB_H_
 
 #include <stdint.h>
 
@@ -54,4 +54,4 @@ void ConvertYUV420SPToRGB565(const uint8_t* const input, uint16_t* const output,
 }
 #endif
 
-#endif  // ORG_TENSORFLOW_JNI_IMAGEUTILS_YUV2RGB_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_YUV2RGB_H_
diff --git a/tensorflow/examples/ios/benchmark/ios_image_load.h b/tensorflow/examples/ios/benchmark/ios_image_load.h
index 78eaded8d7..3f94984692 100644
--- a/tensorflow/examples/ios/benchmark/ios_image_load.h
+++ b/tensorflow/examples/ios/benchmark/ios_image_load.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
-#define TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
+#ifndef TENSORFLOW_EXAMPLES_IOS_BENCHMARK_IOS_IMAGE_LOAD_H_
+#define TENSORFLOW_EXAMPLES_IOS_BENCHMARK_IOS_IMAGE_LOAD_H_
 
 #include <vector>
 
@@ -24,4 +24,4 @@ std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
                                                  int* out_height,
                                                  int* out_channels);
 
-#endif  // TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
+#endif  // TENSORFLOW_EXAMPLES_IOS_BENCHMARK_IOS_IMAGE_LOAD_H_
diff --git a/tensorflow/examples/ios/camera/ios_image_load.h b/tensorflow/examples/ios/camera/ios_image_load.h
index 87a847e145..f10b0b983a 100644
--- a/tensorflow/examples/ios/camera/ios_image_load.h
+++ b/tensorflow/examples/ios/camera/ios_image_load.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef TENSORFLOW_CONTRIB_IOS_EXAMPLES_CAMERA_IMAGE_LOAD_H_
-#define TENSORFLOW_CONTRIB_IOS_EXAMPLES_CAMERA_IMAGE_LOAD_H_
+#ifndef TENSORFLOW_EXAMPLES_IOS_CAMERA_IOS_IMAGE_LOAD_H_
+#define TENSORFLOW_EXAMPLES_IOS_CAMERA_IOS_IMAGE_LOAD_H_
 
 #include <vector>
 
@@ -24,4 +24,4 @@ std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
 						 int* out_height,
 						 int* out_channels);
 
-#endif  // TENSORFLOW_CONTRIB_IOS_EXAMPLES_CAMERA_IMAGE_LOAD_H_
+#endif  // TENSORFLOW_EXAMPLES_IOS_CAMERA_IOS_IMAGE_LOAD_H_
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index baa65d3243..ee2927d0a5 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -106,7 +106,7 @@ static Status ReadEntireFile(tensorflow::Env* env, const string& filename,
                                         "' expected ", file_size, " got ",
                                         data.size());
   }
-  output->scalar<string>()() = data.ToString();
+  output->scalar<string>()() = string(data);
   return Status::OK();
 }
 
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 3e0ea619e3..0aba0393af 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -3355,6 +3355,28 @@ func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
+//
+// For each entry in `x`, calculates the number of `1` (on) bits in the binary
+// representation of that entry.
+//
+// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+// `int32` or `int64` and perform the bitcount on the result, than to feed in
+// 8- or 16-bit inputs and then aggregate the resulting counts.
+func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "PopulationCount",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the mean along sparse segments of a tensor.
 //
 // Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
@@ -4037,78 +4059,6 @@ func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output,
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
-//
-// Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
-		Input: []tf.Input{
-			x, scale, offset, mean, variance,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
 // ApproximateEqualAttr is an optional argument to ApproximateEqual.
 type ApproximateEqualAttr func(optionalAttr)
 
@@ -8419,139 +8369,6 @@ func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...Or
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns immutable tensor from memory region.
-//
-// The current implementation memmaps the tensor from a file.
-//
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
-//
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
-//
-// with the given separator (default is an empty separator).
-//
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringJoin",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
 type ResourceApplyFtrlAttr func(optionalAttr)
 
@@ -8794,28 +8611,6 @@ func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...Ass
 	return scope.AddOperation(opspec)
 }
 
-// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
-//
-// For each entry in `x`, calculates the number of `1` (on) bits in the binary
-// representation of that entry.
-//
-// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
-// `int32` or `int64` and perform the bitcount on the result, than to feed in
-// 8- or 16-bit inputs and then aggregate the resulting counts.
-func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "PopulationCount",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Broadcasts a tensor value to one or more other devices.
 func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
@@ -9496,34 +9291,216 @@ func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
 //
-// N is the size of the segment being reduced.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-// segments.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncatedNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
+	}
+}
+
+// SkipgramMinCount sets the optional min_count attribute to value.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "Skipgram",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
+//
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
+//
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
 // This Op does not require `a_indices` be sorted in standard lexicographic order.
@@ -9824,6 +9801,139 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns immutable tensor from memory region.
+//
+// The current implementation memmaps the tensor from a file.
+//
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	opspec := tf.OpSpec{
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
+
+// StringJoinSeparator sets the optional separator attribute to value.
+//
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins the strings in the given list of string tensors into one tensor;
+//
+// with the given separator (default is an empty separator).
+//
+// Arguments:
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringJoin",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StringSplitV2Attr is an optional argument to StringSplitV2.
 type StringSplitV2Attr func(optionalAttr)
 
@@ -9997,6 +10107,24 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 	return op.Output(0)
 }
 
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Concatenates quantized tensors along one dimension.
 //
 // Arguments:
@@ -11227,6 +11355,85 @@ func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max
 	return op.Output(0)
 }
 
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
+
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse `updates` to individual values or slices within a given
+//
+// variable according to `indices`.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See @{tf.scatter_nd} for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdUpdate",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Applies softmax to a batched N-D `SparseTensor`.
 //
 // The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
@@ -12171,34 +12378,6 @@ func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.
 	return values
 }
 
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
 type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
@@ -12777,85 +12956,6 @@ func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataT
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
-
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
-//
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Applies sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
-//
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
-//
-// ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
-//
-// The resulting update to ref would look like this:
-//
-//     [1, 11, 3, 10, 9, 6, 7, 12]
-//
-// See @{tf.scatter_nd} for more details about how to make updates to
-// slices.
-//
-// Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
-//
-// Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
-		Input: []tf.Input{
-			ref, indices, updates,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // SqueezeAttr is an optional argument to Squeeze.
 type SqueezeAttr func(optionalAttr)
 
@@ -16074,6 +16174,78 @@ func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
+
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNorm",
+		Input: []tf.Input{
+			x, scale, offset, mean, variance,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
 // RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
 type RandomStandardNormalAttr func(optionalAttr)
 
@@ -16882,216 +17054,6 @@ func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
-
-// SkipgramMinCount sets the optional min_count attribute to value.
-//
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
-	}
-}
-
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
-	}
-}
-
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
-//
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
-//
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
-}
-
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
-//
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Converts each string in the input Tensor to the specified numeric type.
-//
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringToNumber",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
-		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
 type MutableDenseHashTableV2Attr func(optionalAttr)
 
@@ -17191,6 +17153,34 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.D
 	return op.Output(0)
 }
 
+// Inverse fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // 2D fast Fourier transform.
 //
 // Computes the 2-dimensional discrete Fourier transform over the inner-most
@@ -17699,123 +17689,6 @@ func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
-// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
-type CudnnRNNParamsSizeAttr func(optionalAttr)
-
-// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Computes size of weights that can be used by a Cudnn RNN model.
-//
-// Return the params size that can be used by the Cudnn RNN model. Subsequent
-// weight allocation and initialization should use this size.
-//
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   The actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//   dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-// params_size: The size of the params buffer that should be allocated and
-//   initialized for this RNN model. Note that this params buffer may not be
-//   compatible across GPUs. Please use CudnnRNNParamsWeights and
-//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
-//   across different runs.
-func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"T": T, "S": S}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsSize",
-		Input: []tf.Input{
-			num_layers, num_units, input_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns the set of files matching one or more glob patterns.
 //
 // Note that this routine only supports wildcard characters in the
@@ -20548,6 +20421,151 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 	return op.Output(0)
 }
 
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
+
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
+//
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "S": S}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNParamsSize",
+		Input: []tf.Input{
+			num_layers, num_units, input_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes gradients for SparseSegmentMean.
+//
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtN",
+		Input: []tf.Input{
+			data, indices, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
 // The upper regularized incomplete Gamma function is defined as:
@@ -23378,6 +23396,8 @@ func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, it
 
 // Computes the matrix exponential of one or more square matrices:
 //
+// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
+//
 // \\(exp(A) = \sum_{n=0}^\infty A^n/n!\\)
 //
 // The exponential is computed using a combination of the scaling and squaring
@@ -31898,21 +31918,3 @@ func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Outpu
 	}
 	return scope.AddOperation(opspec)
 }
-
-// Elementwise computes the bitwise AND of `x` and `y`.
-//
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 035077e1e0..e1bf2c7dba 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -32,8 +32,8 @@
     <module>libtensorflow_jni_gpu</module>
     <module>tensorflow</module>
     <module>proto</module>
-    <module>hadoop</module>
-    <module>spark-connector</module>
+    <module>tensorflow-hadoop</module>
+    <module>spark-tensorflow-connector</module>
   </modules>
 
   <!-- Two profiles are used:
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index 8c4c9d498c..75c6cff529 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -41,7 +41,7 @@ clean() {
   mvn -q clean
   rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow_jni_gpu/src libtensorflow_jni_gpu/target \
     libtensorflow/src libtensorflow/target tensorflow-android/target proto/src proto/target \
-    hadoop/src hadoop/target spark-connector/src spark-connector/target
+    tensorflow-hadoop/src tensorflow-hadoop/target spark-tensorflow-connector/src spark-tensorflow-connector/target
 }
 
 update_version_in_pom() {
@@ -170,8 +170,8 @@ generate_java_protos() {
 # is updated for each module.
 download_tf_ecosystem() {
   ECOSYSTEM_DIR="/tmp/tensorflow-ecosystem"
-  HADOOP_DIR="${DIR}/hadoop"
-  SPARK_DIR="${DIR}/spark-connector"
+  HADOOP_DIR="${DIR}/tensorflow-hadoop"
+  SPARK_DIR="${DIR}/spark-tensorflow-connector"
 
   # Clean any previous attempts
   rm -rf "${ECOSYSTEM_DIR}"
diff --git a/tensorflow/java/maven/spark-connector/pom.xml b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
index 31e39c588a..1b7995be2c 100644
--- a/tensorflow/java/maven/spark-connector/pom.xml
+++ b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
@@ -4,7 +4,7 @@
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.tensorflow</groupId>
-    <artifactId>spark-connector_2.11</artifactId>
+    <artifactId>spark-tensorflow-connector_2.11</artifactId>
     <packaging>jar</packaging>
     <version>1.10.0</version>
     <name>spark-tensorflow-connector</name>
@@ -120,7 +120,7 @@
                                 <artifactSet>
                                     <includes>
                                         <include>com.google.protobuf:protobuf-java</include>
-                                        <include>org.tensorflow:hadoop</include>
+                                        <include>org.tensorflow:tensorflow-hadoop</include>
                                         <include>org.tensorflow:proto</include>
                                     </includes>
                                 </artifactSet>
@@ -305,7 +305,7 @@
     <dependencies>
         <dependency>
             <groupId>org.tensorflow</groupId>
-            <artifactId>hadoop</artifactId>
+            <artifactId>tensorflow-hadoop</artifactId>
             <version>${project.version}</version>
         </dependency>
         <dependency>
diff --git a/tensorflow/java/maven/hadoop/pom.xml b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
index e0409fa41b..0fe6f4dce4 100644
--- a/tensorflow/java/maven/hadoop/pom.xml
+++ b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
@@ -3,7 +3,7 @@
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.tensorflow</groupId>
-    <artifactId>hadoop</artifactId>
+    <artifactId>tensorflow-hadoop</artifactId>
     <packaging>jar</packaging>
     <version>1.10.0</version>
     <name>tensorflow-hadoop</name>
@@ -15,7 +15,7 @@
         <maven.compiler.source>1.6</maven.compiler.source>
         <maven.compiler.target>1.6</maven.compiler.target>
         <hadoop.version>2.6.0</hadoop.version>
-        <protobuf.version>3.3.1</protobuf.version>
+        <protobuf.version>3.5.1</protobuf.version>
         <junit.version>4.11</junit.version>
     </properties>
 
diff --git a/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java b/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java
index 0c751aed9f..824f7fbe32 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/types/UInt8.java
@@ -16,6 +16,33 @@ limitations under the License.
 package org.tensorflow.types;
 
 /** Represents an 8-bit unsigned integer. */
-public class UInt8 {
+public class UInt8 extends Number {
+
+  private static final long serialVersionUID = 1L;
+  
+  // This class is only used for generic parameterization and is not instantiable. Thus,
+  // it is safe to implement the Number abstract methods with all zeros, as they will
+  // never be invoked.
+
+  @Override
+  public double doubleValue() {
+    return 0.0;
+  }
+
+  @Override
+  public float floatValue() {
+    return 0.0f;
+  }
+
+  @Override
+  public int intValue() {
+    return 0;
+  }
+
+  @Override
+  public long longValue() {
+    return 0L;
+  }
+
   private UInt8() {}
 }
diff --git a/tensorflow/java/src/main/native/exception_jni.h b/tensorflow/java/src/main/native/exception_jni.h
index 28f26d7ebf..465281f804 100644
--- a/tensorflow/java/src/main/native/exception_jni.h
+++ b/tensorflow/java/src/main/native/exception_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_EXCEPTION_JNI_H_
-#define TENSORFLOW_JAVA_EXCEPTION_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
 
 #include <jni.h>
 
@@ -39,4 +39,4 @@ bool throwExceptionIfNotOK(JNIEnv* env, const TF_Status* status);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_JAVA_EXCEPTION_JNI_H_
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
diff --git a/tensorflow/java/src/main/native/graph_jni.h b/tensorflow/java/src/main/native/graph_jni.h
index 215695cdfd..efed23f83b 100644
--- a/tensorflow/java/src/main/native/graph_jni.h
+++ b/tensorflow/java/src/main/native/graph_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_GRAPH_JNI_H_
-#define TENSORFLOW_JAVA_GRAPH_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_GRAPH_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_GRAPH_JNI_H_
 
 #include <jni.h>
 
@@ -85,4 +85,4 @@ JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Graph_addGradients(
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_JAVA_GRAPH_JNI_H_
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_GRAPH_JNI_H_
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.h b/tensorflow/java/src/main/native/operation_builder_jni.h
index cf0abe4829..1cda7acea8 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.h
+++ b/tensorflow/java/src/main/native/operation_builder_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_OPERATION_BUILDER_JNI_H_
-#define TENSORFLOW_JAVA_OPERATION_BUILDER_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_OPERATION_BUILDER_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_OPERATION_BUILDER_JNI_H_
 
 #include <jni.h>
 
@@ -188,4 +188,4 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrStringList(
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_JAVA_OPERATION_BUILDER_JNI_H_
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_OPERATION_BUILDER_JNI_H_
diff --git a/tensorflow/java/src/main/native/operation_jni.h b/tensorflow/java/src/main/native/operation_jni.h
index 6f379256d2..56da2ebaee 100644
--- a/tensorflow/java/src/main/native/operation_jni.h
+++ b/tensorflow/java/src/main/native/operation_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_OPERATION_JNI_H_
-#define TENSORFLOW_JAVA_OPERATION_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_OPERATION_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_OPERATION_JNI_H_
 
 #include <jni.h>
 
@@ -87,4 +87,4 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_inputListLength(JNIEnv *,
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_JAVA_OPERATION_JNI_H_
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_OPERATION_JNI_H_
diff --git a/tensorflow/java/src/main/native/saved_model_bundle_jni.h b/tensorflow/java/src/main/native/saved_model_bundle_jni.h
index a4b05d0409..e8f28dd670 100644
--- a/tensorflow/java/src/main/native/saved_model_bundle_jni.h
+++ b/tensorflow/java/src/main/native/saved_model_bundle_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_SAVEDMODELBUNDLE_JNI_H_
-#define TENSORFLOW_JAVA_SAVEDMODELBUNDLE_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SAVED_MODEL_BUNDLE_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SAVED_MODEL_BUNDLE_JNI_H_
 
 #include <jni.h>
 
@@ -34,4 +34,4 @@ JNIEXPORT jobject JNICALL Java_org_tensorflow_SavedModelBundle_load(
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_JAVA_SAVEDMODELBUNDLE_JNI_H_
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SAVED_MODEL_BUNDLE_JNI_H_
diff --git a/tensorflow/java/src/main/native/session_jni.h b/tensorflow/java/src/main/native/session_jni.h
index 54c9c0aa4d..1cc196bdc8 100644
--- a/tensorflow/java/src/main/native/session_jni.h
+++ b/tensorflow/java/src/main/native/session_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_SESSION_JNI_H_
-#define TENSORFLOW_JAVA_SESSION_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SESSION_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SESSION_JNI_H_
 
 #include <jni.h>
 
@@ -59,4 +59,4 @@ JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_run(
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_JAVA_SESSION_JNI_H_
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SESSION_JNI_H_
diff --git a/tensorflow/java/src/main/native/tensor_jni.h b/tensorflow/java/src/main/native/tensor_jni.h
index a300936884..4cf682548e 100644
--- a/tensorflow/java/src/main/native/tensor_jni.h
+++ b/tensorflow/java/src/main/native/tensor_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_TENSOR_JNI_H_
-#define TENSORFLOW_JAVA_TENSOR_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
 
 #include <jni.h>
 
@@ -153,4 +153,4 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_readNDArray(JNIEnv *, jclass,
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_JAVA_TENSOR_JNI_H_
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
diff --git a/tensorflow/java/src/main/native/tensorflow_jni.h b/tensorflow/java/src/main/native/tensorflow_jni.h
index c0c9322020..d7c44fb0e2 100644
--- a/tensorflow/java/src/main/native/tensorflow_jni.h
+++ b/tensorflow/java/src/main/native/tensorflow_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_TENSORFLOW_JNI_H_
-#define TENSORFLOW_JAVA_TENSORFLOW_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_JNI_H_
 
 #include <jni.h>
 
@@ -67,4 +67,4 @@ Java_org_tensorflow_TensorFlow_libraryOpList(JNIEnv *, jclass, jlong);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_JAVA_TENSORFLOW_JNI_H_
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_JNI_H_
diff --git a/tensorflow/java/src/main/native/utils_jni.h b/tensorflow/java/src/main/native/utils_jni.h
index 352298e7de..d1e1b93878 100644
--- a/tensorflow/java/src/main/native/utils_jni.h
+++ b/tensorflow/java/src/main/native/utils_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_JAVA_UTILS_JNI_H_
-#define TENSORFLOW_JAVA_UTILS_JNI_H_
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_UTILS_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_UTILS_JNI_H_
 
 #include <jni.h>
 
@@ -30,4 +30,4 @@ void resolveOutputs(JNIEnv* env, const char* type, jlongArray src_op,
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif /* TENSORFLOW_JAVA_UTILS_JNI_H_ */
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_UTILS_JNI_H_
diff --git a/tensorflow/js/BUILD b/tensorflow/js/BUILD
new file mode 100644
index 0000000000..ad0dc44f54
--- /dev/null
+++ b/tensorflow/js/BUILD
@@ -0,0 +1,52 @@
+# Description:
+# JavaScript/TypeScript code generation for TensorFlow.js
+
+visibility = [
+    "//tensorflow:internal",
+]
+
+package(default_visibility = visibility)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "ts_op_gen",
+    srcs = [
+        "ops/ts_op_gen.cc",
+    ],
+    hdrs = [
+        "ops/ts_op_gen.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:op_gen_lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "ts_op_gen_test",
+    srcs = [
+        "ops/ts_op_gen.cc",
+        "ops/ts_op_gen.h",
+        "ops/ts_op_gen_test.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:op_gen_lib",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/js/ops/ts_op_gen.cc b/tensorflow/js/ops/ts_op_gen.cc
new file mode 100644
index 0000000000..fb93bb6d8e
--- /dev/null
+++ b/tensorflow/js/ops/ts_op_gen.cc
@@ -0,0 +1,290 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/js/ops/ts_op_gen.h"
+#include <unordered_map>
+
+#include "tensorflow/core/framework/api_def.pb.h"
+#include "tensorflow/core/framework/op_def_util.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+namespace {
+
+static bool IsListAttr(const OpDef_ArgDef& arg) {
+  return !arg.type_list_attr().empty() || !arg.number_attr().empty();
+}
+
+// Struct to hold a combo OpDef and ArgDef for a given Op argument:
+struct ArgDefs {
+  ArgDefs(const OpDef::ArgDef& op_def_arg, const ApiDef::Arg& api_def_arg)
+      : op_def_arg(op_def_arg), api_def_arg(api_def_arg) {}
+
+  const OpDef::ArgDef& op_def_arg;
+  const ApiDef::Arg& api_def_arg;
+};
+
+// Struct to hold a combo OpDef::AttrDef and ApiDef::Attr for an Op.
+struct OpAttrs {
+  OpAttrs(const OpDef::AttrDef& op_def_attr, const ApiDef::Attr& api_def_attr)
+      : op_def_attr(op_def_attr), api_def_attr(api_def_attr) {}
+
+  const OpDef::AttrDef& op_def_attr;
+  const ApiDef::Attr& api_def_attr;
+};
+
+// Helper class to generate TypeScript code for a given OpDef:
+class GenTypeScriptOp {
+ public:
+  GenTypeScriptOp(const OpDef& op_def, const ApiDef& api_def);
+  ~GenTypeScriptOp();
+
+  // Returns the generated code as a string:
+  string Code();
+
+ private:
+  void ProcessArgs();
+  void ProcessAttrs();
+  void AddAttrForArg(const string& attr, int arg_index);
+  string InputForAttr(const OpDef::AttrDef& op_def_attr);
+
+  void AddMethodSignature();
+  void AddOpAttrs();
+  void AddMethodReturnAndClose();
+
+  const OpDef& op_def_;
+  const ApiDef& api_def_;
+
+  // Placeholder string for all generated code:
+  string result_;
+
+  // Holds in-order vector of Op inputs:
+  std::vector<ArgDefs> input_op_args_;
+
+  // Holds in-order vector of Op attributes:
+  std::vector<OpAttrs> op_attrs_;
+
+  // Stores attributes-to-arguments by name:
+  typedef std::unordered_map<string, std::vector<int>> AttrArgIdxMap;
+  AttrArgIdxMap attr_arg_idx_map_;
+
+  // Holds number of outputs:
+  int num_outputs_;
+};
+
+GenTypeScriptOp::GenTypeScriptOp(const OpDef& op_def, const ApiDef& api_def)
+    : op_def_(op_def), api_def_(api_def), num_outputs_(0) {}
+
+GenTypeScriptOp::~GenTypeScriptOp() {}
+
+string GenTypeScriptOp::Code() {
+  ProcessArgs();
+  ProcessAttrs();
+
+  // Generate exported function for Op:
+  AddMethodSignature();
+  AddOpAttrs();
+  AddMethodReturnAndClose();
+
+  strings::StrAppend(&result_, "\n");
+  return result_;
+}
+
+void GenTypeScriptOp::ProcessArgs() {
+  for (int i = 0; i < api_def_.arg_order_size(); i++) {
+    auto op_def_arg = FindInputArg(api_def_.arg_order(i), op_def_);
+    if (op_def_arg == nullptr) {
+      LOG(WARNING) << "Could not find OpDef::ArgDef for "
+                   << api_def_.arg_order(i);
+      continue;
+    }
+    auto api_def_arg = FindInputArg(api_def_.arg_order(i), api_def_);
+    if (api_def_arg == nullptr) {
+      LOG(WARNING) << "Could not find ApiDef::Arg for "
+                   << api_def_.arg_order(i);
+      continue;
+    }
+
+    // Map attr names to arg indexes:
+    if (!op_def_arg->type_attr().empty()) {
+      AddAttrForArg(op_def_arg->type_attr(), i);
+    } else if (!op_def_arg->type_list_attr().empty()) {
+      AddAttrForArg(op_def_arg->type_list_attr(), i);
+    }
+    if (!op_def_arg->number_attr().empty()) {
+      AddAttrForArg(op_def_arg->number_attr(), i);
+    }
+
+    input_op_args_.push_back(ArgDefs(*op_def_arg, *api_def_arg));
+  }
+
+  num_outputs_ = api_def_.out_arg_size();
+}
+
+void GenTypeScriptOp::ProcessAttrs() {
+  for (int i = 0; i < op_def_.attr_size(); i++) {
+    op_attrs_.push_back(OpAttrs(op_def_.attr(i), api_def_.attr(i)));
+  }
+}
+
+void GenTypeScriptOp::AddAttrForArg(const string& attr, int arg_index) {
+  // Keep track of attributes-to-arguments by name. These will be used for
+  // construction Op attributes that require information about the inputs.
+  auto iter = attr_arg_idx_map_.find(attr);
+  if (iter == attr_arg_idx_map_.end()) {
+    attr_arg_idx_map_.insert(AttrArgIdxMap::value_type(attr, {arg_index}));
+  } else {
+    iter->second.push_back(arg_index);
+  }
+}
+
+string GenTypeScriptOp::InputForAttr(const OpDef::AttrDef& op_def_attr) {
+  string inputs;
+  auto arg_list = attr_arg_idx_map_.find(op_def_attr.name());
+  if (arg_list != attr_arg_idx_map_.end()) {
+    for (auto iter = arg_list->second.begin(); iter != arg_list->second.end();
+         ++iter) {
+      strings::StrAppend(&inputs, input_op_args_[*iter].op_def_arg.name());
+    }
+  }
+  return inputs;
+}
+
+void GenTypeScriptOp::AddMethodSignature() {
+  strings::StrAppend(&result_, "export function ", api_def_.endpoint(0).name(),
+                     "(");
+
+  bool is_first = true;
+  for (auto& in_arg : input_op_args_) {
+    if (is_first) {
+      is_first = false;
+    } else {
+      strings::StrAppend(&result_, ", ");
+    }
+
+    auto op_def_arg = in_arg.op_def_arg;
+
+    strings::StrAppend(&result_, op_def_arg.name(), ": ");
+    if (IsListAttr(op_def_arg)) {
+      strings::StrAppend(&result_, "tfc.Tensor[]");
+    } else {
+      strings::StrAppend(&result_, "tfc.Tensor");
+    }
+  }
+
+  if (num_outputs_ == 1) {
+    strings::StrAppend(&result_, "): tfc.Tensor {\n");
+  } else {
+    strings::StrAppend(&result_, "): tfc.Tensor[] {\n");
+  }
+}
+
+void GenTypeScriptOp::AddOpAttrs() {
+  strings::StrAppend(&result_, "  const opAttrs = [\n");
+
+  bool is_first = true;
+  for (auto& attr : op_attrs_) {
+    if (is_first) {
+      is_first = false;
+    } else {
+      strings::StrAppend(&result_, ",\n");
+    }
+
+    // Append 4 spaces to start:
+    strings::StrAppend(&result_, "    ");
+
+    if (attr.op_def_attr.type() == "type") {
+      // Type OpAttributes can be generated from a helper function:
+      strings::StrAppend(&result_, "createTensorsTypeOpAttr('",
+                         attr.op_def_attr.name(), "', ",
+                         InputForAttr(attr.op_def_attr), ")");
+    } else if (attr.op_def_attr.type() == "int") {
+      strings::StrAppend(&result_, "{name: '", attr.op_def_attr.name(), "', ");
+      strings::StrAppend(&result_, "type: nodeBackend().binding.TF_ATTR_INT, ");
+      strings::StrAppend(&result_, "value: ", InputForAttr(attr.op_def_attr),
+                         ".length}");
+    }
+  }
+  strings::StrAppend(&result_, "\n  ];\n");
+}
+
+void GenTypeScriptOp::AddMethodReturnAndClose() {
+  strings::StrAppend(&result_, "  return null;\n}\n");
+}
+
+void WriteTSOp(const OpDef& op_def, const ApiDef& api_def, WritableFile* ts) {
+  GenTypeScriptOp ts_op(op_def, api_def);
+  TF_CHECK_OK(ts->Append(GenTypeScriptOp(op_def, api_def).Code()));
+}
+
+void StartFile(WritableFile* ts_file) {
+  const string header =
+      R"header(/**
+ * @license
+ * Copyright 2018 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+// This file is MACHINE GENERATED! Do not edit
+
+import * as tfc from '@tensorflow/tfjs-core';
+import {createTensorsTypeOpAttr, nodeBackend} from './op_utils';
+
+)header";
+
+  TF_CHECK_OK(ts_file->Append(header));
+}
+
+}  // namespace
+
+void WriteTSOps(const OpList& ops, const ApiDefMap& api_def_map,
+                const string& ts_filename) {
+  Env* env = Env::Default();
+
+  std::unique_ptr<WritableFile> ts_file = nullptr;
+  TF_CHECK_OK(env->NewWritableFile(ts_filename, &ts_file));
+
+  StartFile(ts_file.get());
+
+  for (const auto& op_def : ops.op()) {
+    // Skip deprecated ops
+    if (op_def.has_deprecation() &&
+        op_def.deprecation().version() <= TF_GRAPH_DEF_VERSION) {
+      continue;
+    }
+
+    const auto* api_def = api_def_map.GetApiDef(op_def.name());
+    if (api_def->visibility() == ApiDef::VISIBLE) {
+      WriteTSOp(op_def, *api_def, ts_file.get());
+    }
+  }
+
+  TF_CHECK_OK(ts_file->Close());
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/ops/parallel_check_op.cc b/tensorflow/js/ops/ts_op_gen.h
index db5c195578..fcd46a17a7 100644
--- a/tensorflow/compiler/jit/ops/parallel_check_op.cc
+++ b/tensorflow/js/ops/ts_op_gen.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,18 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/op.h"
+#ifndef TENSORFLOW_JS_OPS_TS_OP_GEN_H_
+#define TENSORFLOW_JS_OPS_TS_OP_GEN_H_
+
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-REGISTER_OP("ParallelCheck")
-    .Attr("T: list(type) >= 0")
-    .Input("expected: T")
-    .Input("actual: T")
-    .Output("result: T")
-    .Doc(R"doc(
-Op that compares two sets of inputs for near-identity, and propagates the first.
-Inequality is logged to ERROR log.
-)doc");
+// Generated code is written to the file ts_filename:
+void WriteTSOps(const OpList& ops, const ApiDefMap& api_def_map,
+                const string& ts_filename);
 
 }  // namespace tensorflow
+
+#endif  // TENSORFLOW_JS_OPS_TS_OP_GEN_H_
diff --git a/tensorflow/js/ops/ts_op_gen_test.cc b/tensorflow/js/ops/ts_op_gen_test.cc
new file mode 100644
index 0000000000..03241689b5
--- /dev/null
+++ b/tensorflow/js/ops/ts_op_gen_test.cc
@@ -0,0 +1,246 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/js/ops/ts_op_gen.h"
+
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+void ExpectContainsStr(StringPiece s, StringPiece expected) {
+  EXPECT_TRUE(str_util::StrContains(s, expected))
+      << "'" << s << "' does not contain '" << expected << "'";
+}
+
+void ExpectDoesNotContainStr(StringPiece s, StringPiece expected) {
+  EXPECT_FALSE(str_util::StrContains(s, expected))
+      << "'" << s << "' does not contain '" << expected << "'";
+}
+
+constexpr char kBaseOpDef[] = R"(
+op {
+  name: "Foo"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+    number_attr: "N"
+    description: "Images to process."
+  }
+  input_arg {
+    name: "dim"
+    description: "Description for dim."
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    description: "Description for output."
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    description: "Type for images"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+      }
+    }
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Summary for op Foo."
+  description: "Description for op Foo."
+}
+)";
+
+// Generate TypeScript code
+void GenerateTsOpFileText(const string& op_def_str, const string& api_def_str,
+                          string* ts_file_text) {
+  Env* env = Env::Default();
+  OpList op_defs;
+  protobuf::TextFormat::ParseFromString(
+      op_def_str.empty() ? kBaseOpDef : op_def_str, &op_defs);
+  ApiDefMap api_def_map(op_defs);
+
+  if (!api_def_str.empty()) {
+    TF_ASSERT_OK(api_def_map.LoadApiDef(api_def_str));
+  }
+
+  const string& tmpdir = testing::TmpDir();
+  const auto ts_file_path = io::JoinPath(tmpdir, "test.ts");
+
+  WriteTSOps(op_defs, api_def_map, ts_file_path);
+  TF_ASSERT_OK(ReadFileToString(env, ts_file_path, ts_file_text));
+}
+
+TEST(TsOpGenTest, TestImports) {
+  string ts_file_text;
+  GenerateTsOpFileText("", "", &ts_file_text);
+
+  const string expected = R"(
+import * as tfc from '@tensorflow/tfjs-core';
+import {createTensorsTypeOpAttr, nodeBackend} from './op_utils';
+)";
+  ExpectContainsStr(ts_file_text, expected);
+}
+
+TEST(TsOpGenTest, InputSingleAndList) {
+  const string api_def = R"(
+op {
+  name: "Foo"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+    number_attr: "N"
+  }
+}
+)";
+
+  string ts_file_text;
+  GenerateTsOpFileText("", api_def, &ts_file_text);
+
+  const string expected = R"(
+export function Foo(images: tfc.Tensor[], dim: tfc.Tensor): tfc.Tensor {
+)";
+  ExpectContainsStr(ts_file_text, expected);
+}
+
+TEST(TsOpGenTest, TestVisibility) {
+  const string api_def = R"(
+op {
+  graph_op_name: "Foo"
+  visibility: HIDDEN
+}
+)";
+
+  string ts_file_text;
+  GenerateTsOpFileText("", api_def, &ts_file_text);
+
+  const string expected = R"(
+export function Foo(images: tfc.Tensor[], dim: tfc.Tensor): tfc.Tensor {
+)";
+  ExpectDoesNotContainStr(ts_file_text, expected);
+}
+
+TEST(TsOpGenTest, SkipDeprecated) {
+  const string op_def = R"(
+op {
+  name: "DeprecatedFoo"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    description: "Description for input."
+  }
+  output_arg {
+    name: "output"
+    description: "Description for output."
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    description: "Type for input"
+    allowed_values {
+      list {
+        type: DT_FLOAT 
+      }
+    }
+  }
+  deprecation {
+    explanation: "Deprecated."
+  }
+}
+)";
+
+  string ts_file_text;
+  GenerateTsOpFileText(op_def, "", &ts_file_text);
+
+  ExpectDoesNotContainStr(ts_file_text, "DeprecatedFoo");
+}
+
+TEST(TsOpGenTest, MultiOutput) {
+  const string op_def = R"(
+op {
+  name: "MultiOutputFoo"
+  input_arg {
+    name: "input"
+    description: "Description for input."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output1"
+    description: "Description for output 1."
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output2"
+    description: "Description for output 2."
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    description: "Type for input"
+    allowed_values {
+      list {
+        type: DT_FLOAT 
+      }
+    }
+  }
+  summary: "Summary for op MultiOutputFoo."
+  description: "Description for op MultiOutputFoo."
+}
+)";
+
+  string ts_file_text;
+  GenerateTsOpFileText(op_def, "", &ts_file_text);
+
+  const string expected = R"(
+export function MultiOutputFoo(input: tfc.Tensor): tfc.Tensor[] {
+)";
+  ExpectContainsStr(ts_file_text, expected);
+}
+
+TEST(TsOpGenTest, OpAttrs) {
+  string ts_file_text;
+  GenerateTsOpFileText("", "", &ts_file_text);
+
+  const string expectedFooAttrs = R"(
+  const opAttrs = [
+    createTensorsTypeOpAttr('T', images),
+    {name: 'N', type: nodeBackend().binding.TF_ATTR_INT, value: images.length}
+  ];
+)";
+
+  ExpectContainsStr(ts_file_text, expectedFooAttrs);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 2a71eaf030..37af3d350e 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -44,6 +44,10 @@ load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_mpi_deps")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_gdr_deps")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+load(
+    "//third_party/ngraph:build_defs.bzl",
+    "if_ngraph",
+)
 
 py_library(
     name = "python",
@@ -130,6 +134,7 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/compat",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute:estimator_training",
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras",
         "//tensorflow/python/ops/distributions",
@@ -138,6 +143,8 @@ py_library(
         "//tensorflow/python/ops/parallel_for",
         "//tensorflow/python/profiler",
         "//tensorflow/python/saved_model",
+        "//tensorflow/python/tools:component_api_helper",
+        "//tensorflow/python/tools/api/generator:create_python_api",
         "//third_party/py/numpy",
     ],
 )
@@ -717,7 +724,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
-        ":cond_v2_impl",
         ":dtypes",
         ":framework_ops",
         ":graph_to_function_def",
@@ -2608,6 +2614,19 @@ py_library(
     ],
 )
 
+py_test(
+    name = "sparse_ops_test",
+    srcs = ["ops/sparse_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":constant_op",
+        ":dtypes",
+        ":framework_test_lib",
+        ":sparse_ops",
+        ":sparse_tensor",
+    ],
+)
+
 py_library(
     name = "spectral_grad",
     srcs = ["ops/spectral_grad.py"],
@@ -2779,11 +2798,13 @@ py_library(
     srcs = ["ops/state_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":array_ops",
         ":framework_ops",
+        ":math_ops_gen",
         ":resource_variable_ops_gen",
         ":state_ops_gen",
         ":tensor_shape",
-        "//tensorflow/python/eager:context",
+        ":util",
     ],
 )
 
@@ -3226,7 +3247,6 @@ py_library(
     ),
     srcs_version = "PY2AND3",
     deps = [
-        "saver",
         ":array_ops",
         ":array_ops_gen",
         ":checkpoint_management",
@@ -3250,6 +3270,7 @@ py_library(
         ":random_ops",
         ":resource_variable_ops",
         ":resources",
+        ":saver",
         ":sdca_ops",
         ":session",
         ":sparse_ops",
@@ -3265,6 +3286,7 @@ py_library(
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:distribute_coordinator_context",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         # `layers` dependency only exists due to the use of a small utility.
@@ -3340,7 +3362,10 @@ py_library(
 
 py_library(
     name = "distribute",
-    srcs = ["training/distribute.py"],
+    srcs = [
+        "training/distribute.py",
+        "training/distribution_strategy_context.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
@@ -3809,7 +3834,9 @@ tf_py_wrap_cc(
          tf_additional_plugin_deps() +
          tf_additional_verbs_deps() +
          tf_additional_mpi_deps() +
-         tf_additional_gdr_deps()),
+         tf_additional_gdr_deps()) + if_ngraph([
+        "@ngraph_tf//:ngraph_tf",
+    ]),
 )
 
 # ** Targets for Windows build (start) **
@@ -4205,7 +4232,6 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -4499,7 +4525,6 @@ py_test(
     srcs = ["training/saver_large_partitioned_variable_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_windows",
         "noasan",  # http://b/30782289
         "notsan",  # http://b/30782289
     ],
@@ -4657,7 +4682,10 @@ py_test(
     size = "medium",
     srcs = ["training/monitored_session_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67945581
+    tags = [
+        "no_pip",
+        "notsan",  # b/67945581
+    ],
     deps = [
         ":array_ops",
         ":checkpoint_management",
@@ -4675,6 +4703,7 @@ py_test(
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/distribute:distribute_coordinator",
     ],
 )
 
diff --git a/tensorflow/python/client/client_lib.py b/tensorflow/python/client/client_lib.py
index c94767a03c..80a256bf7a 100644
--- a/tensorflow/python/client/client_lib.py
+++ b/tensorflow/python/client/client_lib.py
@@ -15,7 +15,7 @@
 
 """Support for launching graphs and executing operations.
 
-See the @{$python/client} guide.
+See the [Client](https://tensorflow.org/api_guides/python/client) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 28f26ad27e..1841dd998b 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -1500,7 +1500,7 @@ class Session(BaseSession):
     Args:
       target: (Optional.) The execution engine to connect to.
         Defaults to using an in-process engine. See
-        @{$distributed$Distributed TensorFlow}
+        [Distributed TensorFlow](https://tensorflow.org/deploy/distributed)
         for more examples.
       graph: (Optional.) The `Graph` to be launched (described above).
       config: (Optional.) A
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index c2cfbf5f66..dd3b957da9 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -14,8 +14,8 @@
 # ==============================================================================
 """Utilities for API compatibility between TensorFlow release versions.
 
-See
-@{$guide/version_compat#backward_and_partial_forward_compatibility}
+See [Version
+Compatibility](https://tensorflow.org/guide/version_compat#backward_forward)
 """
 
 from __future__ import absolute_import
@@ -26,14 +26,15 @@ import datetime
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 13)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 8, 26)
 
 
 @tf_export("compat.forward_compatible")
 def forward_compatible(year, month, day):
   """Return true if the forward compatibility window has expired.
 
-  See @{$guide/version_compat#backward_and_partial_forward_compatibility}.
+  See [Version
+  compatibility](https://tensorflow.org/guide/version_compat#backward_forward).
 
   Forward-compatibility refers to scenarios where the producer of a TensorFlow
   model (a GraphDef or SavedModel) is compiled against a version of the
@@ -91,7 +92,8 @@ def forward_compatible(year, month, day):
 def forward_compatibility_horizon(year, month, day):
   """Context manager for testing forward compatibility of generated graphs.
 
-  See @{$guide/version_compat#backward_and_partial_forward_compatibility}.
+  See [Version
+  compatibility](https://tensorflow.org/guide/version_compat#backward_forward).
 
   To ensure forward compatibility of generated graphs (see `forward_compatible`)
   with older binaries, new features can be gated with:
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index 3b9bf2469e..f8b561205e 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """`tf.data.Dataset` API for input pipelines.
 
-See @{$guide/datasets$Importing Data} for an overview.
+See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
index e16aa82d4d..159218c99b 100644
--- a/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
@@ -110,8 +110,24 @@ class ConcatenateDatasetTest(test.TestCase):
     dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
         to_concatenate_components)
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
+    with self.assertRaisesRegexp(TypeError, "have different types"):
+      input_dataset.concatenate(dataset_to_concatenate)
+
+  def testConcatenateDatasetDifferentKeys(self):
+    input_components = {
+        "foo": np.array([[1], [2], [3], [4]]),
+        "bar": np.array([[12], [13], [14], [15]])
+    }
+    to_concatenate_components = {
+        "foo": np.array([[1], [2], [3], [4]]),
+        "baz": np.array([[5], [6], [7], [8]])
+    }
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+
+    with self.assertRaisesRegexp(TypeError, "have different types"):
       input_dataset.concatenate(dataset_to_concatenate)
 
   def testConcatenateDatasetDifferentType(self):
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
index 352424514e..b0414ad655 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -756,7 +756,7 @@ class IteratorTest(test.TestCase):
     # Saving iterator for RangeDataset graph.
     with ops.Graph().as_default() as g:
       init_op, _, save_op, _ = _build_range_dataset_graph()
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         sess.run(save_op)
 
@@ -767,7 +767,7 @@ class IteratorTest(test.TestCase):
     # IteratorResource::set_iterator.
     with ops.Graph().as_default() as g:
       _, _, _, restore_op = _build_reader_dataset_graph()
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(restore_op)
 
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 637bde9ae4..52b4320bf1 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -24,6 +24,7 @@ import warnings
 
 import numpy as np
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -31,6 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -673,6 +675,36 @@ class MapDatasetTest(test.TestCase):
         r"Dataset.map\(\): None."):
       _ = dataset.map(lambda x: None)
 
+  def testBrokenFunctionErrorOnInitialization(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([1.0, 2.0, 3.0])
+
+    def broken_function(_):
+      """A function deliberately designed to fail on instantiation."""
+      value = []
+      tensor_value = attr_value_pb2.AttrValue()
+      tensor_value.tensor.CopyFrom(
+          tensor_util.make_tensor_proto(
+              value, dtype=dtypes.float32, shape=[0], verify_shape=False))
+      dtype_value = attr_value_pb2.AttrValue(type=dtypes.int32.as_datatype_enum)
+
+      # Create a "Const" op with a `tf.float32` value and a `tf.int32` type
+      # attr.
+      const_tensor = ops.get_default_graph().create_op(
+          "Const", [], [dtypes.int32],
+          attrs={
+              "value": tensor_value,
+              "dtype": dtype_value
+          },
+          name="BrokenConst").outputs[0]
+      return const_tensor
+
+    dataset = dataset.map(broken_function)
+    iterator = dataset.make_initializable_iterator()
+
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "BrokenConst"):
+        sess.run(iterator.initializer)
+
 
 class MapDatasetBenchmark(test.Benchmark):
 
diff --git a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
index 0c530522b8..ad87f31b01 100644
--- a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
@@ -203,7 +203,7 @@ class RangeDatasetTest(test.TestCase):
     break_point = 5
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for i in range(start, break_point):
@@ -212,7 +212,7 @@ class RangeDatasetTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         sess.run(restore_op)
         for i in range(break_point, stop):
@@ -223,7 +223,7 @@ class RangeDatasetTest(test.TestCase):
     # Saving and restoring in same session.
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for i in range(start, break_point):
@@ -254,7 +254,7 @@ class RangeDatasetTest(test.TestCase):
     break_epoch = 3
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for _ in range(break_epoch):
@@ -272,7 +272,7 @@ class RangeDatasetTest(test.TestCase):
                                                       output_shapes)
       restore_op = self._restore_op(iterator._iterator_resource)
       get_next = iterator.get_next()
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for i in range(break_point, stop):
           self.assertEqual(i, sess.run(get_next))
@@ -300,7 +300,7 @@ class RangeDatasetTest(test.TestCase):
     break_point = 5
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for i in range(start, break_point):
@@ -311,7 +311,7 @@ class RangeDatasetTest(test.TestCase):
       # Intentionally build a graph with a different value for stop to make sure
       # the original dataset graph is actually getting loaded.
       init_op, get_next, _, restore_op = _build_graph(start, stop_1)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for i in range(break_point, stop):
           self.assertEqual(i, sess.run(get_next))
@@ -338,7 +338,7 @@ class RangeDatasetTest(test.TestCase):
     break_point = 5
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for i in range(start, break_point):
@@ -347,7 +347,7 @@ class RangeDatasetTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         sess.run(restore_op)
         for i in range(break_point, stop):
@@ -373,7 +373,7 @@ class RangeDatasetTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for i in range(start, break_point1):
@@ -382,7 +382,7 @@ class RangeDatasetTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for i in range(break_point1, break_point2):
           self.assertEqual(i, sess.run(get_next))
@@ -391,7 +391,7 @@ class RangeDatasetTest(test.TestCase):
     break_point2 = 7
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for i in range(break_point2, stop):
           self.assertEqual(i, sess.run(get_next))
@@ -417,7 +417,7 @@ class RangeDatasetTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(
           start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
@@ -433,7 +433,7 @@ class RangeDatasetTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for i in range(break_range, stop):
           self.assertEqual(i, sess.run(get_next))
@@ -460,7 +460,7 @@ class RangeDatasetTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(
           start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
@@ -476,7 +476,7 @@ class RangeDatasetTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
diff --git a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
index e99f0a203b..431362aa9a 100644
--- a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
@@ -374,7 +374,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
@@ -401,7 +401,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
@@ -427,7 +427,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
@@ -454,7 +454,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         sess.run(restore_op)
         for epoch in range(num_epochs):
@@ -479,7 +479,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
@@ -506,7 +506,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs_1)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
@@ -529,7 +529,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
@@ -555,7 +555,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       restore_op, get_next_op = self._restore_iterator()
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for epoch in range(num_epochs):
           for f in range(self._num_files):
@@ -574,7 +574,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
@@ -585,7 +585,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         for _ in range(num_epochs * self._num_files * self._num_records):
           sess.run(get_next_op)
@@ -598,7 +598,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(init_op)
         # Note: There is no checkpoint saved currently so a NotFoundError is
         # raised.
@@ -615,7 +615,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
           num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(restore_op)
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next_op)
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 50ba5f403e..57517afae8 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -27,6 +27,7 @@ py_library(
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:random_seed",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 8ba98cb88d..8c37b1871b 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -225,7 +225,7 @@ class Dataset(object):
     `tf.constant` operations. For large datasets (> 1 GB), this can waste
     memory and run into byte limits of graph serialization.  If tensors contains
     one or more large NumPy arrays, consider the alternative described in
-    @{$guide/datasets#consuming_numpy_arrays$this guide}.
+    [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors.
@@ -244,7 +244,7 @@ class Dataset(object):
     `tf.constant` operations. For large datasets (> 1 GB), this can waste
     memory and run into byte limits of graph serialization.  If tensors contains
     one or more large NumPy arrays, consider the alternative described in
-    @{$guide/datasets#consuming_numpy_arrays$this guide}.
+    [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors, each having the same size in the
@@ -1684,15 +1684,14 @@ class ConcatenateDataset(Dataset):
     super(ConcatenateDataset, self).__init__()
     self._input_dataset = input_dataset
     self._dataset_to_concatenate = dataset_to_concatenate
-    nest.assert_same_structure(input_dataset.output_types,
-                               dataset_to_concatenate.output_types)
-    for a, b in zip(
-        nest.flatten(input_dataset.output_types),
-        nest.flatten(dataset_to_concatenate.output_types)):
-      if a != b:
-        raise TypeError(
-            "Two datasets to concatenate have different types %s and %s" %
-            (input_dataset.output_types, dataset_to_concatenate.output_types))
+    if input_dataset.output_types != dataset_to_concatenate.output_types:
+      raise TypeError(
+          "Two datasets to concatenate have different types %s and %s" %
+          (input_dataset.output_types, dataset_to_concatenate.output_types))
+    if input_dataset.output_classes != dataset_to_concatenate.output_classes:
+      raise TypeError(
+          "Two datasets to concatenate have different classes %s and %s" %
+          (input_dataset.output_classes, dataset_to_concatenate.output_classes))
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 5fcc62b60b..39082ce370 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -63,6 +63,41 @@ py_test(
 )
 
 py_library(
+    name = "structure",
+    srcs = ["structure.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nest",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_test(
+    name = "structure_test",
+    size = "small",
+    srcs = ["structure_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nest",
+        ":structure",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variables",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_library(
     name = "convert",
     srcs = ["convert.py"],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index 1b596bdfc0..9d621fcd30 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -129,35 +129,18 @@ def flatten(nest):
   return _pywrap_tensorflow.FlattenForData(nest)
 
 
-def _recursive_assert_same_structure(nest1, nest2, check_types):
-  is_sequence_nest1 = is_sequence(nest1)
-  if is_sequence_nest1 != is_sequence(nest2):
-    raise ValueError(
-        "The two structures don't have the same nested structure. "
-        "First structure: %s, second structure: %s." % (nest1, nest2))
-
-  if is_sequence_nest1:
-    type_nest1 = type(nest1)
-    type_nest2 = type(nest2)
-    if check_types and type_nest1 != type_nest2:
-      raise TypeError(
-          "The two structures don't have the same sequence type. First "
-          "structure has type %s, while second structure has type %s."
-          % (type_nest1, type_nest2))
-
-    for n1, n2 in zip(_yield_value(nest1), _yield_value(nest2)):
-      _recursive_assert_same_structure(n1, n2, check_types)
-
-
 def assert_same_structure(nest1, nest2, check_types=True):
   """Asserts that two structures are nested in the same way.
 
   Args:
     nest1: an arbitrarily nested structure.
     nest2: an arbitrarily nested structure.
-    check_types: if `True` (default) types of sequences are checked as
-      well. If set to `False`, for example a list and a tuple of objects will
-      look same if they have the same size.
+    check_types: if `True` (default) types of sequences should be same as
+      well. For dictionary, "type" of dictionary is considered to include its
+      keys. In other words, two dictionaries with different keys are considered
+      to have a different "type". If set to `False`, two iterables are
+      considered same as long as they yield the elements that have same
+      structures.
 
   Raises:
     ValueError: If the two structures do not have the same number of elements or
@@ -165,13 +148,7 @@ def assert_same_structure(nest1, nest2, check_types=True):
     TypeError: If the two structures differ in the type of sequence in any of
       their substructures. Only possible if `check_types` is `True`.
   """
-  len_nest1 = len(flatten(nest1)) if is_sequence(nest1) else 1
-  len_nest2 = len(flatten(nest2)) if is_sequence(nest2) else 1
-  if len_nest1 != len_nest2:
-    raise ValueError("The two structures don't have the same number of "
-                     "elements. First structure: %s, second structure: %s."
-                     % (nest1, nest2))
-  _recursive_assert_same_structure(nest1, nest2, check_types)
+  _pywrap_tensorflow.AssertSameStructureForData(nest1, nest2, check_types)
 
 
 def _packed_nest_with_indices(structure, flat, index):
diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py
index ff380815a4..616aa9f551 100644
--- a/tensorflow/python/data/util/nest_test.py
+++ b/tensorflow/python/data/util/nest_test.py
@@ -163,21 +163,30 @@ class NestTest(test.TestCase):
     structure2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6"))
     structure_different_num_elements = ("spam", "eggs")
     structure_different_nesting = (((1, 2), 3), 4, 5, (6,))
+    structure_dictionary = {"foo": 2, "bar": 4, "baz": {"foo": 5, "bar": 6}}
+    structure_dictionary_diff_nested = {
+        "foo": 2,
+        "bar": 4,
+        "baz": {
+            "foo": 5,
+            "baz": 6
+        }
+    }
     nest.assert_same_structure(structure1, structure2)
     nest.assert_same_structure("abc", 1.0)
     nest.assert_same_structure("abc", np.array([0, 1]))
     nest.assert_same_structure("abc", constant_op.constant([0, 1]))
 
     with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
+                                 "don't have the same nested structure"):
       nest.assert_same_structure(structure1, structure_different_num_elements)
 
     with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
+                                 "don't have the same nested structure"):
       nest.assert_same_structure((0, 1), np.array([0, 1]))
 
     with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
+                                 "don't have the same nested structure"):
       nest.assert_same_structure(0, (0, 1))
 
     with self.assertRaisesRegexp(ValueError,
@@ -203,11 +212,23 @@ class NestTest(test.TestCase):
       nest.assert_same_structure(((3,), 4), (3, (4,)))
 
     structure1_list = {"a": ((1, 2), 3), "b": 4, "c": (5, 6)}
+    structure2_list = {"a": ((1, 2), 3), "b": 4, "d": (5, 6)}
     with self.assertRaisesRegexp(TypeError,
                                  "don't have the same sequence type"):
       nest.assert_same_structure(structure1, structure1_list)
     nest.assert_same_structure(structure1, structure2, check_types=False)
     nest.assert_same_structure(structure1, structure1_list, check_types=False)
+    with self.assertRaisesRegexp(ValueError, "don't have the same set of keys"):
+      nest.assert_same_structure(structure1_list, structure2_list)
+    with self.assertRaisesRegexp(ValueError, "don't have the same set of keys"):
+      nest.assert_same_structure(structure_dictionary,
+                                 structure_dictionary_diff_nested)
+    nest.assert_same_structure(
+        structure_dictionary,
+        structure_dictionary_diff_nested,
+        check_types=False)
+    nest.assert_same_structure(
+        structure1_list, structure2_list, check_types=False)
 
   def testMapStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
diff --git a/tensorflow/python/data/util/structure.py b/tensorflow/python/data/util/structure.py
new file mode 100644
index 0000000000..c5764b8dfe
--- /dev/null
+++ b/tensorflow/python/data/util/structure.py
@@ -0,0 +1,315 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for describing the structure of a `tf.data` type."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import sparse_ops
+
+
+class Structure(object):
+  """Represents structural information, such as type and shape, about a value.
+
+  A `Structure` generalizes the `tf.Tensor.dtype` and `tf.Tensor.shape`
+  properties, so that we can define generic containers of objects including:
+
+  * `tf.Tensor`
+  * `tf.SparseTensor`
+  * Nested structures of the above.
+
+  TODO(b/110122868): In the future, a single `Structure` will replace the
+  `tf.data.Dataset.output_types`, `tf.data.Dataset.output_shapes`,
+  and `tf.data.Dataset.output_classes`, and similar properties and arguments in
+  the `tf.data.Iterator` and `Optional` classes.
+  """
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractproperty
+  def _flat_shapes(self):
+    """A list of shapes matching the shapes of `self._to_tensor_list()`.
+
+    Returns:
+      A list of `tf.TensorShape` objects.
+    """
+    raise NotImplementedError("Structure._flat_shapes")
+
+  @abc.abstractproperty
+  def _flat_types(self):
+    """A list of types matching the types of `self._to_tensor_list()`.
+
+    Returns:
+      A list of `tf.DType` objects.
+    """
+    raise NotImplementedError("Structure._flat_shapes")
+
+  @abc.abstractmethod
+  def is_compatible_with(self, value):
+    """Returns `True` if `value` is compatible with this structure.
+
+    A value `value` is compatible with a structure `s` if
+    `Structure.from_value(value)` would return a structure `t` that is a
+    "subtype" of `s`. A structure `t` is a "subtype" of `s` if:
+
+    * `s` and `t` are instances of the same `Structure` subclass.
+    * The nested structures (if any) of `s` and `t` are the same, according to
+      `tf.contrib.framework.nest.assert_same_structure`, and each nested
+      structure of `t` is a "subtype" of the corresponding nested structure of
+      `s`.
+    * Any `tf.DType` components of `t` are the same as the corresponding
+      components in `s`.
+    * Any `tf.TensorShape` components of `t` are compatible with the
+      corresponding components in `s`, according to
+      `tf.TensorShape.is_compatible_with`.
+
+    Args:
+      value: A potentially structured value.
+
+    Returns:
+      `True` if `value` matches this structure, otherwise `False`.
+    """
+    raise NotImplementedError("Structure.is_compatible_with()")
+
+  @abc.abstractmethod
+  def _to_tensor_list(self, value):
+    """Returns a flat list of `tf.Tensor` representing `value`.
+
+    This method can be used, along with `self._flat_shapes` and
+    `self._flat_types` to represent structured values in lower level APIs
+    (such as plain TensorFlow operations) that do not understand structure.
+
+    Requires: `self.is_compatible_with(value)`.
+
+    Args:
+      value: A value with compatible structure.
+
+    Returns:
+      A flat list of `tf.Tensor` representing `value`.
+    """
+    raise NotImplementedError("Structure._to_tensor_list()")
+
+  @abc.abstractmethod
+  def _from_tensor_list(self, flat_value):
+    """Builds a flat list of `tf.Tensor` into a value matching this structure.
+
+    Requires: The shapes and types of the tensors in `flat_value` must be
+    compatible with `self._flat_shapes` and `self._flat_types` respectively.
+
+    Args:
+      flat_value: A list of `tf.Tensor` with compatible flat structure.
+
+    Returns:
+      A structured object matching this structure.
+    """
+    raise NotImplementedError("Structure._from_tensor_list()")
+
+  @staticmethod
+  def from_value(value):
+    """Returns a `Structure` that represents the given `value`.
+
+    Args:
+      value: A potentially structured value.
+
+    Returns:
+      A `Structure` that is compatible with `value`.
+
+    Raises:
+      TypeError: If a structure cannot be built for `value`, because its type
+        or one of its component types is not supported.
+    """
+
+    # TODO(b/110122868): Add support for custom types, Dataset, and Optional
+    # to this method.
+    if isinstance(
+        value,
+        (sparse_tensor_lib.SparseTensor, sparse_tensor_lib.SparseTensorValue)):
+      return SparseTensorStructure.from_value(value)
+    elif isinstance(value, (tuple, dict)):
+      return NestedStructure.from_value(value)
+    else:
+      try:
+        tensor = ops.convert_to_tensor(value)
+      except (ValueError, TypeError):
+        raise TypeError("Could not build a structure for %r" % value)
+      return TensorStructure.from_value(tensor)
+
+
+# NOTE(mrry): The following classes make extensive use of non-public methods of
+# their base class, so we disable the protected-access lint warning once here.
+# pylint: disable=protected-access
+class NestedStructure(Structure):
+  """Represents a nested structure in which each leaf is a `Structure`."""
+
+  def __init__(self, nested_structure):
+    self._nested_structure = nested_structure
+    self._flat_shapes_list = []
+    self._flat_types_list = []
+    for s in nest.flatten(nested_structure):
+      if not isinstance(s, Structure):
+        raise TypeError("nested_structure must be a (potentially nested) tuple "
+                        "or dictionary of Structure objects.")
+      self._flat_shapes_list.extend(s._flat_shapes)
+      self._flat_types_list.extend(s._flat_types)
+
+  @property
+  def _flat_shapes(self):
+    return self._flat_shapes_list
+
+  @property
+  def _flat_types(self):
+    return self._flat_types_list
+
+  def is_compatible_with(self, value):
+    try:
+      nest.assert_shallow_structure(self._nested_structure, value)
+    except (ValueError, TypeError):
+      return False
+
+    return all(
+        s.is_compatible_with(v) for s, v in zip(
+            nest.flatten(self._nested_structure),
+            nest.flatten_up_to(self._nested_structure, value)))
+
+  def _to_tensor_list(self, value):
+    ret = []
+
+    try:
+      flat_value = nest.flatten_up_to(self._nested_structure, value)
+    except (ValueError, TypeError):
+      raise ValueError("The value %r is not compatible with the nested "
+                       "structure %r." % (value, self._nested_structure))
+
+    for sub_value, structure in zip(flat_value,
+                                    nest.flatten(self._nested_structure)):
+      if not structure.is_compatible_with(sub_value):
+        raise ValueError("Component value %r is not compatible with the nested "
+                         "structure %r." % (sub_value, structure))
+      ret.extend(structure._to_tensor_list(sub_value))
+    return ret
+
+  def _from_tensor_list(self, flat_value):
+    if len(flat_value) != len(self._flat_types):
+      raise ValueError("Expected %d flat values in NestedStructure but got %d."
+                       % (len(self._flat_types), len(flat_value)))
+
+    flat_ret = []
+    for sub_value, structure in zip(flat_value,
+                                    nest.flatten(self._nested_structure)):
+      flat_ret.append(structure._from_tensor_list([sub_value]))
+
+    return nest.pack_sequence_as(self._nested_structure, flat_ret)
+
+  @staticmethod
+  def from_value(value):
+    flat_nested_structure = [
+        Structure.from_value(sub_value) for sub_value in nest.flatten(value)
+    ]
+    return NestedStructure(nest.pack_sequence_as(value, flat_nested_structure))
+
+
+class TensorStructure(Structure):
+  """Represents structural information about a `tf.Tensor`."""
+
+  def __init__(self, dtype, shape):
+    self._dtype = dtypes.as_dtype(dtype)
+    self._shape = tensor_shape.as_shape(shape)
+
+  @property
+  def _flat_shapes(self):
+    return [self._shape]
+
+  @property
+  def _flat_types(self):
+    return [self._dtype]
+
+  def is_compatible_with(self, value):
+    try:
+      value = ops.convert_to_tensor(value, dtype=self._dtype)
+    except (ValueError, TypeError):
+      return False
+
+    return (self._dtype.is_compatible_with(value.dtype) and
+            self._shape.is_compatible_with(value.shape))
+
+  def _to_tensor_list(self, value):
+    if not self.is_compatible_with(value):
+      raise ValueError("Value %r is not convertible to a tensor with dtype %s "
+                       "and shape %s." % (value, self._dtype, self._shape))
+    return [value]
+
+  def _from_tensor_list(self, flat_value):
+    if len(flat_value) != 1:
+      raise ValueError("TensorStructure corresponds to a single tf.Tensor.")
+    if not self.is_compatible_with(flat_value[0]):
+      raise ValueError("Cannot convert %r to a tensor with dtype %s and shape "
+                       "%s." % (flat_value[0], self._dtype, self._shape))
+    return flat_value[0]
+
+  @staticmethod
+  def from_value(value):
+    return TensorStructure(value.dtype, value.shape)
+
+
+class SparseTensorStructure(Structure):
+  """Represents structural information about a `tf.SparseTensor`."""
+
+  def __init__(self, dtype, dense_shape):
+    self._dtype = dtypes.as_dtype(dtype)
+    self._dense_shape = tensor_shape.as_shape(dense_shape)
+
+  @property
+  def _flat_shapes(self):
+    return [tensor_shape.vector(3)]
+
+  @property
+  def _flat_types(self):
+    return [dtypes.variant]
+
+  def is_compatible_with(self, value):
+    try:
+      value = sparse_tensor_lib.SparseTensor.from_value(value)
+    except TypeError:
+      return False
+    return (isinstance(value, (sparse_tensor_lib.SparseTensor,
+                               sparse_tensor_lib.SparseTensorValue)) and
+            self._dtype.is_compatible_with(value.dtype) and
+            self._dense_shape.is_compatible_with(
+                tensor_util.constant_value_as_shape(value.dense_shape)))
+
+  def _to_tensor_list(self, value):
+    return [sparse_ops.serialize_sparse(value, out_type=dtypes.variant)]
+
+  def _from_tensor_list(self, flat_value):
+    if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
+        not flat_value[0].shape.is_compatible_with(tensor_shape.vector(3))):
+      raise ValueError("SparseTensorStructure corresponds to a single "
+                       "tf.variant vector of length 3.")
+    return sparse_ops.deserialize_sparse(
+        flat_value[0], dtype=self._dtype, rank=self._dense_shape.ndims)
+
+  @staticmethod
+  def from_value(value):
+    sparse_tensor = sparse_tensor_lib.SparseTensor.from_value(value)
+    return SparseTensorStructure(
+        sparse_tensor.dtype,
+        tensor_util.constant_value_as_shape(sparse_tensor.dense_shape))
diff --git a/tensorflow/python/data/util/structure_test.py b/tensorflow/python/data/util/structure_test.py
new file mode 100644
index 0000000000..d0c7df67ae
--- /dev/null
+++ b/tensorflow/python/data/util/structure_test.py
@@ -0,0 +1,327 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utilities working with arbitrarily nested structures."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class StructureTest(test.TestCase, parameterized.TestCase):
+  # pylint disable=protected-access
+
+  @parameterized.parameters(
+      (constant_op.constant(37.0), structure.TensorStructure, [dtypes.float32],
+       [[]]), (sparse_tensor.SparseTensor(
+           indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+               structure.SparseTensorStructure, [dtypes.variant], [[3]]),
+      ((constant_op.constant(37.0), constant_op.constant([1, 2, 3])),
+       structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]), ({
+           "a": constant_op.constant(37.0),
+           "b": constant_op.constant([1, 2, 3])
+       }, structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]),
+      ({
+          "a":
+              constant_op.constant(37.0),
+          "b": (sparse_tensor.SparseTensor(
+              indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+                sparse_tensor.SparseTensor(
+                    indices=[[3, 4]], values=[-1], dense_shape=[4, 5]))
+      }, structure.NestedStructure,
+       [dtypes.float32, dtypes.variant, dtypes.variant], [[], [3], [3]]))
+  def testFlatStructure(self, value, expected_structure, expected_types,
+                        expected_shapes):
+    s = structure.Structure.from_value(value)
+    self.assertIsInstance(s, expected_structure)
+    self.assertEqual(expected_types, s._flat_types)
+    self.assertEqual(expected_shapes, s._flat_shapes)
+
+  @parameterized.parameters(
+      (constant_op.constant(37.0), [
+          constant_op.constant(38.0),
+          array_ops.placeholder(dtypes.float32),
+          variables.Variable(100.0), 42.0,
+          np.array(42.0, dtype=np.float32)
+      ], [constant_op.constant([1.0, 2.0]),
+          constant_op.constant(37)]),
+      (sparse_tensor.SparseTensor(
+          indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+       [
+           sparse_tensor.SparseTensor(
+               indices=[[1, 1], [3, 4]], values=[10, -1], dense_shape=[4, 5]),
+           sparse_tensor.SparseTensorValue(
+               indices=[[1, 1], [3, 4]], values=[10, -1], dense_shape=[4, 5]),
+           array_ops.sparse_placeholder(dtype=dtypes.int32),
+           array_ops.sparse_placeholder(dtype=dtypes.int32, shape=[None, None])
+       ], [
+           constant_op.constant(37, shape=[4, 5]),
+           sparse_tensor.SparseTensor(
+               indices=[[3, 4]], values=[-1], dense_shape=[5, 6]),
+           array_ops.sparse_placeholder(
+               dtype=dtypes.int32, shape=[None, None, None]),
+           sparse_tensor.SparseTensor(
+               indices=[[3, 4]], values=[-1.0], dense_shape=[4, 5])
+       ]),
+      ({
+          "a": constant_op.constant(37.0),
+          "b": constant_op.constant([1, 2, 3])
+      }, [{
+          "a": constant_op.constant(15.0),
+          "b": constant_op.constant([4, 5, 6])
+      }], [{
+          "a": constant_op.constant(15.0),
+          "b": constant_op.constant([4, 5, 6, 7])
+      }, {
+          "a": constant_op.constant(15),
+          "b": constant_op.constant([4, 5, 6])
+      }, {
+          "a":
+              constant_op.constant(15),
+          "b":
+              sparse_tensor.SparseTensor(
+                  indices=[[0], [1], [2]], values=[4, 5, 6], dense_shape=[3])
+      }, (constant_op.constant(15.0), constant_op.constant([4, 5, 6]))]),
+  )
+  def testIsCompatibleWith(self, original_value, compatible_values,
+                           incompatible_values):
+    s = structure.Structure.from_value(original_value)
+    for compatible_value in compatible_values:
+      self.assertTrue(s.is_compatible_with(compatible_value))
+    for incompatible_value in incompatible_values:
+      self.assertFalse(s.is_compatible_with(incompatible_value))
+
+  # NOTE(mrry): The arguments must be lifted into lambdas because otherwise they
+  # will be executed before the (eager- or graph-mode) test environment has been
+  # set up.
+  # pylint: disable=g-long-lambda
+  @parameterized.parameters(
+      (lambda: constant_op.constant(37.0),),
+      (lambda: sparse_tensor.SparseTensor(
+          indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),),
+      (lambda: {"a": constant_op.constant(37.0),
+                "b": constant_op.constant([1, 2, 3])},),
+      (lambda: {"a": constant_op.constant(37.0),
+                "b": (sparse_tensor.SparseTensor(
+                    indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+                      sparse_tensor.SparseTensor(
+                          indices=[[3, 4]], values=[-1], dense_shape=[4, 5]))
+               },),
+      )
+  def testRoundTripConversion(self, value_fn):
+    value = value_fn()
+    s = structure.Structure.from_value(value)
+    before = self.evaluate(value)
+    after = self.evaluate(s._from_tensor_list(s._to_tensor_list(value)))
+
+    flat_before = nest.flatten(before)
+    flat_after = nest.flatten(after)
+    for b, a in zip(flat_before, flat_after):
+      if isinstance(b, sparse_tensor.SparseTensorValue):
+        self.assertAllEqual(b.indices, a.indices)
+        self.assertAllEqual(b.values, a.values)
+        self.assertAllEqual(b.dense_shape, a.dense_shape)
+      else:
+        self.assertAllEqual(b, a)
+  # pylint: enable=g-long-lambda
+
+  def testIncompatibleStructure(self):
+    # Define three mutually incompatible values/structures, and assert that:
+    # 1. Using one structure to flatten a value with an incompatible structure
+    #    fails.
+    # 2. Using one structure to restructre a flattened value with an
+    #    incompatible structure fails.
+    value_tensor = constant_op.constant(42.0)
+    s_tensor = structure.Structure.from_value(value_tensor)
+    flat_tensor = s_tensor._to_tensor_list(value_tensor)
+
+    value_sparse_tensor = sparse_tensor.SparseTensor(
+        indices=[[0, 0]], values=[1], dense_shape=[1, 1])
+    s_sparse_tensor = structure.Structure.from_value(value_sparse_tensor)
+    flat_sparse_tensor = s_sparse_tensor._to_tensor_list(value_sparse_tensor)
+
+    value_nest = {
+        "a": constant_op.constant(37.0),
+        "b": constant_op.constant([1, 2, 3])
+    }
+    s_nest = structure.Structure.from_value(value_nest)
+    flat_nest = s_nest._to_tensor_list(value_nest)
+
+    with self.assertRaisesRegexp(
+        ValueError, r"SparseTensor.* is not convertible to a tensor with "
+        r"dtype.*float32.* and shape \(\)"):
+      s_tensor._to_tensor_list(value_sparse_tensor)
+    with self.assertRaisesRegexp(
+        ValueError, r"Value \{.*\} is not convertible to a tensor with "
+        r"dtype.*float32.* and shape \(\)"):
+      s_tensor._to_tensor_list(value_nest)
+
+    with self.assertRaisesRegexp(TypeError, "Input must be a SparseTensor"):
+      s_sparse_tensor._to_tensor_list(value_tensor)
+
+    with self.assertRaisesRegexp(TypeError, "Input must be a SparseTensor"):
+      s_sparse_tensor._to_tensor_list(value_nest)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Tensor.* not compatible with the nested structure "
+        ".*TensorStructure.*TensorStructure"):
+      s_nest._to_tensor_list(value_tensor)
+
+    with self.assertRaisesRegexp(
+        ValueError, "SparseTensor.* not compatible with the nested structure "
+        ".*TensorStructure.*TensorStructure"):
+      s_nest._to_tensor_list(value_sparse_tensor)
+
+    with self.assertRaisesRegexp(
+        ValueError, r"Cannot convert.*with dtype.*float32.* and shape \(\)"):
+      s_tensor._from_tensor_list(flat_sparse_tensor)
+
+    with self.assertRaisesRegexp(
+        ValueError, "TensorStructure corresponds to a single tf.Tensor."):
+      s_tensor._from_tensor_list(flat_nest)
+
+    with self.assertRaisesRegexp(
+        ValueError, "SparseTensorStructure corresponds to a single tf.variant "
+        "vector of length 3."):
+      s_sparse_tensor._from_tensor_list(flat_tensor)
+
+    with self.assertRaisesRegexp(
+        ValueError, "SparseTensorStructure corresponds to a single tf.variant "
+        "vector of length 3."):
+      s_sparse_tensor._from_tensor_list(flat_nest)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Expected 2 flat values in NestedStructure but got 1."):
+      s_nest._from_tensor_list(flat_tensor)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Expected 2 flat values in NestedStructure but got 1."):
+      s_nest._from_tensor_list(flat_sparse_tensor)
+
+  def testIncompatibleNestedStructure(self):
+    # Define three mutually incompatible nested values/structures, and assert
+    # that:
+    # 1. Using one structure to flatten a value with an incompatible structure
+    #    fails.
+    # 2. Using one structure to restructre a flattened value with an
+    #    incompatible structure fails.
+
+    value_0 = {
+        "a": constant_op.constant(37.0),
+        "b": constant_op.constant([1, 2, 3])
+    }
+    s_0 = structure.Structure.from_value(value_0)
+    flat_s_0 = s_0._to_tensor_list(value_0)
+
+    # `value_1` has compatible nested structure with `value_0`, but different
+    # classes.
+    value_1 = {
+        "a":
+            constant_op.constant(37.0),
+        "b":
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0]], values=[1], dense_shape=[1, 1])
+    }
+    s_1 = structure.Structure.from_value(value_1)
+    flat_s_1 = s_1._to_tensor_list(value_1)
+
+    # `value_2` has incompatible nested structure with `value_0` and `value_1`.
+    value_2 = {
+        "a":
+            constant_op.constant(37.0),
+        "b": (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+              sparse_tensor.SparseTensor(
+                  indices=[[3, 4]], values=[-1], dense_shape=[4, 5]))
+    }
+    s_2 = structure.Structure.from_value(value_2)
+    flat_s_2 = s_2._to_tensor_list(value_2)
+
+    with self.assertRaisesRegexp(
+        ValueError, "SparseTensor.* not compatible with the nested structure "
+        ".*TensorStructure"):
+      s_0._to_tensor_list(value_1)
+
+    with self.assertRaisesRegexp(
+        ValueError, "SparseTensor.*SparseTensor.* not compatible with the "
+        "nested structure .*TensorStructure"):
+      s_0._to_tensor_list(value_2)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Tensor.* not compatible with the nested structure "
+        ".*SparseTensorStructure"):
+      s_1._to_tensor_list(value_0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "SparseTensor.*SparseTensor.* not compatible with the "
+        "nested structure .*TensorStructure"):
+      s_0._to_tensor_list(value_2)
+
+    # NOTE(mrry): The repr of the dictionaries is not sorted, so the regexp
+    # needs to account for "a" coming before or after "b". It might be worth
+    # adding a deterministic repr for these error messages (among other
+    # improvements).
+    with self.assertRaisesRegexp(
+        ValueError, "Tensor.*Tensor.* not compatible with the nested structure "
+        ".*(TensorStructure.*SparseTensorStructure.*SparseTensorStructure|"
+        "SparseTensorStructure.*SparseTensorStructure.*TensorStructure)"):
+      s_2._to_tensor_list(value_0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "(Tensor.*SparseTensor|SparseTensor.*Tensor).* "
+        "not compatible with the nested structure .*"
+        "(TensorStructure.*SparseTensorStructure.*SparseTensorStructure|"
+        "SparseTensorStructure.*SparseTensorStructure.*TensorStructure)"):
+      s_2._to_tensor_list(value_1)
+
+    with self.assertRaisesRegexp(
+        ValueError, r"Cannot convert.*with dtype.*int32.* and shape \(3,\)"):
+      s_0._from_tensor_list(flat_s_1)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Expected 2 flat values in NestedStructure but got 3."):
+      s_0._from_tensor_list(flat_s_2)
+
+    with self.assertRaisesRegexp(
+        ValueError, "SparseTensorStructure corresponds to a single tf.variant "
+        "vector of length 3."):
+      s_1._from_tensor_list(flat_s_0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Expected 2 flat values in NestedStructure but got 3."):
+      s_1._from_tensor_list(flat_s_2)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Expected 3 flat values in NestedStructure but got 2."):
+      s_2._from_tensor_list(flat_s_0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Expected 3 flat values in NestedStructure but got 2."):
+      s_2._from_tensor_list(flat_s_1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 27b8ebd362..55d2709845 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -576,7 +576,6 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_windows",
-        "nomac",
         "oss_serial",
     ],
     deps = [
@@ -936,7 +935,6 @@ py_test(
     size = "small",
     srcs = ["cli/profile_analyzer_cli_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":debugger_cli_common",
         ":profile_analyzer_cli",
@@ -1048,7 +1046,6 @@ cuda_py_test(
     tags = [
         "no_oss",  # Incompatible with bazel_pip.
         "no_windows",
-        "nomac",  # TODO(cais): Install of futures and grpcio on all macs.
         "notsan",
     ],
 )
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index 34da44b60d..242215dccb 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Public Python API of TensorFlow Debugger (tfdbg).
 
-See the @{$python/tfdbg} guide.
+See the [TFDBG](https://tensorflow.org/api_guides/python/tfdbg) guide.
 
 @@add_debug_tensor_watch
 @@watch_graph
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 68d8b8d13b..ebfcd085e6 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -9,6 +9,25 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_library(
+    name = "distribute",
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":distribute_config",
+        ":distribute_coordinator",
+        ":distribute_coordinator_context",
+    ],
+)
+
+py_library(
+    name = "distribute_config",
+    srcs = [
+        "distribute_config.py",
+    ],
+    deps = [],
+)
+
+py_library(
     name = "distribute_coordinator",
     srcs = [
         "distribute_coordinator.py",
@@ -41,3 +60,57 @@ py_test(
         "//tensorflow/python:variables",
     ],
 )
+
+py_library(
+    name = "distribute_coordinator_context",
+    srcs = [
+        "distribute_coordinator_context.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [],
+)
+
+py_library(
+    name = "multi_worker_util",
+    srcs = [
+        "multi_worker_util.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "multi_worker_util_test",
+    srcs = ["multi_worker_util_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":multi_worker_util",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+# Used only by estimator.
+py_library(
+    name = "estimator_training",
+    srcs = [
+        "estimator_training.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":distribute_coordinator",
+        ":distribute_coordinator_context",
+        "//tensorflow/python:training",
+    ],
+)
diff --git a/tensorflow/python/distribute/distribute_config.py b/tensorflow/python/distribute/distribute_config.py
new file mode 100644
index 0000000000..fac35742fe
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_config.py
@@ -0,0 +1,45 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A configure tuple for high-level APIs for running distribution strategies."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+
+class DistributeConfig(
+    collections.namedtuple(
+        'DistributeConfig',
+        ['train_distribute', 'eval_distribute', 'remote_cluster'])):
+  """A config tuple for distribution strategies.
+
+  Attributes:
+    train_distribute: a `DistributionStrategy` object for training.
+    eval_distribute: an optional `DistributionStrategy` object for
+      evaluation.
+    remote_cluster: a dict, `ClusterDef` or `ClusterSpec` object specifying
+      the cluster configurations. If this is given, the `train_and_evaluate`
+      method will be running as a standalone client which connects to the
+      cluster for training.
+  """
+
+  def __new__(cls,
+              train_distribute=None,
+              eval_distribute=None,
+              remote_cluster=None):
+    return super(DistributeConfig, cls).__new__(cls, train_distribute,
+                                                eval_distribute, remote_cluster)
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index fc9ca4ac4a..46cdd64a6e 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A unified and split coordinator for distributed TensorFlow."""
+"""A component for running distributed TensorFlow."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,8 +22,13 @@ import copy
 import json
 import os
 import threading
+import time
 
 from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.client import session
+from tensorflow.python.distribute import distribute_coordinator_context
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import monitored_session
 from tensorflow.python.training import server_lib
 
 
@@ -43,23 +48,12 @@ class CoordinatorMode(object):
   # client and connects to remote servers for training.  Each remote server can
   # use the distribute coordinator binary with task_type set correctly which
   # will then turn into standard servers.
-  SPLIT_CLIENT = 0
+  STANDALONE_CLIENT = "standalone_client"
 
   # The distribute coordinator runs on each worker. It will run a standard
   # server on each worker and optionally run the `worker_fn` that is configured
   # to talk to its standard server.
-  INDEPENDENT_WORKER = 1
-
-
-_worker_context = threading.local()
-
-
-def get_current_worker_context():
-  """Returns the current task context."""
-  try:
-    return _worker_context.current
-  except AttributeError:
-    return None
+  INDEPENDENT_WORKER = "independent_worker"
 
 
 class _Barrier(object):
@@ -113,14 +107,17 @@ class _WorkerContext(object):
   """
 
   def __init__(self,
+               strategy,
                cluster_spec,
                task_type,
                task_id,
+               session_config=None,
                rpc_layer="grpc",
                worker_barrier=None):
     """Initialize the worker context object.
 
     Args:
+      strategy: a `DistributionStrategy` object.
       cluster_spec: a ClusterSpec object. It can be empty or None in the local
         training case.
       task_type: a string indicating the role of the corresponding task, such as
@@ -128,14 +125,17 @@ class _WorkerContext(object):
         replicated training.
       task_id: an integer indicating id of the corresponding task. It can be
         None if it is local training or in-graph replicated training.
+      session_config: an optional @{tf.ConfigProto} object.
       rpc_layer: optional string specifying the RPC protocol for communication
         with worker masters. If None or empty, hosts in the `cluster_spec` will
         be used directly.
       worker_barrier: optional, the barrier object for worker synchronization.
     """
+    self._strategy = strategy
     self._cluster_spec = cluster_spec
     self._task_type = task_type
     self._task_id = task_id
+    self._session_config = session_config
     self._worker_barrier = worker_barrier
     self._rpc_layer = rpc_layer
     self._master_target = self._get_master_target()
@@ -143,26 +143,31 @@ class _WorkerContext(object):
     self._is_chief_node = self._is_chief()
 
   def _debug_message(self):
-    return "[cluster_spec: %r, task_type: %r, task_id: %r]" % (
-        self._cluster_spec, self.task_type, self.task_id)
+    if self._cluster_spec:
+      return "[cluster_spec: %r, task_type: %r, task_id: %r]" % (
+          self._cluster_spec, self.task_type, self.task_id)
+    else:
+      return "[local]"
 
   def __enter__(self):
-    old_context = get_current_worker_context()
+    old_context = distribute_coordinator_context.get_current_worker_context()
     if old_context:
       raise ValueError(
           "You cannot run distribute coordinator in a `worker_fn`.\t" +
           self._debug_message())
-    _worker_context.current = self
+    # pylint: disable=protected-access
+    distribute_coordinator_context._worker_context.current = self
 
   def __exit__(self, unused_exception_type, unused_exception_value,
                unused_traceback):
-    _worker_context.current = None
+    # pylint: disable=protected-access
+    distribute_coordinator_context._worker_context.current = None
 
   def _get_master_target(self):
     """Return the master target for a task."""
     # If cluster_spec is None or empty, we use local master.
     if not self._cluster_spec:
-      return "local"
+      return ""
 
     # If task_type is None, then it is in-graph replicated training. In this
     # case we use the chief or first worker's master target.
@@ -207,6 +212,47 @@ class _WorkerContext(object):
                        self._debug_message())
     self._worker_barrier.wait()
 
+  def session_creator(self,
+                      scaffold=None,
+                      config=None,
+                      checkpoint_dir=None,
+                      checkpoint_filename_with_path=None,
+                      max_wait_secs=7200):
+    """Returns a session creator.
+
+    The returned session creator will be configured with the correct master
+    target and session configs. It will also run either init ops or ready ops
+    by querying the `strategy` object when `create_session` is called on it.
+
+    Args:
+      scaffold: A `Scaffold` used for gathering or building supportive ops. If
+        not specified a default one is created. It's used to finalize the graph.
+      config: `ConfigProto` proto used to configure the session.
+      checkpoint_dir: A string. Optional path to a directory where to restore
+        variables.
+      checkpoint_filename_with_path: Full file name path to the checkpoint file.
+        Only one of `checkpoint_dir` or `checkpoint_filename_with_path` can be
+        specified.
+      max_wait_secs: Maximum time to wait for the session to become available.
+
+    Returns:
+      a descendant of SessionCreator.
+    """
+    # TODO(yuefengz): merge session config.
+    if self._strategy.should_init:
+      return monitored_session.ChiefSessionCreator(
+          scaffold,
+          master=self.master_target,
+          config=config or self._session_config,
+          checkpoint_dir=checkpoint_dir,
+          checkpoint_filename_with_path=checkpoint_filename_with_path)
+    else:
+      return monitored_session.WorkerSessionCreator(
+          scaffold,
+          master=self.master_target,
+          config=config or self._session_config,
+          max_wait_secs=max_wait_secs)
+
   @property
   def has_barrier(self):
     """Whether the barrier is set or not."""
@@ -247,46 +293,91 @@ class _WorkerContext(object):
     """Returns number of workers in the cluster, including chief."""
     return self._num_workers
 
+  @property
+  def should_checkpoint(self):
+    """Whether to save checkpoint."""
+    return self._strategy.should_checkpoint
+
+  @property
+  def should_save_summary(self):
+    """Whether to save summaries."""
+    return self._strategy.should_save_summary
+
 
 def _run_single_worker(worker_fn,
+                       strategy,
                        cluster_spec,
                        task_type,
                        task_id,
-                       rpc_layer,
+                       session_config,
+                       rpc_layer="",
                        worker_barrier=None):
   """Runs a single worker by calling `worker_fn` under context."""
-  with _WorkerContext(
+  strategy = copy.deepcopy(strategy)
+  # If there is an EVALUATOR task, we run single-machine eval on that task.
+  if task_type == _TaskType.EVALUATOR:
+    strategy.configure(session_config)
+  else:
+    strategy.configure(session_config, cluster_spec, task_type, task_id)
+  context = _WorkerContext(
+      strategy,
       cluster_spec,
       task_type,
       task_id,
+      session_config=session_config,
       rpc_layer=rpc_layer,
-      worker_barrier=worker_barrier):
-    worker_fn()
+      worker_barrier=worker_barrier)
+  with context:
+    worker_fn(strategy)
 
 
 def _run_std_server(cluster_spec=None,
                     task_type=None,
                     task_id=None,
                     session_config=None,
-                    rpc_layer=None):
+                    rpc_layer=None,
+                    environment=None):
   """Runs a standard server."""
-  server = server_lib.Server(
-      cluster_spec,
-      job_name=task_type,
-      task_index=task_id,
-      config=session_config,
-      protocol=rpc_layer)
-  server.start()
-  return server
-
 
-def _run_between_graph_client(worker_fn, cluster_spec, rpc_layer):
+  class _FakeServer(object):
+    """A fake server that runs a master session."""
+
+    def start(self):
+      assert cluster_spec
+      target = cluster_spec.task_address(task_type, task_id)
+      if rpc_layer:
+        target = rpc_layer + "://" + target
+      # A tensorflow server starts when a remote session is created.
+      session.Session(target=target, config=session_config)
+
+    def join(self):
+      while True:
+        time.sleep(5)
+
+  if environment == "google":
+    server = _FakeServer()
+    server.start()
+    return server
+  else:
+    server = server_lib.Server(
+        cluster_spec,
+        job_name=task_type,
+        task_index=task_id,
+        config=session_config,
+        protocol=rpc_layer)
+    server.start()
+    return server
+
+
+def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
+                              cluster_spec, session_config, rpc_layer):
   """Runs a standalone client for between-graph replication."""
   eval_thread = None
   if _TaskType.EVALUATOR in cluster_spec.jobs:
     eval_thread = threading.Thread(
         target=_run_single_worker,
-        args=(worker_fn, cluster_spec, _TaskType.EVALUATOR, 0),
+        args=(eval_fn, eval_strategy, None, _TaskType.EVALUATOR, 0,
+              session_config),
         kwargs={
             "rpc_layer": rpc_layer,
         })
@@ -298,7 +389,8 @@ def _run_between_graph_client(worker_fn, cluster_spec, rpc_layer):
     for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
       t = threading.Thread(
           target=_run_single_worker,
-          args=(worker_fn, cluster_spec, task_type, task_id),
+          args=(worker_fn, strategy, cluster_spec, task_type, task_id,
+                session_config),
           kwargs={
               "rpc_layer": rpc_layer,
               "worker_barrier": worker_barrier
@@ -315,43 +407,55 @@ def _run_between_graph_client(worker_fn, cluster_spec, rpc_layer):
     eval_thread.join()
 
 
-def _run_in_graph_client(worker_fn, cluster_spec, rpc_layer):
+def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
+                         cluster_spec, session_config, rpc_layer):
   """Runs a standalone client for in-graph replication."""
   eval_thread = None
   if _TaskType.EVALUATOR in cluster_spec.jobs:
     eval_thread = threading.Thread(
         target=_run_single_worker,
-        args=(worker_fn, cluster_spec, _TaskType.EVALUATOR, 0),
+        args=(eval_fn, eval_strategy, cluster_spec, _TaskType.EVALUATOR, 0,
+              session_config),
         kwargs={
             "rpc_layer": rpc_layer,
         })
     eval_thread.start()
 
-  _run_single_worker(worker_fn, cluster_spec, None, None, rpc_layer)
+  _run_single_worker(
+      worker_fn,
+      strategy,
+      cluster_spec,
+      None,
+      None,
+      session_config,
+      rpc_layer=rpc_layer)
   if eval_thread:
     eval_thread.join()
 
-
-# TODO(yuefengz): propagate cluster_spec in the SPLIT_CLIENT mode.
+# TODO(yuefengz): propagate cluster_spec in the STANDALONE_CLIENT mode.
 # TODO(yuefengz): we may need a smart way to figure out whether the current task
 # is the special task when we support cluster_spec propagation.
 def run_distribute_coordinator(worker_fn,
-                               mode=CoordinatorMode.SPLIT_CLIENT,
+                               strategy,
+                               eval_fn=None,
+                               eval_strategy=None,
+                               mode=CoordinatorMode.STANDALONE_CLIENT,
                                cluster_spec=None,
                                task_type=None,
                                task_id=None,
-                               between_graph=False,
+                               session_config=None,
                                rpc_layer="grpc"):
   """Runs the coordinator for distributed TensorFlow.
 
   This function runs a split coordinator for distributed TensorFlow in its
-  default mode, i.e the SPLIT_CLIENT mode. Given a `cluster_spec` specifying
-  server addresses and their roles in a cluster, this coordinator will figure
-  out how to set them up, give the underlying function the right targets for
-  master sessions via a scope object and coordinate their training. The cluster
-  consisting of standard servers needs to be brought up either with the standard
-  server binary or with a binary running distribute coordinator with `task_type`
-  set to non-client type which will then turn into standard servers.
+  default mode, i.e the STANDALONE_CLIENT mode. Given a `cluster_spec`
+  specifying server addresses and their roles in a cluster, this coordinator
+  will figure out how to set them up, give the underlying function the right
+  targets for master sessions via a scope object and coordinate their training.
+  The cluster consisting of standard servers needs to be brought up either with
+  the standard server binary or with a binary running distribute coordinator
+  with `task_type` set to non-client type which will then turn into standard
+  servers.
 
   In addition to be the distribute coordinator, this is also the source of
   configurations for each job in the distributed training. As there are multiple
@@ -370,6 +474,14 @@ def run_distribute_coordinator(worker_fn,
   `worker_fn` depending whether it is between-graph training or in-graph
   replicated training.
 
+  The `strategy` object is expected to be a DistributionStrategy object which
+  has implemented methods needed by distributed coordinator such as
+  `configure(session_config, cluster_spec, task_type, task_id)` which configures
+  the strategy object for a specific task and `should_init` property which
+  instructs the distribute coordinator whether to run init ops for a task. The
+  distribute coordinator will make a copy of the `strategy` object, call its
+  `configure` method and pass it to `worker_fn` as an argument.
+
   The `worker_fn` defines the training logic and is called under a its own
   worker context which can be accessed to via `get_current_worker_context`. A
   worker context provides access to configurations for each task, e.g. the
@@ -407,22 +519,30 @@ def run_distribute_coordinator(worker_fn,
   If `cluster_spec` is not given in any format, it becomes local training and
   this coordinator will connect to a local session.
 
-  For evaluation, if "evaluator" exist in the cluster_spec, a separate thread
-  will be created with its `task_type` set to "evaluator". If "evaluator" is not
-  set in the cluster_spec, it entirely depends on the `worker_fn` for how to do
-  evaluation.
+  For evaluation, if "evaluator" exists in the cluster_spec, a separate thread
+  will be created to call `eval_fn` with its `task_type` set to "evaluator". If
+  `eval_fn` is not defined, fall back to `worker_fn`. This implies that
+  evaluation will be done on a single machine if there is an "evaluator" task.
+  If "evaluator" doesn't exit in the cluster_spec, it entirely depends on the
+  `worker_fn` for how to do evaluation.
 
   Args:
-    worker_fn: the function to be called and given the access to a coordinator
-      context object.
+    worker_fn: the function to be called. The function should accept a
+      `strategy` object and will be given access to a context object via a
+      context manager scope.
+    strategy: a DistributionStrategy object which specifying whether it should
+      run between-graph replicated training or not, whether to run init ops,
+      etc. This object will also be configured given `session_config`,
+      `cluster_spc`, `task_type` and `task_id`.
+    eval_fn: optional function for "evaluator" task.
+    eval_strategy: optional DistributionStrategy object for "evaluator" task.
     mode: in which mode this distribute coordinator runs.
     cluster_spec: a dict, ClusterDef or ClusterSpec specifying servers and roles
       in a cluster. If not set or empty, fall back to local training.
     task_type: the current task type, optional if this is a client.
     task_id: the current task id, optional if this is a client.
-    between_graph: a boolean. It is only useful when `cluster_spec` is set and
-      not empty. If true, it will use between-graph replicated training;
-      otherwise it will use in-graph replicated training.
+    session_config: an optional @{tf.ConfigProto} object which will be passed
+      to `strategy`'s `configure` method and used to create a session.
     rpc_layer: optional string, the protocol for RPC, e.g. "grpc".
 
   Raises:
@@ -446,44 +566,76 @@ def run_distribute_coordinator(worker_fn,
           "`tf.train.ClusterDef` object")
     # TODO(yuefengz): validate cluster_spec.
 
+  rpc_layer = tf_config.get("rpc_layer", rpc_layer)
+  environment = tf_config.get("environment", None)
+
+  if cluster_spec:
+    logging.info(
+        "Running Distribute Coordinator with mode = %r, cluster_spec = %r, "
+        "task_type = %r, task_id = %r, environment = %r, rpc_layer = %r", mode,
+        cluster_spec.as_dict(), task_type, task_id, environment, rpc_layer)
+
   if not cluster_spec:
     # `mode` is ignored in the local case.
-    _run_single_worker(worker_fn, None, None, None, rpc_layer)
-  elif mode == CoordinatorMode.SPLIT_CLIENT:
+    logging.info("Running local Distribute Coordinator.")
+    _run_single_worker(worker_fn, strategy, None, None, None, session_config,
+                       rpc_layer)
+    if eval_fn:
+      _run_single_worker(eval_fn, eval_strategy or strategy, None, None, None,
+                         session_config, rpc_layer)
+  elif mode == CoordinatorMode.STANDALONE_CLIENT:
+    eval_fn = eval_fn or worker_fn
+    eval_strategy = eval_strategy or strategy
+
     # The client must know the cluster but servers in the cluster don't have to
     # know the client.
     if task_type in [_TaskType.CLIENT, None]:
-      if between_graph:
-        _run_between_graph_client(worker_fn, cluster_spec, rpc_layer)
+      if strategy.between_graph:
+        _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
+                                  cluster_spec, session_config, rpc_layer)
       else:
-        _run_in_graph_client(worker_fn, cluster_spec, rpc_layer)
+        _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
+                             cluster_spec, session_config, rpc_layer)
     else:
       # If not a client job, run the standard server.
       server = _run_std_server(
-          cluster_spec=cluster_spec, task_type=task_type, task_id=task_id)
+          cluster_spec=cluster_spec,
+          task_type=task_type,
+          task_id=task_id,
+          rpc_layer=rpc_layer,
+          environment=environment)
       server.join()
   else:
     if mode != CoordinatorMode.INDEPENDENT_WORKER:
       raise ValueError("Unexpected coordinator mode: %r" % mode)
 
+    eval_fn = eval_fn or worker_fn
+    eval_strategy = eval_strategy or strategy
+
     # Every one starts a standard server.
     server = _run_std_server(
-        cluster_spec=cluster_spec, task_type=task_type, task_id=task_id)
+        cluster_spec=cluster_spec,
+        task_type=task_type,
+        task_id=task_id,
+        rpc_layer=rpc_layer,
+        environment=environment)
 
     if task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
-      if between_graph:
+      if strategy.between_graph:
         # All jobs run `worker_fn` if between-graph.
-        _run_single_worker(worker_fn, cluster_spec, task_type, task_id,
-                           rpc_layer)
+        _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
+                           task_id, session_config, rpc_layer)
       else:
         # Only one node runs `worker_fn` if in-graph.
-        context = _WorkerContext(cluster_spec, task_type, task_id, rpc_layer)
+        context = _WorkerContext(strategy, cluster_spec, task_type, task_id)
         if context.is_chief:
-          _run_single_worker(worker_fn, cluster_spec, None, None, rpc_layer)
+          _run_single_worker(worker_fn, strategy, cluster_spec, None, None,
+                             session_config, rpc_layer)
         else:
           server.join()
     elif task_type == _TaskType.EVALUATOR:
-      _run_single_worker(worker_fn, cluster_spec, task_type, task_id, rpc_layer)
+      _run_single_worker(eval_fn, eval_strategy, cluster_spec, task_type,
+                         task_id, session_config, rpc_layer)
     else:
       if task_type != _TaskType.PS:
         raise ValueError("Unexpected task_type: %r" % task_type)
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer_lib.py b/tensorflow/python/distribute/distribute_coordinator_context.py
index 87d1866e06..dee65ce883 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer_lib.py
+++ b/tensorflow/python/distribute/distribute_coordinator_context.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""The KFAC optimizer."""
+"""The context retrieval method for distribute coordinator."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.optimizer import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
+import threading
 
-_allowed_symbols = [
-    "KfacOptimizer",
-]
+_worker_context = threading.local()
 
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
+
+def get_current_worker_context():
+  """Returns the current task context."""
+  try:
+    return _worker_context.current
+  except AttributeError:
+    return None
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index 319c29ba2f..5dd57fa134 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for distribute coordinator."""
+"""Tests for Distribute Coordinator."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,8 +20,10 @@ from __future__ import print_function
 
 import contextlib
 import copy
+import json
 import os
 import sys
+import time
 import threading
 import six
 
@@ -37,6 +39,7 @@ except ImportError as _error:
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.distribute import distribute_coordinator
+from tensorflow.python.distribute import distribute_coordinator_context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
@@ -44,20 +47,22 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import monitored_session
+
 
 CHIEF = distribute_coordinator._TaskType.CHIEF
 WORKER = distribute_coordinator._TaskType.WORKER
 PS = distribute_coordinator._TaskType.PS
 EVALUATOR = distribute_coordinator._TaskType.EVALUATOR
 
-SPLIT_CLIENT = distribute_coordinator.CoordinatorMode.SPLIT_CLIENT
+STANDALONE_CLIENT = distribute_coordinator.CoordinatorMode.STANDALONE_CLIENT
 INDEPENDENT_WORKER = distribute_coordinator.CoordinatorMode.INDEPENDENT_WORKER
 
-RUN_STD_SERVER_METHOD = "tensorflow.python.distribute.distribute_coordinator._run_std_server"
-
 NUM_WORKERS = 3
 NUM_PS = 2
 
+original_sys_exit = sys.exit
+
 
 def _bytes_to_str(maybe_bytes):
   if isinstance(maybe_bytes, six.string_types):
@@ -74,6 +79,57 @@ def _strip_protocol(target):
     return target
 
 
+class MockStrategy(object):
+
+  def __init__(self,
+               between_graph=False,
+               should_init=None,
+               should_checkpoint=None,
+               should_save_summary=None):
+    self._between_graph = between_graph
+    self._should_init = should_init
+    self._should_checkpoint = should_checkpoint
+    self._should_save_summary = should_save_summary
+
+  @property
+  def between_graph(self):
+    return self._between_graph
+
+  def configure(self,
+                session_options=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    del session_options, cluster_spec, task_type
+    if self._should_init is None:
+      if task_id == 0:
+        self._should_init = True
+      else:
+        self._should_init = False
+    if self._should_checkpoint is None:
+      if task_id == 0:
+        self._should_checkpoint = True
+      else:
+        self._should_checkpoint = False
+    if self._should_save_summary is None:
+      if task_id == 0:
+        self._should_save_summary = True
+      else:
+        self._should_save_summary = False
+
+  @property
+  def should_init(self):
+    return self._should_init
+
+  @property
+  def should_checkpoint(self):
+    return self._should_checkpoint
+
+  @property
+  def should_save_summary(self):
+    return self._should_save_summary
+
+
 class MockServer(object):
 
   def __init__(self):
@@ -108,6 +164,7 @@ class DistributeCoordinatorTestBase(test.TestCase):
     self._result_correct = 0
     self._lock = threading.Lock()
     self._worker_context = {}
+    self._strategy_property = {}
     self._std_servers = {}
     self._barrier = distribute_coordinator._Barrier(NUM_WORKERS)
 
@@ -142,8 +199,8 @@ class DistributeCoordinatorTestBase(test.TestCase):
       cluster_spec[EVALUATOR] = ["localhost:%s" % portpicker.pick_unused_port()]
     return cluster_spec
 
-  def _in_graph_worker_fn(self):
-    context = distribute_coordinator.get_current_worker_context()
+  def _in_graph_worker_fn(self, strategy):
+    context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
     with self._test_session(target=context.master_target) as sess:
       xs = []
@@ -164,22 +221,23 @@ class DistributeCoordinatorTestBase(test.TestCase):
     if result_value == expected:
       self._result_correct += 1
 
-  def _run_coordinator_in_thread(self, worker_fn, **kwargs):
+  def _run_coordinator_in_thread(self, worker_fn, strategy, **kwargs):
     t = threading.Thread(
         target=distribute_coordinator.run_distribute_coordinator,
-        args=(worker_fn,),
+        args=(worker_fn, strategy),
         kwargs=kwargs)
     t.start()
     return t
 
-  def _run_multiple_coordinator_in_threads(self, worker_fn, cluster_spec,
-                                           **kwargs):
+  def _run_multiple_coordinator_in_threads(self, worker_fn, strategy,
+                                           cluster_spec, **kwargs):
     threads = {}
     for task_type in cluster_spec.keys():
       threads[task_type] = []
       for task_id in range(len(cluster_spec[task_type])):
         t = self._run_coordinator_in_thread(
             worker_fn,
+            strategy,
             cluster_spec=cluster_spec,
             task_type=task_type,
             task_id=task_id,
@@ -187,8 +245,8 @@ class DistributeCoordinatorTestBase(test.TestCase):
         threads[task_type].append(t)
     return threads
 
-  def _between_graph_worker_fn(self):
-    context = distribute_coordinator.get_current_worker_context()
+  def _between_graph_worker_fn(self, strategy):
+    context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
     with self._test_session(target=context.master_target) as sess:
       with ops.device("/job:ps/task:0"):
@@ -234,14 +292,50 @@ class DistributeCoordinatorTestBase(test.TestCase):
         with self._lock:
           self._result_correct += 1
 
-  def _dump_worker_context(self):
+  def _between_graph_with_monitored_session(self, strategy):
+    context = distribute_coordinator_context.get_current_worker_context()
+    self.assertTrue(context is not None)
+    with ops.device("/job:ps/task:0"):
+      # TODO(yuefengz): investigate why not using resource variable will make
+      # the test flaky.
+      x = variable_scope.get_variable("x", initializer=10.0, use_resource=True)
+    with ops.device("/job:ps/task:1"):
+      y = variable_scope.get_variable("y", initializer=20.0, use_resource=True)
+
+    x_add = x.assign_add(2.0)
+    y_sub = y.assign_sub(2.0)
+    train_op = control_flow_ops.group([x_add, y_sub])
+
+    # The monitored session will run init or ready ops.
+    with monitored_session.MonitoredSession() as sess:
+      sess.run(train_op)
+
+      # Synchronize workers after one step to make sure they all have finished
+      # training.
+      if context.has_barrier:
+        context.wait_for_other_workers()
+      else:
+        self._barrier.wait()
+
+      x_val, y_val = sess.run([x, y])
+
+    self.assertEqual(x_val, 16.0)
+    self.assertEqual(y_val, 14.0)
+    if x_val == 16.0 and y_val == 14.0:
+      with self._lock:
+        self._result_correct += 1
+
+  def _dump_worker_context(self, strategy):
     """Dumps the propoerties of each worker context.
 
     It dumps the context properties to a dict mapping from task_type to a list
     of tuples of master_target, num_workers, is_chief and distribute_mode, where
     the list is indexed by the task_id.
+
+    Args:
+      strategy: a `DistributionStrategy` object.
     """
-    context = distribute_coordinator.get_current_worker_context()
+    context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
     task_type = str(context.task_type)
     task_id = context.task_id or 0
@@ -255,12 +349,32 @@ class DistributeCoordinatorTestBase(test.TestCase):
                                                   context.is_chief,
                                                   context.distributed_mode)
 
+  def _dump_strategy_property(self, strategy):
+    context = distribute_coordinator_context.get_current_worker_context()
+    self.assertTrue(context is not None)
+
+    self.assertEqual(context._strategy.should_init, strategy.should_init)
+    self.assertEqual(context.should_checkpoint, strategy.should_checkpoint)
+    self.assertEqual(context.should_save_summary, strategy.should_save_summary)
+
+    task_type = str(context.task_type)
+    task_id = context.task_id or 0
+    with self._lock:
+      if task_type not in self._strategy_property:
+        self._strategy_property[task_type] = []
+      while len(self._strategy_property[task_type]) <= task_id:
+        self._strategy_property[task_type].append(None)
+      self._strategy_property[task_type][task_id] = (
+          context._strategy.should_init, context.should_checkpoint,
+          context.should_save_summary)
+
   def _run_mock_std_server(self,
                            session_config=None,
                            cluster_spec=None,
                            task_type=None,
                            task_id=None,
-                           rpc_layer=None):
+                           rpc_layer=None,
+                           environment=None):
     task_type = str(task_type)
     task_id = task_id or 0
     with self._lock:
@@ -274,22 +388,32 @@ class DistributeCoordinatorTestBase(test.TestCase):
     return server
 
 
-class DistributeCoordinatorTestSplitMode(DistributeCoordinatorTestBase):
+class DistributeCoordinatorTestStandaloneMode(DistributeCoordinatorTestBase):
 
-  def testInGraphSplitMode(self):
-    """Test it runs in-graph replication in split client mode."""
+  def testInGraphStandaloneMode(self):
+    """Test it runs in-graph replication in standalone client mode."""
     distribute_coordinator.run_distribute_coordinator(
         self._in_graph_worker_fn,
-        cluster_spec=self._cluster_spec,
-        between_graph=False)
+        MockStrategy(between_graph=False),
+        cluster_spec=self._cluster_spec)
     self.assertEqual(self._result_correct, 1)
 
   def testBetweenGraph(self):
-    """Test it runs between-graph replication in split client mode."""
+    """Test it runs between-graph replication in standalone client mode."""
     distribute_coordinator.run_distribute_coordinator(
         self._between_graph_worker_fn,
-        cluster_spec=self._cluster_spec,
-        between_graph=True)
+        MockStrategy(between_graph=True),
+        cluster_spec=self._cluster_spec)
+
+    # Each finished worker will increment self._result_correct.
+    self.assertEqual(self._result_correct, NUM_WORKERS)
+
+  def testBetweenGraphWithMonitoredSession(self):
+    """Test monitored session in standalone client mode."""
+    distribute_coordinator.run_distribute_coordinator(
+        self._between_graph_with_monitored_session,
+        MockStrategy(between_graph=True),
+        cluster_spec=self._cluster_spec)
 
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
@@ -298,8 +422,8 @@ class DistributeCoordinatorTestSplitMode(DistributeCoordinatorTestBase):
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
         self._dump_worker_context,
-        cluster_spec=self._cluster_spec,
-        between_graph=True)
+        MockStrategy(between_graph=True),
+        cluster_spec=self._cluster_spec)
 
     # There is only one type of task and there three such tasks.
     self.assertEqual(len(self._worker_context), 1)
@@ -318,12 +442,30 @@ class DistributeCoordinatorTestSplitMode(DistributeCoordinatorTestBase):
         self._worker_context[WORKER][2],
         (_bytes_to_str(self._workers[2].target), NUM_WORKERS, False, True))
 
+  def testBetweenGraphStrategyProperties(self):
+    # Dumps properties of the strategy objects.
+    distribute_coordinator.run_distribute_coordinator(
+        self._dump_strategy_property,
+        MockStrategy(between_graph=True, should_init=True),
+        cluster_spec=self._cluster_spec)
+
+    # There is only one type of task and there three such tasks.
+    self.assertEqual(len(self._strategy_property), 1)
+    self.assertTrue(WORKER in self._strategy_property)
+    self.assertEqual(len(self._strategy_property[WORKER]), NUM_WORKERS)
+
+    # Check whether each task has the right properties of should_init,
+    # should_checkpoint and should_save_summary.
+    self.assertEqual(self._strategy_property[WORKER][0], (True, True, True))
+    self.assertEqual(self._strategy_property[WORKER][1], (True, False, False))
+    self.assertEqual(self._strategy_property[WORKER][2], (True, False, False))
+
   def testInGraphContext(self):
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
         self._dump_worker_context,
-        cluster_spec=self._cluster_spec,
-        between_graph=False)
+        MockStrategy(between_graph=False),
+        cluster_spec=self._cluster_spec)
 
     # There is only a "None" task in the dumped task context.
     self.assertEqual(len(self._worker_context), 1)
@@ -339,7 +481,9 @@ class DistributeCoordinatorTestSplitMode(DistributeCoordinatorTestBase):
   def testLocalContext(self):
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
-        self._dump_worker_context, cluster_spec=None, between_graph=True)
+        self._dump_worker_context,
+        MockStrategy(between_graph=False),
+        cluster_spec=None)
 
     # There is only a "None" task.
     self.assertEqual(len(self._worker_context), 1)
@@ -348,7 +492,7 @@ class DistributeCoordinatorTestSplitMode(DistributeCoordinatorTestBase):
 
     # Check whether each task has the right master_target, num_workers, is_chief
     # and distributed_mode.
-    self.assertEqual(self._worker_context["None"][0], ("local", 0, True, False))
+    self.assertEqual(self._worker_context["None"][0], ("", 0, True, False))
 
   def testBetweenGraphContextWithChief(self):
     # Adds a chief node, so there are NUM_WORKERS + 1 workers in total.
@@ -358,8 +502,8 @@ class DistributeCoordinatorTestSplitMode(DistributeCoordinatorTestBase):
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
         self._dump_worker_context,
+        MockStrategy(between_graph=True),
         cluster_spec=cluster_spec,
-        between_graph=True,
         rpc_layer="grpc")
 
     # There are one CHIEF and three workers.
@@ -391,8 +535,8 @@ class DistributeCoordinatorTestSplitMode(DistributeCoordinatorTestBase):
     # Dumps the task contexts to the self._worker_context dict.
     distribute_coordinator.run_distribute_coordinator(
         self._dump_worker_context,
+        MockStrategy(between_graph=False),
         cluster_spec=cluster_spec,
-        between_graph=False,
         rpc_layer=None)
 
     # There are one "None" task and one EVALUATOR task.
@@ -417,8 +561,8 @@ class DistributeCoordinatorTestInpendentWorkerMode(
     cluster_spec = self._create_cluster_spec(num_workers=NUM_WORKERS)
     threads = self._run_multiple_coordinator_in_threads(
         self._in_graph_worker_fn,
+        MockStrategy(between_graph=False),
         cluster_spec,
-        between_graph=False,
         mode=INDEPENDENT_WORKER)
     threads[WORKER][0].join()
     self.assertEqual(self._result_correct, 1)
@@ -428,8 +572,22 @@ class DistributeCoordinatorTestInpendentWorkerMode(
         num_workers=NUM_WORKERS, num_ps=NUM_PS)
     threads = self._run_multiple_coordinator_in_threads(
         self._between_graph_worker_fn,
+        MockStrategy(between_graph=True),
+        cluster_spec,
+        mode=INDEPENDENT_WORKER)
+    for task_id in range(NUM_WORKERS):
+      threads[WORKER][task_id].join()
+
+    # Each finished worker will increment self._result_correct.
+    self.assertEqual(self._result_correct, NUM_WORKERS)
+
+  def testBetweenGraphWithMonitoredSession(self):
+    cluster_spec = self._create_cluster_spec(
+        num_workers=NUM_WORKERS, num_ps=NUM_PS)
+    threads = self._run_multiple_coordinator_in_threads(
+        self._between_graph_with_monitored_session,
+        MockStrategy(between_graph=True),
         cluster_spec,
-        between_graph=True,
         mode=INDEPENDENT_WORKER)
     for task_id in range(NUM_WORKERS):
       threads[WORKER][task_id].join()
@@ -444,9 +602,9 @@ class DistributeCoordinatorTestInpendentWorkerMode(
                                 self._run_mock_std_server):
       threads = self._run_multiple_coordinator_in_threads(
           self._dump_worker_context,
+          MockStrategy(between_graph=True),
           cluster_spec,
           mode=INDEPENDENT_WORKER,
-          between_graph=True,
           rpc_layer=None)
       for task_id in range(NUM_WORKERS):
         threads[WORKER][task_id].join()
@@ -476,6 +634,31 @@ class DistributeCoordinatorTestInpendentWorkerMode(
     self.assertFalse(self._std_servers[WORKER][1].joined)
     self.assertFalse(self._std_servers[WORKER][2].joined)
 
+  def testBetweenGraphStrategyProperties(self):
+    cluster_spec = self._create_cluster_spec(num_workers=NUM_WORKERS)
+    # Dumps properties of the strategy objects.
+    with test.mock.patch.object(distribute_coordinator, "_run_std_server",
+                                self._run_mock_std_server):
+      threads = self._run_multiple_coordinator_in_threads(
+          self._dump_strategy_property,
+          MockStrategy(between_graph=True, should_init=True),
+          cluster_spec,
+          mode=INDEPENDENT_WORKER,
+          rpc_layer=None)
+      for task_id in range(NUM_WORKERS):
+        threads[WORKER][task_id].join()
+
+    # There is only one type of task and there three such tasks.
+    self.assertEqual(len(self._strategy_property), 1)
+    self.assertTrue(WORKER in self._strategy_property)
+    self.assertEqual(len(self._strategy_property[WORKER]), NUM_WORKERS)
+
+    # Check whether each task has the right properties of should_init,
+    # should_checkpoint and should_save_summary.
+    self.assertEqual(self._strategy_property[WORKER][0], (True, True, True))
+    self.assertEqual(self._strategy_property[WORKER][1], (True, False, False))
+    self.assertEqual(self._strategy_property[WORKER][2], (True, False, False))
+
   def testInGraphContext(self):
     cluster_spec = self._create_cluster_spec(num_workers=NUM_WORKERS)
     # Dumps the task contexts and std server arguments.
@@ -483,9 +666,9 @@ class DistributeCoordinatorTestInpendentWorkerMode(
                                 self._run_mock_std_server):
       threads = self._run_multiple_coordinator_in_threads(
           self._dump_worker_context,
+          MockStrategy(between_graph=False),
           cluster_spec,
           mode=INDEPENDENT_WORKER,
-          between_graph=False,
           rpc_layer=None)
       for task_id in range(NUM_WORKERS):
         threads[WORKER][task_id].join()
@@ -519,9 +702,9 @@ class DistributeCoordinatorTestInpendentWorkerMode(
                                 self._run_mock_std_server):
       threads = self._run_multiple_coordinator_in_threads(
           self._dump_worker_context,
+          MockStrategy(between_graph=False),
           cluster_spec,
           mode=INDEPENDENT_WORKER,
-          between_graph=False,
           rpc_layer=None)
       for task_id in range(NUM_WORKERS):
         threads[WORKER][task_id].join()
@@ -552,6 +735,63 @@ class DistributeCoordinatorTestInpendentWorkerMode(
     self.assertTrue(self._std_servers[WORKER][2].joined)
     self.assertFalse(self._std_servers[EVALUATOR][0].joined)
 
+  def testRunStdServerInGoogleEnvironment(self):
+    cluster_spec = {"worker": ["fake_worker"], "ps": ["localhost:0"]}
+    tf_config = {"cluster": cluster_spec, "environment": "google"}
+
+    joined = [False]
+
+    def _fake_sleep(_):
+      joined[0] = True
+      original_sys_exit(0)
+
+    def _thread_fn(cluster_spec):
+      distribute_coordinator.run_distribute_coordinator(
+          None,
+          None,
+          mode=INDEPENDENT_WORKER,
+          cluster_spec=cluster_spec,
+          task_type="ps",
+          task_id=0)
+
+    with test.mock.patch.dict(
+        "os.environ",
+        {"TF_CONFIG": json.dumps(tf_config)}), test.mock.patch.object(
+            time, "sleep", _fake_sleep):
+      t = threading.Thread(target=_thread_fn, args=(cluster_spec,))
+      t.start()
+      t.join()
+    self.assertTrue(joined[0])
+
+  def testRpcLayerEnvironmentVariable(self):
+    cluster_spec = {"worker": ["fake_worker"], "ps": ["fake_ps"]}
+    tf_config = {"cluster": cluster_spec, "rpc_layer": "cake"}
+
+    rpc_layer_from_coordinator = [None]
+
+    def _run_mock_server(cluster_spec=None,
+                         task_type=None,
+                         task_id=None,
+                         session_config=None,
+                         rpc_layer=None,
+                         environment=None):
+      del cluster_spec, task_type, task_id, session_config, environment
+      rpc_layer_from_coordinator[0] = rpc_layer
+      return MockServer()
+
+    with test.mock.patch.dict(
+        "os.environ",
+        {"TF_CONFIG": json.dumps(tf_config)}), test.mock.patch.object(
+            distribute_coordinator, "_run_std_server", _run_mock_server):
+      distribute_coordinator.run_distribute_coordinator(
+          None,
+          None,
+          mode=INDEPENDENT_WORKER,
+          cluster_spec=cluster_spec,
+          task_type="ps",
+          task_id=0)
+    self.assertEqual(rpc_layer_from_coordinator[0], "cake")
+
 
 if __name__ == "__main__":
   # TODO(yuefengz): find a smart way to terminite std server threads.
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
new file mode 100644
index 0000000000..202e19c420
--- /dev/null
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -0,0 +1,264 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Training utilities for Estimator to use Distribute Coordinator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import six
+
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import server_lib
+
+# pylint: disable=protected-access
+CHIEF = dc._TaskType.CHIEF
+EVALUATOR = dc._TaskType.EVALUATOR
+PS = dc._TaskType.PS
+WORKER = dc._TaskType.WORKER
+
+# pylint: enable=protected-access
+
+
+def _count_ps(cluster_spec):
+  """Counts the number of parameter servers in cluster_spec."""
+  if not cluster_spec:
+    raise RuntimeError(
+        'Internal error: `_count_ps` does not expect empty cluster_spec.')
+
+  return len(cluster_spec.as_dict().get(PS, []))
+
+
+def _count_worker(cluster_spec, chief_task_type):
+  """Counts the number of workers (including chief) in cluster_spec."""
+  if not cluster_spec:
+    raise RuntimeError(
+        'Internal error: `_count_worker` does not expect empty cluster_spec.')
+
+  return (len(cluster_spec.as_dict().get(WORKER, [])) + len(
+      cluster_spec.as_dict().get(chief_task_type, [])))
+
+
+def _get_global_id(cluster_spec, task_type, task_id, chief_task_type):
+  """Returns the global id of the given task type in a cluster."""
+  if not task_type:
+    return 0
+
+  # Sort task names in cluster by "chief"/"master", "evaluator", "worker"
+  # and "ps". More details can be found at the documentation of
+  # @{tf.estimator.RunConfig.global_id_in_cluster}.
+  task_type_ordered_list = []
+  if chief_task_type in cluster_spec.jobs:
+    task_type_ordered_list = [chief_task_type]
+  task_type_ordered_list.extend([
+      t for t in sorted(cluster_spec.jobs) if t != chief_task_type and t != PS
+  ])
+  if PS in cluster_spec.jobs:
+    task_type_ordered_list.append(PS)
+
+  # Find the right gloabl_id for current task.
+  next_global_id = 0
+  for t in task_type_ordered_list:
+    if t == task_type:
+      return next_global_id + task_id
+    # `cluster_spec.job_tasks` returns all task addresses of type `t`.
+    next_global_id += len(cluster_spec.job_tasks(t))
+
+  # It is unexpected that it passes through all task_types in
+  # `task_type_ordered_list`.
+  raise RuntimeError('Internal Error: `task_type` ({}) is not in '
+                     'cluster_spec ({}).'.format(task_type, cluster_spec))
+
+
+def _init_run_config_from_worker_context(config, worker_context):
+  """Initializes run config from distribute coordinator's worker context."""
+
+  # pylint: disable=protected-access
+  config._service = None
+  config._cluster_spec = worker_context.cluster_spec
+  config._task_type = worker_context.task_type
+  config._task_id = worker_context.task_id
+  config._evaluation_master = worker_context.master_target
+  config._master = worker_context.master_target
+  config._is_chief = worker_context.is_chief
+
+  if config._cluster_spec:
+    # Distributed mode.
+    if config._task_type != EVALUATOR:
+
+      config._num_ps_replicas = _count_ps(config._cluster_spec)
+      config._num_worker_replicas = _count_worker(
+          config._cluster_spec, chief_task_type=CHIEF)
+      config._global_id_in_cluster = _get_global_id(
+          config._cluster_spec,
+          config._task_type,
+          config._task_id,
+          chief_task_type=CHIEF)
+    else:
+      # Evaluator task should not be aware of the other tasks.
+      config._cluster_spec = server_lib.ClusterSpec({})
+      config._num_ps_replicas = 0
+      config._num_worker_replicas = 0
+      config._global_id_in_cluster = None  # undefined
+  else:
+    # Local mode.
+    config._global_id_in_cluster = 0
+    config._num_ps_replicas = 0
+    config._num_worker_replicas = 1
+
+
+def init_run_config(config, tf_config):
+  """Initializes RunConfig for distribution strategies."""
+  # pylint: disable=protected-access
+  if (config._experimental_distribute and
+      config._experimental_distribute.train_distribute):
+    if config._train_distribute:
+      raise ValueError('Either `train_distribute` or'
+                       '`experimental_distribute.train_distribute` can be set.')
+    config._train_distribute = config._experimental_distribute.train_distribute
+
+  if (config._experimental_distribute and
+      config._experimental_distribute.eval_distribute):
+    if config._eval_distribute:
+      raise ValueError('Either `eval_distribute` or'
+                       '`experimental_distribute.eval_distribute` can be set.')
+    config._eval_distribute = config._experimental_distribute.eval_distribute
+
+  cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {}))
+  config._init_distributed_setting_from_environment_var({})
+
+  # Use distribute coordinator with STANDALONE_CLIENT mode if
+  # `experimental_distribute.remote_cluster` is set.
+  if (config._train_distribute and config._experimental_distribute and
+      config._experimental_distribute.remote_cluster):
+    if tf_config:
+      raise ValueError('Cannot set both TF_CONFIG environment variable and '
+                       '`experimental_distribute.remote_cluster`')
+    config._distribute_coordinator_mode = dc.CoordinatorMode.STANDALONE_CLIENT
+    config._cluster_spec = config._experimental_distribute.remote_cluster
+    logging.info('RunConfig initialized for Distribute Coordinator with '
+                 'STANDALONE_CLIENT mode')
+    return
+
+  # Don't use distribute coordinator if it is local training or cluster has a
+  # MASTER job or `train_distribute` is not specifed.
+  if (not tf_config or 'master' in cluster_spec.jobs or
+      not config._train_distribute):
+    config._distribute_coordinator_mode = None
+    config._init_distributed_setting_from_environment_var(tf_config)
+    config._maybe_overwrite_session_config_for_distributed_training()
+    logging.info('Not using Distribute Coordinator.')
+    return
+
+  # Use distribute coordinator with INDEPENDENT_WORKER mode otherwise.
+  assert tf_config
+
+  # Set the cluster_spec only since the distributed setting will come from
+  # distribute coordinator.
+  config._cluster_spec = cluster_spec
+  config._distribute_coordinator_mode = dc.CoordinatorMode.INDEPENDENT_WORKER
+  logging.info('RunConfig initialized for Distribute Coordinator with '
+               'INDEPENDENT_WORKER mode')
+
+
+def should_run_distribute_coordinator(config):
+  """Checks the config to see whether to run distribute coordinator."""
+  # pylint: disable=protected-access
+  if (not hasattr(config, '_distribute_coordinator_mode') or
+      config._distribute_coordinator_mode is None):
+    return False
+  if (not isinstance(config._distribute_coordinator_mode, six.string_types) or
+      config._distribute_coordinator_mode not in [
+          dc.CoordinatorMode.STANDALONE_CLIENT,
+          dc.CoordinatorMode.INDEPENDENT_WORKER
+      ]):
+    logging.warning('Unexpected distribute_coordinator_mode: %r',
+                    config._distribute_coordinator_mode)
+    return False
+  if not config.cluster_spec:
+    logging.warning('Running `train_and_evaluate` locally, ignoring '
+                    '`experimental_distribute_coordinator_mode`.')
+    return False
+  return True
+
+
+def train_and_evaluate(estimator, train_spec, eval_spec, executor_cls):
+  """Run distribute coordinator for Estimator's `train_and_evaluate`.
+
+  Args:
+    estimator: An `Estimator` instance to train and evaluate.
+    train_spec: A `TrainSpec` instance to specify the training specification.
+    eval_spec: A `EvalSpec` instance to specify the evaluation and export
+      specification.
+    executor_cls: the evaluation executor class of Estimator.
+
+  Raises:
+    ValueError: if `distribute_coordinator_mode` is None in RunConfig.
+  """
+  run_config = estimator.config
+  if not run_config._distribute_coordinator_mode:  # pylint: disable=protected-access
+    raise ValueError(
+        'Distribute coordinator mode is not specified in `RunConfig`.')
+
+  def _worker_fn(strategy):
+    """Function for worker task."""
+    local_estimator = copy.deepcopy(estimator)
+    # pylint: disable=protected-access
+    local_estimator._config._train_distribute = strategy
+    _init_run_config_from_worker_context(
+        local_estimator._config, dc_context.get_current_worker_context())
+    local_estimator._train_distribution = strategy
+    # pylint: enable=protected-access
+
+    local_estimator.train(
+        input_fn=train_spec.input_fn,
+        max_steps=train_spec.max_steps,
+        hooks=list(train_spec.hooks))
+
+  def _eval_fn(strategy):
+    """Function for evaluator task."""
+    local_estimator = copy.deepcopy(estimator)
+    # pylint: disable=protected-access
+    local_estimator._config._eval_distribute = strategy
+    _init_run_config_from_worker_context(
+        local_estimator._config, dc_context.get_current_worker_context())
+    local_estimator._eval_distribution = strategy
+
+    executor = executor_cls(local_estimator, train_spec, eval_spec)
+    executor._start_continuous_evaluation()
+    # pylint: enable=protected-access
+
+  # pylint: disable=protected-access
+  if (run_config._distribute_coordinator_mode ==
+      dc.CoordinatorMode.STANDALONE_CLIENT):
+    cluster_spec = run_config.cluster_spec
+    assert cluster_spec
+  else:
+    # The cluster_spec comes from TF_CONFIG environment variable if it is
+    # INDEPENDENT_WORKER mode.
+    cluster_spec = None
+
+  dc.run_distribute_coordinator(
+      _worker_fn,
+      run_config.train_distribute,
+      _eval_fn,
+      run_config.eval_distribute,
+      mode=run_config._distribute_coordinator_mode,
+      cluster_spec=cluster_spec,
+      session_config=run_config.session_config)
diff --git a/tensorflow/python/distribute/multi_worker_util.py b/tensorflow/python/distribute/multi_worker_util.py
new file mode 100644
index 0000000000..360733eff6
--- /dev/null
+++ b/tensorflow/python/distribute/multi_worker_util.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for multi-worker distribution strategies."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.training import server_lib
+
+
+def normalize_cluster_spec(cluster_spec):
+  """Makes `cluster_spec` into a `ClusterSpec` object.
+
+  Args:
+    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+      cluster configurations.
+
+  Returns:
+    a `ClusterSpec` object.
+
+  Raises:
+    ValueError: if `cluster_spec` is not a dict or a `ClusterSpec` or a
+      `ClusterDef`.
+  """
+  if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
+    return server_lib.ClusterSpec(cluster_spec)
+  elif not isinstance(cluster_spec, server_lib.ClusterSpec):
+    raise ValueError(
+        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
+        "`tf.train.ClusterDef` object")
+  return cluster_spec
+
+
+def is_chief(cluster_spec, task_type, task_id):
+  """Returns whether the given task is chief in the cluster.
+
+  Args:
+    cluster_spec: a dict, `ClusterDef` or `ClusterSpec` object specifying the
+      cluster configurations.
+    task_type: the task type in the cluster.
+    task_id: the task id in the cluster.
+
+  Returns:
+    a boolean indicating whether the given task is chief.
+
+  Raises:
+    ValueError: if `task_type` is not in the `cluster_spec` or `task_id` exceeds
+      the maximum id of the `task_type`.
+  """
+  cluster_spec = normalize_cluster_spec(cluster_spec)
+  if task_type not in cluster_spec.jobs:
+    raise ValueError(
+        "The task_type \"%s\" is not in the `cluster_spec`." % task_type)
+  if task_id >= cluster_spec.num_tasks(task_type):
+    raise ValueError("The `task_id` %d exceeds the maximum id of %s." % (
+        task_id, task_type))
+
+  if task_type == "chief":
+    return True
+
+  # If chief not in the cluster_spec, use the first worker as chief. This is
+  # common in CollectiveAllReduceStrategy.
+  if ("chief" not in cluster_spec.jobs and task_type == "worker" and
+      task_id == 0):
+    return True
+  return False
diff --git a/tensorflow/python/distribute/multi_worker_util_test.py b/tensorflow/python/distribute/multi_worker_util_test.py
new file mode 100644
index 0000000000..bdc49725c7
--- /dev/null
+++ b/tensorflow/python/distribute/multi_worker_util_test.py
@@ -0,0 +1,107 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for multi_worker_util."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.eager import test
+from tensorflow.python.training import server_lib
+
+
+class NormalizeClusterSpecTest(test.TestCase):
+
+  def assert_same_cluster(self, lhs, rhs):
+    self.assertEqual(
+        server_lib.ClusterSpec(lhs).as_dict(),
+        server_lib.ClusterSpec(rhs).as_dict())
+
+  def testDictAsInput(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assert_same_cluster(
+        cluster_spec, multi_worker_util.normalize_cluster_spec(cluster_spec))
+
+  def testClusterDefAsInput(self):
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = "chief"
+    job.tasks[0] = "127.0.0.1:1234"
+
+    job = cluster_def.job.add()
+    job.name = "worker"
+    job.tasks[0] = "127.0.0.1:8964"
+    job.tasks[1] = "127.0.0.1:2333"
+
+    job = cluster_def.job.add()
+    job.name = "ps"
+    job.tasks[0] = "127.0.0.1:1926"
+    job.tasks[1] = "127.0.0.1:3141"
+
+    self.assert_same_cluster(
+        cluster_def, multi_worker_util.normalize_cluster_spec(cluster_def))
+
+  def testClusterSpecAsInput(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    })
+    self.assert_same_cluster(
+        cluster_spec, multi_worker_util.normalize_cluster_spec(cluster_spec))
+
+  def testUnexpectedInput(self):
+    cluster_spec = ["127.0.0.1:8964", "127.0.0.1:2333"]
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
+        "`tf.train.ClusterDef` object"):
+      multi_worker_util.normalize_cluster_spec(cluster_spec)
+
+
+class IsChiefTest(test.TestCase):
+
+  def testClusterWithChief(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertTrue(multi_worker_util.is_chief(cluster_spec, "chief", 0))
+    self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 0))
+
+  def testClusterWithoutChief(self):
+    cluster_spec = {"worker": ["127.0.0.1:8964", "127.0.0.1:2333"]}
+    self.assertTrue(multi_worker_util.is_chief(cluster_spec, "worker", 0))
+    self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 1))
+
+    with self.assertRaisesRegexp(
+        ValueError, "The task_type \"chief\" is not in the `cluster_spec`."):
+      multi_worker_util.is_chief(cluster_spec, "chief", 0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "The `task_id` 2 exceeds the maximum id of worker."):
+      multi_worker_util.is_chief(cluster_spec, "worker", 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index de93b1e2e1..6f48d38b58 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -47,7 +47,6 @@ py_library(
         ":core",
         ":execute",
         ":function",
-        ":graph_callable",
         ":graph_only_ops",
         ":tape",
         ":test",
@@ -238,6 +237,7 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":graph_only_ops",
+        "//tensorflow/python:cond_v2_impl",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
@@ -254,41 +254,6 @@ py_library(
 )
 
 py_library(
-    name = "graph_callable",
-    srcs = ["graph_callable.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:function",
-        "//tensorflow/python/eager:tape",
-    ],
-)
-
-py_test(
-    name = "graph_callable_test",
-    srcs = ["graph_callable_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":backprop",
-        ":graph_callable",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:function",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
-py_library(
     name = "backprop",
     srcs = ["backprop.py"],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 553f761a14..7978383e55 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
@@ -180,10 +181,10 @@ def implicit_val_and_grad(f):
   ```
 
   Args:
-   f: function to be differentiated. If `f` returns a scalar, this scalar will
-     be differentiated. If `f` returns a tensor or list of tensors, by default
-     a scalar will be computed by adding all their values to produce a single
-     scalar.
+    f: function to be differentiated. If `f` returns a scalar, this scalar will
+      be differentiated. If `f` returns a tensor or list of tensors, by default
+      a scalar will be computed by adding all their values to produce a single
+      scalar.
 
   Returns:
     A function which, when called, returns a tuple pair.
@@ -255,10 +256,10 @@ def implicit_grad(f):
   ```
 
   Args:
-   f: function to be differentiated. If `f` returns a scalar, this scalar will
-     be differentiated. If `f` returns a tensor or list of tensors, by default
-     a scalar will be computed by adding all their values to produce a single
-     scalar.
+    f: function to be differentiated. If `f` returns a scalar, this scalar will
+      be differentiated. If `f` returns a tensor or list of tensors, by default
+      a scalar will be computed by adding all their values to produce a single
+      scalar.
 
   Returns:
     A function which, when called, returns a list of (gradient, variable) pairs.
@@ -343,24 +344,24 @@ def gradients_function(f, params=None):
   Note that only tensors with real or complex dtypes are differentiable.
 
   Args:
-   f: function to be differentiated. If `f` returns a scalar, this scalar will
-     be differentiated. If `f` returns a tensor or list of tensors, by default
-     a scalar will be computed by adding all their values to produce a single
-     scalar. If desired, the tensors can be elementwise multiplied by the
-     tensors passed as the `dy` keyword argument to the returned gradient
-     function.
-   params: list of parameter names of f or list of integers indexing the
-     parameters with respect to which we'll differentiate. Passing None
-     differentiates with respect to all parameters.
+    f: function to be differentiated. If `f` returns a scalar, this scalar will
+      be differentiated. If `f` returns a tensor or list of tensors, by default
+      a scalar will be computed by adding all their values to produce a single
+      scalar. If desired, the tensors can be elementwise multiplied by the
+      tensors passed as the `dy` keyword argument to the returned gradient
+      function.
+    params: list of parameter names of f or list of integers indexing the
+      parameters with respect to which we'll differentiate. Passing None
+      differentiates with respect to all parameters.
 
   Returns:
     function which, when called, returns the value of f and the gradient
-    of f with respect to all of `params`. The function takes an extra optional
-    keyword argument "dy". Setting it allows computation of vector jacobian
+    of `f` with respect to all of `params`. The function takes an extra optional
+    keyword argument `dy`. Setting it allows computation of vector jacobian
     products for vectors other than the vector of ones.
 
   Raises:
-   ValueError: if the params are not all strings or all integers.
+    ValueError: if the params are not all strings or all integers.
   """
 
   def decorated(*args, **kwds):
@@ -440,23 +441,24 @@ def val_and_grad_function(f, params=None):
   ```
 
   Args:
-   f: function to be differentiated. If `f` returns a scalar, this scalar will
-     be differentiated. If `f` returns a tensor or list of tensors, by default
-     a scalar will be computed by adding all their values to produce a single
-     scalar. If desired, the tensors can be elementwise multiplied by the
-     tensors passed as the `dy` keyword argument to the returned gradient
-     function.
-   params: list of parameter names of f or list of integers indexing the
-     parameters with respect to which we'll differentiate. Passing `None`
-     differentiates with respect to all parameters.
-
-  Returns: function which, when called, returns the value of f and the gradient
-   of f with respect to all of `params`. The function takes an extra optional
-   keyword argument "dy". Setting it allows computation of vector jacobian
-   products for vectors other than the vector of ones.
+    f: function to be differentiated. If `f` returns a scalar, this scalar will
+      be differentiated. If `f` returns a tensor or list of tensors, by default
+      a scalar will be computed by adding all their values to produce a single
+      scalar. If desired, the tensors can be elementwise multiplied by the
+      tensors passed as the `dy` keyword argument to the returned gradient
+      function.
+    params: list of parameter names of f or list of integers indexing the
+      parameters with respect to which we'll differentiate. Passing `None`
+      differentiates with respect to all parameters.
+
+  Returns:
+    function which, when called, returns the value of f and the gradient
+    of f with respect to all of `params`. The function takes an extra optional
+    keyword argument "dy". Setting it allows computation of vector jacobian
+    products for vectors other than the vector of ones.
 
   Raises:
-   ValueError: if the params are not all strings or all integers.
+    ValueError: if the params are not all strings or all integers.
   """
 
   def decorated(*args, **kwds):
@@ -557,7 +559,7 @@ def _aggregate_grads(gradients):
   if len(gradients) == 1:
     return gradients[0]
   if all([isinstance(g, ops.Tensor) for g in gradients]):
-    return math_ops.add_n(gradients)
+    return gen_math_ops.add_n(gradients)
   else:
     assert all([isinstance(g, (ops.Tensor, ops.IndexedSlices))
                 for g in gradients])
@@ -592,7 +594,9 @@ def _num_elements(grad):
 
 
 def _fast_fill(value, shape, dtype):
-  return array_ops.fill(shape, constant_op.constant(value, dtype=dtype))
+  return array_ops.fill(
+      constant_op.constant(shape, dtype=dtypes.int32),
+      constant_op.constant(value, dtype=dtype))
 
 
 def _zeros(shape, dtype):
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 1a78559ac0..a2e8422671 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -77,19 +77,54 @@ class SubclassedKerasModel(keras.Model):
 
   def __init__(self):
     super(SubclassedKerasModel, self).__init__()
-    self.layer = keras.layers.Dense(
+    self.layer_a = keras.layers.Dense(
+        64, kernel_initializer="ones", bias_initializer="zeros")
+    self.layer_b = keras.layers.Dense(
+        128, kernel_initializer="ones", bias_initializer="zeros")
+    self.layer_c = keras.layers.Dense(
+        256, kernel_initializer="ones", bias_initializer="zeros")
+    self.layer_d = keras.layers.Dense(
+        256, kernel_initializer="ones", bias_initializer="zeros")
+    self.layer_e = keras.layers.Dense(
         10, kernel_initializer="ones", bias_initializer="zeros")
 
   def call(self, x):
-    return self.layer(x)
+    x = self.layer_a(x)
+    x = self.layer_b(x)
+    x = self.layer_c(x)
+    x = self.layer_d(x)
+    return self.layer_e(x)
 
 
 def make_keras_model():
-  x = keras.Input(shape=(10,))
-  y = keras.layers.Dense(
-      10, kernel_initializer="ones", bias_initializer="zeros")(
-          x)
-  return keras.Model(inputs=x, outputs=y)
+  model_input = keras.Input(shape=(10,))
+  x = keras.layers.Dense(
+      64, kernel_initializer="ones", bias_initializer="zeros")(model_input)
+  x = keras.layers.Dense(
+      128, kernel_initializer="ones", bias_initializer="zeros")(x)
+  x = keras.layers.Dense(
+      256, kernel_initializer="ones", bias_initializer="zeros")(x)
+  x = keras.layers.Dense(
+      256, kernel_initializer="ones", bias_initializer="zeros")(x)
+  x = keras.layers.Dense(
+      10, kernel_initializer="ones", bias_initializer="zeros")(x)
+  return keras.Model(inputs=model_input, outputs=x)
+
+
+def make_sequential_keras_model():
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(
+      64, kernel_initializer="ones", bias_initializer="zeros",
+      input_shape=(10,)))
+  model.add(keras.layers.Dense(
+      128, kernel_initializer="ones", bias_initializer="zeros"))
+  model.add(keras.layers.Dense(
+      256, kernel_initializer="ones", bias_initializer="zeros"))
+  model.add(keras.layers.Dense(
+      256, kernel_initializer="ones", bias_initializer="zeros"))
+  model.add(keras.layers.Dense(
+      10, kernel_initializer="ones", bias_initializer="zeros"))
+  return model
 
 
 class MicroBenchmarks(test.Benchmark):
@@ -315,6 +350,21 @@ class MicroBenchmarks(test.Benchmark):
     func = lambda: f(m, m, transpose_b)
     self._run(func, num_iters, execution_mode=execution_mode)
 
+  def _benchmark_defun_matmul_forward_backward(self,
+                                               m,
+                                               transpose_b,
+                                               num_iters,
+                                               execution_mode=None):
+    f = function.defun(math_ops.matmul)
+
+    def func():
+      with backprop.GradientTape() as gt:
+        gt.watch(m)
+        y = f(m, m, transpose_b)
+      _ = gt.gradient(y, m)
+
+    self._run(func, num_iters, execution_mode=execution_mode)
+
   def _benchmark_read_variable(self, m, num_iters):
     self._run(m.value, num_iters)
 
@@ -386,6 +436,21 @@ class MicroBenchmarks(test.Benchmark):
           num_iters=self._num_iters_2_by_2,
           execution_mode=context.ASYNC)
 
+  def benchmark_defun_matmul_forward_backward_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_defun_matmul_forward_backward(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_defun_matmul_forward_backward_2_by_2_CPU_async(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_defun_matmul_forward_backward(
+          m,
+          transpose_b=False,
+          num_iters=self._num_iters_2_by_2,
+          execution_mode=context.ASYNC)
+
   def benchmark_tf_matmul_2_by_2_GPU(self):
     if not context.num_gpus():
       return
@@ -638,6 +703,15 @@ class MicroBenchmarks(test.Benchmark):
     assert np.equal(func(), SubclassedKerasModel()(data)).all()
     self._run(func, 30000)
 
+  def benchmark_keras_model_sequential(self):
+    model = make_sequential_keras_model()
+    data = random_ops.random_uniform((10, 10))
+    func = lambda: model(data)
+    # Symmetry with benchmark_keras_model_functional
+    func()
+    assert np.equal(func(), make_keras_model()(data)).all()
+    self._run(func, 30000)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index cc765725a4..cbd6f4cb75 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import pickle
 import threading
 
 import numpy as np
@@ -185,6 +187,17 @@ class TFETest(test_util.TensorFlowTestCase):
         device_count={'GPU': 0}))
     self.assertEquals(0, ctx.num_gpus())
 
+  def testPickle(self):
+    tmp_dir = self.get_temp_dir()
+    fname = os.path.join(tmp_dir, 't.pickle')
+    with open(fname, 'wb') as f:
+      t = constant_op.constant(10.0)
+      pickle.dump(t, f)
+
+    with open(fname, 'rb') as f:
+      t = pickle.load(f)
+      self.assertAllEqual(t.numpy(), 10.0)
+
   def testTensorPlacement(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index f87d88040f..9dc5648861 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -21,12 +21,12 @@ from __future__ import print_function
 
 import collections
 import functools
+import sys
 import threading
 
 import numpy as np
 import six
 
-from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import function_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
@@ -34,20 +34,27 @@ from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import cond_v2_impl
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training import distribute
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
+# This is to avoid a circular dependency with cond_v2_impl
+# (function -> gradients_impl -> control_flow_ops -> cond_v2_impl).
+cond_v2_impl._function = sys.modules[__name__]  # pylint: disable=protected-access
+
 
 def create_substitute_placeholder(value, name, dtype=None):
   """Creates a placeholder for `value` and propagates shape info to it."""
@@ -112,10 +119,6 @@ class CapturingGraph(ops.Graph):
     # for resource tensors.
     self._last_op_using_resource_tensor = {}
 
-  # TODO(apassos) remove once the C API is used by default.
-  def _use_c_api_hack(self):
-    return True
-
   def clear_resource_control_flow_state(self):
     self._last_op_using_resource_tensor = {}
 
@@ -179,12 +182,19 @@ class CapturingGraph(ops.Graph):
         compute_device=compute_device)
 
 
+def _get_device_functions(ctx, graph):
+  """Returns a tuple of device functions representing the device stack."""
+  if ctx.executing_eagerly():
+    return (pydev.merge_device(ctx.device_name),)
+  else:
+    return tuple(graph._device_functions_outer_to_inner)  # pylint: disable=protected-access
+
+
 class FuncGraph(CapturingGraph):
   """Graph representing a function body.
 
   Attributes:
     name: The name of the function.
-
     inputs: Placeholder tensors representing the inputs to this function. The
       tensors are in this FuncGraph. This represents "regular" inputs as well as
       captured inputs (i.e. the values of self.captures), with the regular
@@ -195,16 +205,19 @@ class FuncGraph(CapturingGraph):
       by this function. The Tensors in this structure are the same as those of
       self.outputs. Note that this structure might contain Python `None`s.
     variables: Variables that should be watched during function execution.
+    outer_graph: The graph this function is defined in. May be another FuncGraph
+      or the global default Graph.
     seed: The graph-level random seed.
   """
 
-  def __init__(self, name, graph=None):
+  def __init__(self, name):
     """Construct a new FuncGraph.
 
+    The graph will inherit its graph key, collections, seed, device stack, and
+    distribution strategy stack from the current context or graph.
+
     Args:
       name: the name of the function.
-      graph: if specified, this FuncGraph will inherit its graph key,
-        collections, and seed from `graph`.
     """
     super(FuncGraph, self).__init__()
 
@@ -213,24 +226,33 @@ class FuncGraph(CapturingGraph):
     self.outputs = []
     self.structured_outputs = None
     self.variables = []
+    self.outer_graph = ops.get_default_graph()
 
-    if graph is not None:
-      # Inherit the graph key, since this is used for matching variables in
-      # optimizers.
-      self._graph_key = graph._graph_key  # pylint: disable=protected-access
-
-      # Copy the graph collections to ensure summaries and other things work.
-      # This lets the function access (but not mutate) collections of the
-      # containing graph, such as the global step and the summary writer
-      # collections.
-      for collection in graph.collections:
-        self.get_collection_ref(collection)[:] = graph.get_collection(
-            collection)
-
-      if context.executing_eagerly():
-        self.seed = context.global_seed()
-      else:
-        self.seed = graph.seed
+    graph = self.outer_graph
+
+    if context.executing_eagerly():
+      self.seed = context.global_seed()
+      self._xla_compile = (context.context().device_spec.device_type == "TPU")
+      self._add_device_to_stack(context.context().device_name)
+    else:
+      self.seed = graph.seed
+      self._xla_compile = getattr(graph, "_xla_compile", False)
+      self._device_function_stack = graph._device_function_stack.copy()  # pylint: disable=protected-access
+
+    # TODO(b/112165328, b/112906995): summaries depend on inheriting collections
+    # from the default graph even in eager mode. It'd be nice to not have a
+    # default graph with eager execution, so hopefully this will go away when we
+    # remove collections.
+    # pylint: disable=protected-access
+    self._collections = graph._collections
+    # TODO(b/112906995): distribution strategy depends on inheriting this stack
+    # from the default graph even in eager mode. Maybe it should be part of the
+    # eager context?
+    self._distribution_strategy_stack = graph._distribution_strategy_stack
+    # Inherit the graph key, since this is used for matching variables in
+    # optimizers.
+    self._graph_key = graph._graph_key
+    # pylint: enable=protected-access
 
   def capture(self, tensor, name=None):
     """Calls CapturingGraph.capture and updates self.inputs if necessary."""
@@ -242,77 +264,15 @@ class FuncGraph(CapturingGraph):
 
     return internal_tensor
 
-
-# pylint: disable=invalid-name
-class HelperContext(object):
-  """ControlFlowContext with a customizable AddOp method."""
-
-  def __init__(self, add_op_internal):
-    self._add_op_internal = add_op_internal
-    self._values = set()  # control flow code sometimes updates this.
-
-  def _AddOpInternal(self, op):
-    self._add_op_internal(op)
-
   @property
-  def outer_context(self):
-    return self._outer_context
-
-  def GetWhileContext(self):
-    if self._outer_context:
-      return self._outer_context.GetWhileContext()
-
-  def IsWhileContext(self):
-    return False
-
-  def IsCondContext(self):
-    return False
-
-  def IsXLAContext(self):
-    return False
-
-  def AddOp(self, op):  # pylint: disable=invalid-name
-    self._AddOpInternal(op)
-    if self._outer_context:
-      self._outer_context.AddOp(op)
-
-  def AddName(self, _):
-    pass
-
-  def AddInnerOp(self, op):
-    self._AddOpInternal(op)
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  def AddValue(self, val):
-    if self._outer_context:
-      return self._outer_context.AddValue(val)
-    else:
-      return val
+  def external_captures(self):
+    """External tensors captured by this function."""
+    return list(self.captures.keys())
 
-  def EnterGradientColocation(self, op, gradient_uid):
-    """Start building a gradient colocated with an op."""
-    if self._outer_context:
-      self._outer_context.EnterGradientColocation(op, gradient_uid)
-
-  def ExitGradientColocation(self, op, gradient_uid):
-    """Start building a gradient colocated with an op."""
-    if self._outer_context:
-      self._outer_context.ExitGradientColocation(op, gradient_uid)
-
-  def __enter__(self):
-    # pylint: disable=protected-access
-    self._g = ops.get_default_graph()
-    self._outer_context = self._g._get_control_flow_context()
-    self._g._set_control_flow_context(self)
-    self._nested_contexts = (
-        self._outer_context._nested_contexts
-        if self._outer_context is not None else None)
-    # pylint: enable=protected-access
-
-  def __exit__(self, *_):
-    self._g._set_control_flow_context(self._outer_context)  # pylint: disable=protected-access
-# pylint: enable=invalid-name
+  @property
+  def internal_captures(self):
+    """Placeholders in this function corresponding captured tensors."""
+    return list(self.captures.values())
 
 
 def _forward_name(n):
@@ -335,9 +295,6 @@ def _register(fn):
   context.context().add_function(fn)
 
 
-_xla_compile_attr = "_XlaCompile"
-
-
 # TODO(apassos) get rid of this by splitting framework.function._DefinedFunction
 # so it doesn't have the definition-generating logic and is just a container for
 # an already-defined function.
@@ -350,18 +307,20 @@ class _EagerDefinedFunction(object):
   class may be provided as the value of these `func` attributes.
   """
 
-  def __init__(self, name, graph, operations, inputs, outputs, attrs):
+  def __init__(self, name, graph, inputs, outputs, attrs):
     """Initializes an eager defined function.
 
     Args:
       name: str, the name for the created function.
       graph: Graph, the graph containing the operations in the function
-      operations: list of Operation; the subset of operations in the graph
-        which will be in the function
       inputs: the tensors in the graph to be used as inputs to the function
       outputs: the tensors in the graph which will be outputs to the function
       attrs: dict mapping names of attributes to their AttrValue values
     """
+    operations = [
+        op for op in graph.get_operations()
+        if op not in set(arg.op for arg in inputs)
+    ]
     fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
         graph._c_graph,  # pylint: disable=protected-access
         compat.as_str(name),
@@ -379,7 +338,6 @@ class _EagerDefinedFunction(object):
       # It might be worth creating a convenient way to re-use status.
       pywrap_tensorflow.TF_FunctionSetAttrValueProto(
           fn, compat.as_str(name), serialized)
-    self._xla_compile = _xla_compile_attr in attrs
 
     # TODO(apassos) avoid creating a FunctionDef (specially to grab the
     # signature, but also in general it's nice not to depend on it.
@@ -395,6 +353,7 @@ class _EagerDefinedFunction(object):
     self.signature = function_def.signature
     self._num_outputs = len(self.signature.output_arg)
     self._output_types = [o.type for o in self.signature.output_arg]
+    self._output_shapes = [o.shape for o in outputs]
     self.grad_func_name = None
     self.python_grad_func = None
     self._c_func = c_api_util.ScopedTFFunction(fn)
@@ -415,7 +374,7 @@ class _EagerDefinedFunction(object):
   def stateful_ops(self):
     return self._stateful_ops
 
-  def call(self, ctx, args, output_shapes):
+  def call(self, ctx, args):
     """Calls this function with `args` as inputs.
 
     Function execution respects device annotations only if the function won't
@@ -424,8 +383,6 @@ class _EagerDefinedFunction(object):
     Args:
       ctx: a Context object
       args: a list of arguments to supply this function with.
-      output_shapes: shapes to which outputs should be set; ignored when
-        executing eagerly.
 
     Returns:
       The outputs of the function call.
@@ -433,10 +390,7 @@ class _EagerDefinedFunction(object):
 
     executing_eagerly = ctx.executing_eagerly()
 
-    xla_compile = self._xla_compile or (executing_eagerly and
-                                        ctx.device_spec.device_type == "TPU")
-
-    if xla_compile:
+    if self._graph._xla_compile:  # pylint: disable=protected-access
       # XLA compilation relies upon a custom kernel creator to run functions.
       signature = self.signature
       if executing_eagerly:
@@ -474,16 +428,11 @@ class _EagerDefinedFunction(object):
     if executing_eagerly:
       return outputs
     else:
-      for i, shape in enumerate(output_shapes):
+      for i, shape in enumerate(self._output_shapes):
         outputs[i].set_shape(shape)
       return outputs
 
 
-def _map_sequence_obj_to_idx(sequence):
-  """Maps objs in the sequence from id(obj) to sequence index."""
-  return {id(x): i for i, x in enumerate(sequence)}
-
-
 def _flatten(sequence):
   """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`."""
   # TODO(akshayka): Support `SparseTensor` in a similar fashion.
@@ -500,179 +449,117 @@ def _flatten(sequence):
   return outputs
 
 
-# TODO(akshayka): Perhaps rename to something more appropriate.
-class GraphModeFunction(object):
+class Function(object):
   """Callable object encapsulating a function definition and its gradient.
 
-  `GraphModeFunction` is a callable that encapsulates a function definition and
+  `Function` is a callable that encapsulates a function definition and
   is differentiable under `tf.GradientTape` objects.
   """
 
-  def __init__(self,
-               name,
-               input_placeholders,
-               extra_inputs,
-               graph,
-               operations,
-               outputs,
-               python_func_outputs,
-               output_shapes,
-               variables=None,
-               attrs=None):
-    """Initialize a GraphModeFunction.
+  def __init__(self, func_graph, attrs=None):
+    """Initialize a Function.
 
     Args:
-      name: str the name of the created function
-      input_placeholders: list of placeholder values (tensors) to feed when
-        calling the wrapped function.
-      extra_inputs: Tensor inputs this function definition closed over which
-        are passed as arguments. Need to track so gradients are supported
-        correctly.
-      graph: the Graph from which the operations will be pulled. Used as
-        a context when computing gradients.
-      operations: the subset of Operations in the graph used in the function
-        definition.
-      outputs: a flat list of the Tensors in the graph used as outputs to the
-        function
-      python_func_outputs: a possibly nested python object which will be
-        returned by this function. The Tensors in this structure will be
-        replaced by their corresponding values in outputs. Note that this
-        structure might contain Python `None`s.
-      output_shapes: List of shapes of all tensors in outputs
-      variables: (optional) List of variables to watch during function
-        execution.
+      func_graph: An instance of FuncGraph: the function body to wrap.
       attrs: (optional) dict mapping names of attributes to their AttrValue
         values. Attributes in `attrs` will be included in this function's
         definition.
+
+    Raises:
+      ValueError: If number of input_placeholders is not equal to the number
+        of function inputs.
     """
+    self._func_graph = func_graph
+    self._captured_inputs = list(self._func_graph.captures.keys())
+    self._num_outputs = len(self._func_graph.outputs)
+    self._output_shapes = tuple(
+        output.shape for output in self._func_graph.outputs)
     self._attrs = attrs or {}
-    defined_function = _EagerDefinedFunction(
-        name, graph, operations, input_placeholders, outputs, self._attrs)
-    if len(input_placeholders) != len(defined_function.signature.input_arg):
-      raise ValueError("Internal error: invalid lengths. %s %s" % (
-          len(input_placeholders), len(defined_function.signature.input_arg)))
-    self._input_placeholders = input_placeholders
-    self._extra_inputs = list(extra_inputs)
-    self._graph = graph
-    self._backward_function = None
-    self._func_name = name
-    self._function_def = defined_function
-    self._num_outputs = len(defined_function.signature.output_arg)
-    self._python_func_outputs = python_func_outputs
-    self._python_returns = [python_func_outputs] if isinstance(
-        python_func_outputs,
-        (ops.Tensor, type(None))) else _flatten(python_func_outputs)
-    self._output_shapes = output_shapes
-    self._variables = variables if variables is not None else []
-
-    # Find the variables that are components of something distributed and
-    # put them into a {handle_tensor -> distributed variable object} map.
+    self._device_functions = tuple(
+        self._func_graph._device_functions_outer_to_inner)  # pylint: disable=protected-access
+
+    self._inference_function = _EagerDefinedFunction(
+        _inference_name(self._func_graph.name), self._func_graph,
+        self._func_graph.inputs, self._func_graph.outputs, self._attrs)
+    self._backward_graph_function = None
+
+    # Map holding distributed variables, keyed by resource handle tensors.
     self._distributed_variables = {}
-    strategy = distribute.get_distribution_strategy()
-    for variable in self._variables:
+    strategy = distribution_strategy_context.get_distribution_strategy()
+    for variable in self._func_graph.variables:
       # If variable is not distributed, unwrap returns [variable].
       component_variables = strategy.unwrap(variable)
-      # Only add to the dictionary when the variable is actually distributed,
-      # i.e. more than one component or the component is different from the
-      # variable itself. component_variables cannot be empty.
+      # Only update the dictionary when the variable is actually distributed.
       if (len(component_variables) > 1 or component_variables[0] != variable):
         for component_variable in component_variables:
           self._distributed_variables[component_variable.handle] = variable
 
-  @property
-  def variables(self):
-    return self._variables
+  def __call__(self, *args):
+    """Executes the wrapped function."""
+    ctx = context.context()
+    device_functions = _get_device_functions(ctx, ops.get_default_graph())
+    if device_functions != self._device_functions:
+      raise ValueError(
+          "The current device stack does not match the device stack under "
+          "which the TensorFlow function '%s' was created.\n"
+          "Current device stack: %s\n%s device stack: %s" %
+          (self._inference_function.name, device_functions,
+           self._inference_function.name, self._device_functions))
+
+    for v in self._func_graph.variables:
+      if v.trainable:
+        tape.watch_variable(v)
 
-  def _construct_backprop_function(self):
-    """Constructs the backprop function object for this function."""
-    filtered_outputs = [x for x in self._python_returns if x is not None]
-    # TODO(skyewm): use FuncGraph
-    backwards_graph = CapturingGraph()
-    backwards_graph._graph_key = self._graph._graph_key  # pylint: disable=protected-access
-    for collection in self._graph.collections:
-      backwards_graph.get_collection_ref(
-          collection)[:] = self._graph.get_collection(collection)
-    backwards_graph.seed = self._graph.seed
-    with backwards_graph.as_default():
-      self._out_grad_placeholders = [
-          graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
-      in_gradients = gradients_impl._GradientsHelper(  # pylint: disable=protected-access
-          filtered_outputs,
-          self._input_placeholders,
-          grad_ys=self._out_grad_placeholders,
-          src_graph=self._graph)
-
-    backward_outputs = tuple(
-        grad for grad in _flatten(in_gradients) if grad is not None)
-    output_shapes = tuple(grad.shape for grad in backward_outputs)
-
-    extra_inputs = backwards_graph.captures.keys()
-    extra_placeholders = backwards_graph.captures.values()
-
-    forward_name = _forward_name(self._func_name)
-    # Note: we cannot have placeholder ops in the graph or the TPU compilation
-    # pass fails.
-    placeholder_ops = set([y.op for y in self._input_placeholders])
-    function_ops = [x for x in self._graph.get_operations()
-                    if x not in placeholder_ops]
-    self._forward_fdef = _EagerDefinedFunction(
-        forward_name, self._graph, function_ops,
-        self._input_placeholders, filtered_outputs + list(extra_inputs),
-        self._attrs)
-    all_inputs = self._out_grad_placeholders + list(extra_placeholders)
-    # Excluding input ops from the body as we do not intend to execute these
-    # operations when the function is executed.
-    all_ignored_ops = frozenset(x.op for x in all_inputs)
-    # Enforce a deterministic order of operations in the generated graph. This
-    # means rerunning the function-defining code will always define the same
-    # function, which is useful if we serialize this etc.
-    function_def_ops = tuple(x
-                             for x in sorted(backwards_graph.get_operations(),
-                                             key=lambda x: x.name)
-                             if x not in all_ignored_ops)
-    bname = _backward_name(self._func_name)
-    self._backward_function = GraphModeFunction(
-        bname, all_inputs, [], backwards_graph, function_def_ops,
-        backward_outputs, in_gradients, output_shapes, attrs=self._attrs)
+    captures = self._resolve_captured_inputs()
+    tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)]
+    args = tensor_inputs + captures
 
-  def _backprop_call(self, args):
-    """Calls the wrapped function and records the result on a tape.
+    if tape.should_record(tensor_inputs) or tape.should_record(captures):
+      return self._backprop_call(args)
 
-    (Only records results on a tape if the function has outputs)
+    outputs = self._inference_function.call(ctx, args)
+    return self._build_call_outputs(outputs)
 
-    Args:
-      args: All inputs to the function, including resolved extra inputs
-    Returns:
-      The call output.
-    """
-    ctx = context.context()
-    outputs = self._forward_fdef.call(ctx, args, self._output_shapes)
-    if isinstance(outputs, ops.Operation) or outputs is None:
-      return outputs
+  @property
+  def graph(self):
+    """Returns the graph from which this function was constructed."""
+    return self._func_graph
 
-    # `real_outputs` are the actual outputs of the inference graph function;
-    # `side_outputs` are the intermediate Tensors that were added as outputs to
-    # the forward graph function so that we can compute its gradient.
-    real_outputs = outputs[:self._num_outputs]
-    side_outputs = outputs[self._num_outputs:]
+  @property
+  def variables(self):
+    """Returns all variables touched by this function."""
+    return self._func_graph.variables
 
-    def backward_function(*args):
-      return self._backward_function(*(list(args) + side_outputs))  # pylint: disable=not-callable
+  @property
+  def inputs(self):
+    """Returns tensors in `self.graph` corresponding to arguments."""
+    return self._func_graph.inputs
 
-    tape.record_operation(
-        self._forward_fdef.signature.name,
-        real_outputs,
-        args,
-        backward_function)
+  @property
+  def outputs(self):
+    """Returns tensors in `self.graph` corresponding to return values."""
+    return self._func_graph.outputs
 
-    return self._build_call_outputs(real_outputs)
+  @property
+  def captured_inputs(self):
+    """Returns external Tensors captured by this function.
+
+    self.__call__(*args) passes `args + self.captured_inputs` to the function.
+    """
+    return self._captured_inputs
+
+  @property
+  def function_def(self):
+    """Returns a `FunctionDef` object representing this function."""
+    return self._inference_function.definition
 
   @property
   def output_shapes(self):
     """The function's output shapes."""
     # TODO(ebrevdo): Should we only keep the output shapes associated
     # with len(self._python_returns) outputs?
-    outputs_list = nest.flatten(self._python_func_outputs)
+    # TODO(akshayka): Consider removing this.
+    outputs_list = nest.flatten(self._func_graph.structured_outputs)
     j = 0
     for i, o in enumerate(outputs_list):
       if o is not None:
@@ -686,23 +573,80 @@ class GraphModeFunction(object):
         else:
           outputs_list[i] = self._output_shapes[j]
           j += 1
-    return nest.pack_sequence_as(self._python_func_outputs, outputs_list)
+    return nest.pack_sequence_as(self._func_graph.structured_outputs,
+                                 outputs_list)
 
   @property
   def output_dtypes(self):
-    return nest.map_structure(
-        lambda x: x.dtype if x is not None else None, self._python_func_outputs)
+    # TODO(akshayka): Consider removing this.
+    return nest.map_structure(lambda x: x.dtype if x is not None else None,
+                              self._func_graph.structured_outputs)
 
-  @property
-  def captured_inputs(self):
-    return self._extra_inputs
+  def _construct_backprop_function(self):
+    """Constructs the backprop function object for this function."""
+    backwards_graph = FuncGraph(_backward_name(self._func_graph.name))
+    with backwards_graph.as_default():
+      gradients_wrt_outputs = [
+          graph_placeholder(x.dtype, x.shape) for x in self._func_graph.outputs
+      ]
+      gradients_wrt_inputs = gradients_impl._GradientsHelper(  # pylint: disable=protected-access
+          self._func_graph.outputs,
+          self._func_graph.inputs,
+          grad_ys=gradients_wrt_outputs,
+          src_graph=self._func_graph)
+
+    self._forward_function = _EagerDefinedFunction(
+        _forward_name(
+            self._func_graph.name), self._func_graph, self._func_graph.inputs,
+        self._func_graph.outputs + list(backwards_graph.captures.keys()),
+        self._attrs)
 
-  @property
-  def name(self):
-    """Returns the name of the function in Eager-compatible format."""
-    return self._function_def.name.encode("utf-8")
+    # The ordering of `backwards_graph.inputs` is important: inputs of
+    # `self._backward_graph_function` correspond to outputs of
+    # `self._forward_function`.
+    backwards_graph.inputs = gradients_wrt_outputs + list(
+        backwards_graph.captures.values())
+    # Clear captures, since we pass them in as inputs.
+    backwards_graph.captures = {}
+    backwards_graph.outputs.extend(
+        grad for grad in _flatten(gradients_wrt_inputs) if grad is not None)
+    backwards_graph.structured_outputs = gradients_wrt_inputs
+    self._backward_graph_function = Function(
+        backwards_graph, attrs=self._attrs)
+
+  def _backprop_call(self, args):
+    """Calls the forward function and records the result on a tape.
+
+    (Only records results on a tape if the function has outputs)
+
+    Args:
+      args: All inputs to the function, including resolved captured inputs
+
+    Returns:
+      The call output.
+    """
+    if self._backward_graph_function is None:
+      self._construct_backprop_function()
+
+    ctx = context.context()
+    outputs = self._forward_function.call(ctx, args)
+    if isinstance(outputs, ops.Operation) or outputs is None:
+      return outputs
+
+    # `real_outputs` are the actual outputs of the inference graph function;
+    # `side_outputs` are the intermediate Tensors that were added as outputs to
+    # the forward graph function so that we can compute its gradient.
+    real_outputs = outputs[:self._num_outputs]
+    side_outputs = outputs[self._num_outputs:]
 
-  def _resolve_extra_inputs(self):
+    def backward_function(*args):
+      return self._backward_graph_function(*(list(args) + side_outputs))  # pylint: disable=not-callable
+
+    tape.record_operation(self._forward_function.signature.name, real_outputs,
+                          args, backward_function)
+    return self._build_call_outputs(real_outputs)
+
+  def _resolve_captured_inputs(self):
     """Resolve captured distributed variables to their current values.
 
     Some inputs can be distributed variables. Such variables yield a different
@@ -710,44 +654,23 @@ class GraphModeFunction(object):
     execution.
 
     Returns:
-      a list of resolved extra input tensors.
+      a list of resolved captured input tensors.
     """
     if self._distributed_variables:
-      # Loop over each extra_inputs and check if it corresponds to something
+      # Loop over each captured input and check if it corresponds to something
       # distributed. If so, get its _distributed_container and fetch the
       # component appropriate for the current execution context.
-      resolved_extra_inputs = self._extra_inputs[:]
-      for i, extra_input in enumerate(self._extra_inputs):
-        distributed_var = self._distributed_variables.get(extra_input, None)
+      resolved_captured_inputs = self._captured_inputs[:]
+      for i, captured_input in enumerate(self._captured_inputs):
+        distributed_var = self._distributed_variables.get(captured_input, None)
         if distributed_var is not None:
           # distributed variables override __getattr__ and substitute the
           # right component variable. In here, `distributed_var.handle`
           # actually does the equivalent of
           # distributed_var.get_current_component_var().handle.
-          resolved_extra_inputs[i] = distributed_var.handle
-      return resolved_extra_inputs
-
-    return self._extra_inputs
-
-  def __call__(self, *args):
-    """Executes the passed function in eager mode."""
-    for v in self._variables:
-      if v.trainable:
-        tape.watch_variable(v)
-
-    resolved_extra_inputs = self._resolve_extra_inputs()
-
-    tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)]
-    args = tensor_inputs + resolved_extra_inputs
-    if tape.should_record(tensor_inputs) or tape.should_record(
-        resolved_extra_inputs):
-      if self._backward_function is None:
-        self._construct_backprop_function()
-      return self._backprop_call(args)
-
-    ctx = context.context()
-    outputs = self._function_def.call(ctx, args, self._output_shapes)
-    return self._build_call_outputs(outputs)
+          resolved_captured_inputs[i] = distributed_var.handle
+      return resolved_captured_inputs
+    return self._captured_inputs
 
   def _build_call_outputs(self, result):
     """Maps the fdef output list to actual output structure.
@@ -757,12 +680,12 @@ class GraphModeFunction(object):
     Returns:
       The actual call output.
     """
-    if self._python_func_outputs is None:
+    if self._func_graph.structured_outputs is None:
       return result
 
     # Use `nest.flatten` instead of `_flatten` in order to preserve any
-    # IndexedSlices in `self._python_func_outputs`.
-    outputs_list = nest.flatten(self._python_func_outputs)
+    # IndexedSlices in `self._func_graph.structured_outputs`.
+    outputs_list = nest.flatten(self._func_graph.structured_outputs)
     j = 0
     for i, o in enumerate(outputs_list):
       if o is not None:
@@ -776,13 +699,13 @@ class GraphModeFunction(object):
             j += 3
           else:
             outputs_list[i] = ops.IndexedSlices(
-                values=result[j],
-                indices=result[j + 1])
+                values=result[j], indices=result[j + 1])
             j += 2
         else:
           outputs_list[i] = result[j]
           j += 1
-    ret = nest.pack_sequence_as(self._python_func_outputs, outputs_list)
+    ret = nest.pack_sequence_as(self._func_graph.structured_outputs,
+                                outputs_list)
     return ret
 
 
@@ -798,20 +721,18 @@ def _get_defun_inputs_from_signature(signature):
 def _get_defun_inputs_from_args(args):
   """Maps python function args to graph-construction inputs."""
   function_inputs = [
-      graph_placeholder(arg.dtype, arg.shape) if isinstance(arg, ops.Tensor)
-      else arg for arg in nest.flatten(args)
+      graph_placeholder(arg.dtype, arg.shape)
+      if isinstance(arg, ops.Tensor) else arg for arg in nest.flatten(args)
   ]
   return nest.pack_sequence_as(args, function_inputs)
 
 
-def _trace_and_define_function(name, python_func, compiled, args, kwds,
-                               signature=None):
-  """Defines and returns graph-mode version of `python_func`.
+def func_graph_from_py_func(name, python_func, args, kwds, signature=None):
+  """Returns a `FuncGraph` generated from `python_func`.
 
   Args:
     name: an identifier for the function.
     python_func: the Python function to trace.
-    compiled: whether the graph function should be compiled through XLA.
     args: the positional args with which the Python function should be called;
       ignored if a signature is provided.
     kwds: the keyword args with which the Python function should be called;
@@ -823,15 +744,16 @@ def _trace_and_define_function(name, python_func, compiled, args, kwds,
       inputs.
 
   Returns:
-    A GraphModeFunction.
+    A FuncGraph.
 
   Raises:
     TypeError: If any of `python_func`'s return values is neither `None` nor a
       `Tensor`.
   """
-  func_graph = FuncGraph(_inference_name(name), graph=ops.get_default_graph())
-
+  func_graph = FuncGraph(name)
   with func_graph.as_default(), AutomaticControlDependencies() as a:
+    variable_scope.get_variable_scope().set_use_resource(True)
+
     if signature is None:
       func_args = _get_defun_inputs_from_args(args)
       func_kwds = _get_defun_inputs_from_args(kwds)
@@ -842,8 +764,7 @@ def _trace_and_define_function(name, python_func, compiled, args, kwds,
     # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
     func_graph.inputs.extend(
         x for x in nest.flatten(func_args) + nest.flatten(func_kwds)
-        if isinstance(x, ops.Tensor)
-    )
+        if isinstance(x, ops.Tensor))
 
     # Variables to help check whether mutation happens in calling the function
     # Copy the recursive list, tuple and map structure, but not base objects
@@ -868,6 +789,7 @@ def _trace_and_define_function(name, python_func, compiled, args, kwds,
     this_tape = tape.push_new_tape()
     try:
       func_outputs = python_func(*func_args, **func_kwds)
+      # invariant: `func_outputs` contains only Tensors and `None`s.
       func_outputs = nest.map_structure(convert, func_outputs)
 
       def check_mutation(n1, n2):
@@ -887,53 +809,34 @@ def _trace_and_define_function(name, python_func, compiled, args, kwds,
 
       check_mutation(func_args_before, func_args)
       check_mutation(func_kwds_before, func_kwds)
-
     finally:
       tape.pop_tape(this_tape)
+
     func_graph.structured_outputs = func_outputs
+    # Returning a closed-over tensor does not trigger convert_to_tensor.
+    func_graph.outputs.extend(
+        func_graph.capture(x)
+        for x in _flatten(func_graph.structured_outputs)
+        if x is not None)
+
+    # Some captured variables might be components of DistributedValues.
+    # Instead of storing non-distributed component variables, we
+    # store their distributed containers so we can retrieve the correct
+    # component variables at call-time.
     variables = list(this_tape.watched_variables())
-
-    # Some variables captured by the tape can come from a DistributedValue.
-    # At call time, DistributedValue can return another variable (e.g. if
-    # the function is run on a different device). Thus, instead of storing
-    # the specific captured variable, we replace it with its distributed
-    # container.
-    strategy = distribute.get_distribution_strategy()
+    strategy = distribution_strategy_context.get_distribution_strategy()
     for i, variable in enumerate(variables):
       # If variable is not distributed value_container returns itself.
       variables[i] = strategy.value_container(variable)
-
     func_graph.variables = variables
 
-    # Returning a closed-over tensor as an output does not trigger a
-    # call to convert_to_tensor, so we manually capture all such tensors.
-    func_graph.outputs.extend(
-        func_graph.capture(x) for x in _flatten(func_graph.structured_outputs)
-        if x is not None
-    )
-
-    output_shapes = tuple(
-        x.shape if isinstance(x, ops.Tensor) else None
-        for x in func_graph.outputs)
-
-  all_ignored_ops = frozenset(x.op for x in func_graph.inputs)
-  operations = tuple(x for x in func_graph.get_operations()
-                     if x not in all_ignored_ops)
-  # Register any other functions defined in the graph
-  # TODO(ashankar): Oh lord, forgive me for this lint travesty.
+  # Register any other functions defined in the graph.
   if context.executing_eagerly():
     for f in func_graph._functions.values():  # pylint: disable=protected-access
       # TODO(ashankar): What about the gradient registry?
       _register(f._c_func.func)  # pylint: disable=protected-access
 
-  attrs = {}
-  if compiled:
-    attrs[_xla_compile_attr] = attr_value_pb2.AttrValue(b=True)
-
-  return GraphModeFunction(
-      func_graph.name, func_graph.inputs, func_graph.captures.keys(),
-      func_graph, operations, func_graph.outputs, func_graph.structured_outputs,
-      output_shapes, func_graph.variables, attrs)
+  return func_graph
 
 
 _TensorType = collections.namedtuple("_TensorType", ["dtype", "shape"])
@@ -982,13 +885,13 @@ def _deterministic_dict_values(dictionary):
   return tuple(dictionary[key] for key in sorted(dictionary))
 
 
-class _PolymorphicFunction(object):
+class PolymorphicFunction(object):
   """Wrapper class for the graph functions defined for a Python function.
 
   See the documentation for `defun` for more information on the semantics of
   defined functions.
 
-  _PolymorphicFunction class is thread-compatible meaning that minimal
+  PolymorphicFunction class is thread-compatible meaning that minimal
   usage of defuns (defining and calling) is thread-safe, but if users call other
   methods or invoke the base `python_function` themselves, external
   synchronization is necessary.
@@ -997,8 +900,7 @@ class _PolymorphicFunction(object):
   def __init__(self,
                python_function,
                name,
-               input_signature=None,
-               compiled=False):
+               input_signature=None):
     """Initializes a polymorphic function.
 
     Args:
@@ -1007,14 +909,10 @@ class _PolymorphicFunction(object):
       input_signature: a possibly nested sequence of `TensorSpec` objects
         specifying the input signature of this function. If `None`, a separate
         function is instantiated for each inferred input signature.
-      compiled: if True, the framework will attempt to compile func with XLA.
 
     Raises:
       ValueError: if `input_signature` is not None and the `python_function`'s
         argspec has keyword arguments.
-      TypeError: if `input_signature` contains anything other than
-        `TensorSpec` objects, or (if not None) is anything other than a tuple or
-        list.
     """
 
     if isinstance(python_function, functools.partial):
@@ -1026,8 +924,7 @@ class _PolymorphicFunction(object):
       self._args_to_prepend = tuple()
       self._kwds_to_include = {}
     self._name = name
-    self._compiled = compiled
-    self._arguments_to_functions = {}
+    self._function_cache = collections.OrderedDict()
     self._variables = []
 
     self._lock = threading.Lock()
@@ -1062,15 +959,40 @@ class _PolymorphicFunction(object):
 
       self._input_signature = tuple(input_signature)
       self._flat_input_signature = tuple(nest.flatten(input_signature))
-      if any(not isinstance(arg, tensor_spec.TensorSpec)
-             for arg in self._flat_input_signature):
-        raise TypeError("Invalid input_signature %s; input_signature must be "
-                        "a possibly nested sequence of TensorSpec objects.")
+
+  def __call__(self, *args, **kwds):
+    """Calls a graph function specialized to the inputs."""
+    graph_function, inputs = self._maybe_define_function(*args, **kwds)
+    return graph_function(*inputs)
+
+  @property
+  def python_function(self):
+    """Returns the wrapped Python function."""
+    return self._python_function
+
+  # TODO(akshayka): Remove this property.
+  @property
+  def variables(self):
+    """Returns the union of all variables referenced by cached `Function`s`."""
+    return self._variables
+
+  def get_concrete_function(self, *args, **kwargs):
+    """Returns a `Function` object specialized to inputs and execution context.
+
+    `args` and `kwargs` are ignored if this `PolymorphicFunction` was created
+    with an `input_signature`.
+
+    Args:
+      *args: inputs to specialize on.
+      **kwargs: inputs to specialize on.
+    """
+    graph_function, _ = self._maybe_define_function(*args, **kwargs)
+    return graph_function
 
   def __get__(self, instance, owner):
     """Makes it possible to defun instance methods."""
     del owner
-    # `instance` here is the instance that this `_PolymorphicFunction` was
+    # `instance` here is the instance that this `PolymorphicFunction` was
     # accessed through; e.g., for
     #
     #   class Foo(object):
@@ -1080,29 +1002,36 @@ class _PolymorphicFunction(object):
     #       ...
     #
     #   foo = Foo()
-    #   foo.bar()  # `foo.bar` is a `_PolymorphicFunction` instance
+    #   foo.bar()  # `foo.bar` is a `PolymorphicFunction` instance
     #
     # then `instance` will be `foo` (and `owner` will be `Foo`).
     return functools.partial(self.__call__, instance)
 
-  def _cache_key(self, args, kwds):
-    """Computes the cache key given inputs."""
+  def _cache_key(self, args, kwds, ctx, graph):
+    """Computes the cache key given inputs and execution context."""
     if self._input_signature is None:
       inputs = (args, kwds) if kwds else args
       cache_key = tuple(_encode_arg(arg) for arg in inputs)
     else:
       del args, kwds
       cache_key = self._flat_input_signature
+
     # The graph, or whether we're executing eagerly, should be a part of the
     # cache key so we don't improperly capture tensors such as variables.
-    return cache_key + (context.executing_eagerly() or ops.get_default_graph(),)
+    execution_context = ctx.executing_eagerly() or graph
+
+    # Putting the device in the cache key ensures that call-site device
+    # annotations are respected.
+    device_functions = _get_device_functions(ctx, graph)
+
+    return cache_key + (execution_context, device_functions)
 
   def _canonicalize_function_inputs(self, *args, **kwds):
     """Canonicalizes `args` and `kwds`.
 
     Canonicalize the inputs to the Python function using its fullargspec. In
     particular, we parse the varags and kwargs that this
-    `_PolymorphicFunction` was called with into a tuple corresponding to the
+    `PolymorphicFunction` was called with into a tuple corresponding to the
     Python function's positional (named) arguments and a dictionary
     corresponding to its kwargs.
 
@@ -1156,8 +1085,9 @@ class _PolymorphicFunction(object):
       if any(not isinstance(arg, ops.Tensor) for arg in flat_inputs):
         raise ValueError("When input_signature is provided, all inputs to "
                          "the Python function must be Tensors.")
-      tensor_specs = [tensor_spec.TensorSpec.from_tensor(tensor)
-                      for tensor in flat_inputs]
+      tensor_specs = [
+          tensor_spec.TensorSpec.from_tensor(tensor) for tensor in flat_inputs
+      ]
       if any(not spec.is_compatible_with(other)
              for spec, other in zip(self._flat_input_signature, tensor_specs)):
         raise ValueError("Python inputs incompatible with input_signature: "
@@ -1182,42 +1112,33 @@ class _PolymorphicFunction(object):
     """
 
     args, kwds = self._canonicalize_function_inputs(*args, **kwds)
-    cache_key = self._cache_key(args, kwds)
+    cache_key = self._cache_key(args, kwds, context.context(),
+                                ops.get_default_graph())
     with self._lock:
       try:
-        graph_function = self._arguments_to_functions.get(cache_key, None)
+        graph_function = self._function_cache.get(cache_key, None)
       except TypeError:
         raise TypeError("Arguments supplied to `defun`-generated functions "
                         "must be hashable.")
 
       if graph_function is None:
-        graph_function = _trace_and_define_function(
-            self._name, self._python_function, self._compiled, args, kwds,
-            self._input_signature)
+        graph_function = Function(
+            func_graph_from_py_func(self._name, self._python_function, args,
+                                    kwds, self._input_signature))
         self._variables.extend(
             [v for v in graph_function.variables if v not in self._variables])
-        self._arguments_to_functions[cache_key] = graph_function
+        self._function_cache[cache_key] = graph_function
       return graph_function, (args, kwds)
 
-  def __call__(self, *args, **kwds):
-    """Calls a graph function specialized for this input signature."""
-    graph_function, inputs = self._maybe_define_function(*args, **kwds)
-    return graph_function(*inputs)
-
-  def call_python_function(self, *args, **kwargs):
-    """Directly calls the wrapped python function."""
-    return self._python_function(*args, **kwargs)
 
-  @property
-  def variables(self):
-    """Returns a list of variables used in any of the defined functions."""
-    return self._variables
+def _validate_signature(signature):
+  if any(not isinstance(arg, tensor_spec.TensorSpec)
+         for arg in nest.flatten(signature)):
+    raise TypeError("Invalid input_signature %s; input_signature must be "
+                    "a possibly nested sequence of TensorSpec objects.")
 
 
-# TODO(akshayka): Remove the `compiled` flag and create a separate
-# API for xla compilation (`defun` is already complicated enough
-# as it is, and the keyword argument makes 'compiled' an overloaded concept)
-def defun(func=None, input_signature=None, compiled=False):
+def defun(func=None, input_signature=None):
   """Compiles a Python function into a callable TensorFlow graph.
 
   `defun` (short for "define function") trace-compiles a Python function
@@ -1322,18 +1243,60 @@ def defun(func=None, input_signature=None, compiled=False):
   generates and placed in the eager context if executing eagerly or into an
   outer graph otherwise.
 
-  _Tracing and Input Signatures_.
-  The signature of inputs supplied to `F` is defined to be a tuple of the shapes
-  and dtypes of Tensor-typed arguments and the values of non-Tensor arguments,
-  where "arguments" includes both args and kwargs. Every time `F` is invoked,
-  the signature of its inputs are inferred. The first time `F(*args, **kwargs)`
-  is invoked with a particular signature, `f(*args, **kwargs)` is executed and
-  all the TensorFlow operations that `f` executes, along with the Tensors that
-  flow between them, are recorded in a TensorFlow graph. `F` caches this graph
-  and binds it to the inputs' signature; every subsequent invocation of `F` with
-  inputs conforming to this signature will immediately retrieve the cached graph
-  and pass it to the TensorFlow runtime for execution.
+  _Input Signatures_
+  By default, `F = tf.contrib.eager.defun(f)` instantiates a separate graph
+  for every unique sequence of the shapes and dtypes of Tensor arguments and
+  the values of Python objects it is invoked with. For example, calling
+  `F(tf.random_uniform([2])` will execute a different graph than
+  `F(tf.random_uniform([3])` because the two inputs have different shapes.
+  The first time that `F(*args, **kwargs)` is called with a particular sequence
+  of Tensor shapes and dtypes and Python values, it constructs a graph by
+  tracing the execution of `f(*args, **kwargs)`; this graph is bound to an
+  input signature inferred from `(*args, **kwargs)` and cached for future reuse.
+
+  `tf.contrib.eager.defun` caches graphs for your convenience, letting you
+  define TensorFlow functions without explicitly specifying their signatures.
+  However, this policy is conservative and potentially expensive; for example,
+  when different invocations of your function have differently-shaped Tensor
+  inputs, this policy might generate more graph functions than necessary. To
+  eliminate such costs, `tf.contrib.eager.defun` allows you to supply an
+  optional `input_signature` argument specifying the shapes and dtypes of the
+  inputs. In particular, the shapes may be partially unspecified, with `None`s
+  in the unknown dimensions.  When an input signature is provided,
+  `tf.contrib.eager.defun` will only instantiate a single graph for the
+  decorated Python function. The following is an example:
 
+  ```python
+  import tensorflow as tf
+
+  # The first `TensorSpec` below describes the shape and dtype of `words`,
+  # and the second describes the shape and dtype of `another_tensor`. Note that
+  # the last dimension of the `words` `TensorSpec` is left unspecified.
+  @tf.contrib.eager.defun(input_signature=[
+    tf.contrib.eager.TensorSpec(shape=[50, 300, None], dtype=tf.float32),
+    tf.contrib.eager.TensorSpec(shape=[300, 100], dtype=tf.float32)
+  ])
+  def my_sequence_model(words, another_tensor):
+    ...
+
+  # Note how the third dimension of the first input can vary freely.
+  words = tf.random_uniform(([50, 300, 10])
+  second_input = tf.random_uniform([300, 100])
+  my_sequence_model(words, second_input)
+
+  words = tf.random_uniform(([50, 300, 20])
+  my_sequence_model(words, second_input)
+
+  # Passing an input with an incompatible shape will raise an error.
+  words = tf.random_uniform(([50, 100, 20])
+  my_sequence_model(words, second_input)  # <---- This will raise an error.
+
+  ```
+
+  Python functions that are compiled with an `input_signature` must only accept
+  Tensors as arguments and must not take unnamed keyword arguments (**kwargs).
+
+  _Tracing_
   Be aware that because `F` only logs TensorFlow operations, all the other
   Python code that `f` executes will only shape the _construction_ of the graphs
   that `F` executes: the Python code won't be executed when the graphs
@@ -1466,9 +1429,10 @@ def defun(func=None, input_signature=None, compiled=False):
     func: function to be compiled. If `func` is None, returns a
       decorator that can be invoked with a single argument - `func`. The
       end result is equivalent to providing all the arguments up front.
-      In other words, defun(compiled=True)(func) is equivalent to
-      defun(func, compiled=True). The former allows the following use case:
-        @tf.contrib.eager.defun(compiled=True)
+      In other words, defun(input_signature=...)(func) is equivalent to
+      defun(func, input_signature=...). The former allows
+      the following use case:
+        @tf.contrib.eager.defun(input_signature=...)
         def foo(...):
           ...
 
@@ -1479,17 +1443,20 @@ def defun(func=None, input_signature=None, compiled=False):
       signature is specified, every input to `func` must be a `Tensor`, and
       `func` cannot accept `**kwargs`.
 
-    compiled: If True, an attempt to compile `func` with XLA will be made.
-      If it fails, function will be run normally. Experimental.  Currently
-      supported only for execution on TPUs. For the vast majority of users,
-      this argument should be False.
-
   Returns:
      If `func` is not None, returns a callable that will execute the compiled
      function (and return zero or more `tf.Tensor` objects).
      If `func` is None, returns a decorator that, when invoked with a single
      `func` argument, returns a callable equivalent to the case above.
+
+  Raises:
+    TypeError: If `input_signature` is neither `None` nor a sequence of
+      `tf.contrib.eager.TensorSpec` objects.
   """
+
+  if input_signature is not None:
+    _validate_signature(input_signature)
+
   # TODO(apassos): deal with captured global state. Deal with control flow.
   def decorated(function):
     try:
@@ -1498,8 +1465,7 @@ def defun(func=None, input_signature=None, compiled=False):
       name = "function"
     return tf_decorator.make_decorator(
         function,
-        _PolymorphicFunction(
-            function, name, input_signature=input_signature, compiled=compiled))
+        PolymorphicFunction(function, name, input_signature=input_signature))
 
   # This code path is for the `foo = tfe.defun(foo, ...)` use case
   if func is not None:
@@ -1515,51 +1481,6 @@ def defun(func=None, input_signature=None, compiled=False):
   return decorated
 
 
-def make_defun_op(func, *args, **kwds):
-  """Compile func into graph_mode, assuming func arguments are *args, **kwargs.
-
-  `make_defun_op` converts a function that constructs a TensorFlow graph into
-  a function object and attaches it to the graph.  The resulting function
-  object can be queried for its properties, and called directly with different
-  inputs to execute.
-
-  More details on use cases and limitations are available in the
-  documentation for `defun`.
-
-  Example:
-  ```python
-  def f(x, y):
-    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
-
-  def g(x, y):
-    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
-
-  z = tf.constant([[0.0, 0.0]])
-  g_op = make_defun_op(g, z, z)
-
-  assert g_op.output_shapes == tf.TensorShape([])
-  assert g_op.output_types == tf.float32
-
-  x = tf.constant([[2.0, 3.0]])
-  y = tf.constant([[3.0, -2.0]])
-
-  # The plain function and defun-compiled function should return the same value.
-  assert f(x, y).numpy() == g_op(x, y).numpy()
-  ```
-
-  Args:
-    func: function to be compiled.
-    *args: List arguments to pass to `func` when attaching to the graph.
-    **kwds: Keyword arguments to pass to `func` when attaching to the graph.
-
-  Returns:
-     A wrapper object which can be queried for its output properties,
-     and which can be called directly the way a `@defun` wrapped function
-     can.
-  """
-  return _trace_and_define_function(func.__name__, func, False, args, kwds)
-
-
 class AutomaticControlDependencies(object):
   """Context manager to automatically add control dependencies.
 
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 0488dc9752..8381d2f55c 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -130,16 +130,16 @@ class FunctionTest(test.TestCase):
     with ops.Graph().as_default():
       self.assertEqual(f().shape, ())
 
-  def testBasicDefunOpGraphMode(self):
+  def testBasicGraphFunction(self):
     matmul = function.defun(math_ops.matmul)
 
+    @function.defun
     def sq(a):
       return matmul(a, a)
 
     t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
 
-    sq_op = function.make_defun_op(sq, t)
-
+    sq_op = sq.get_concrete_function(t)
     self.assertEqual(sq_op.output_shapes, tensor_shape.TensorShape([2, 2]))
     out = sq_op(t)
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
@@ -211,33 +211,44 @@ class FunctionTest(test.TestCase):
     random_seed.set_random_seed(1)
     self.assertAllEqual(f(), x)
 
-  def testNestedInputsDefunOpGraphMode(self):
+  def testSymGradGatherNd(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+
+      @function.defun
+      def f(x):
+        return array_ops.gather_nd(x, [[0]])
+
+      c = constant_op.constant([[2.]])
+      f_c = f(c)
+      g, = gradients_impl.gradients(f_c, c)
+      self.assertAllEqual(sess.run(g), [[1.0]])
+
+  def testNestedInputsGraphFunction(self):
     matmul = function.defun(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
+    @function.defun
     def a_times_b(inputs):
       return matmul(inputs.a['a'], inputs.b['b'])
 
     t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-
     inputs = pair({'a': t}, {'b': t})
-    sq_op = function.make_defun_op(a_times_b, inputs)
-
+    sq_op = a_times_b.get_concrete_function(inputs)
     self.assertEqual(sq_op.output_shapes, tensor_shape.TensorShape([2, 2]))
     out = sq_op(inputs)
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
-  def testNestedOutputDefunOpGraphMode(self):
+  def testNestedOutputGraphFunction(self):
     matmul = function.defun(math_ops.matmul)
 
+    @function.defun
     def sq(a):
       return (matmul(a, a), {'b': constant_op.constant(1.0)})
 
     t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
 
-    sq_op = function.make_defun_op(sq, t)
-
+    sq_op = sq.get_concrete_function(t)
     self.assertEqual(sq_op.output_shapes,
                      (tensor_shape.TensorShape([2, 2]),
                       {'b': tensor_shape.TensorShape([])}))
@@ -247,28 +258,28 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(a, math_ops.matmul(t, t).numpy())
     self.assertAllEqual(b['b'].numpy(), 1.0)
 
-  def testDefunOpGraphModeWithGradients(self):
+  def testGraphFunctionWithGradients(self):
     v = resource_variable_ops.ResourceVariable(1.0, name='v')
 
+    @function.defun
     def step():
       def inner():
         return v * v
 
       return backprop.implicit_grad(inner)()[0][0]
 
-    step_op = function.make_defun_op(step)
-
+    step_op = step.get_concrete_function()
     self.assertEqual(step_op.output_dtypes, dtypes.float32)
     self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([]))
     self.assertAllEqual(step_op(), 2.0)
 
-  def testDefunOpGraphModeNoneOutput(self):
+  def testGraphFunctionNoneOutput(self):
+    @function.defun
     def fn(unused_a, unused_b):
       return None
 
     x = constant_op.constant(1)
-    fn_op = function.make_defun_op(fn, x, x)
-
+    fn_op = fn.get_concrete_function(x, x)
     self.assertEqual(fn_op.output_dtypes, None)
     self.assertEqual(fn_op.output_shapes, None)
     self.assertAllEqual(fn_op(x, x), None)
@@ -309,13 +320,13 @@ class FunctionTest(test.TestCase):
     x = random_ops.random_uniform([2, 2]).numpy()
     defined = function.defun(f)
     defined(x)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
 
     x = random_ops.random_uniform([2, 2]).numpy()
     defined(x)
     # A NumPy array with different values but the same shape and dtype
     # shouldn't trigger another function definition.
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
 
   def testDefunCapturedInt32(self):
     x = constant_op.constant(1, dtype=dtypes.int32)
@@ -346,6 +357,47 @@ class FunctionTest(test.TestCase):
 
     self.assertEqual(3.0, float(test_assign_add()))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testTensorInitializationInFunctionRaisesError(self):
+    error_msg = ('Tensor-typed variable initializers must either be '
+                 'wrapped in an init_scope or callable.*')
+
+    @function.defun
+    def tensor_init():
+      with self.assertRaisesRegexp(ValueError, error_msg):
+        resource_variable_ops.ResourceVariable(constant_op.constant(2.0))
+
+    tensor_init()
+
+  @test_util.run_in_graph_and_eager_modes
+  def testCallableTensorInitializationInFunction(self):
+
+    @function.defun
+    def tensor_init():
+      v = resource_variable_ops.ResourceVariable(
+          lambda: constant_op.constant(2.0))
+      return v.read_value()
+
+    value = tensor_init()
+    if not context.executing_eagerly():
+      self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(self.evaluate(value), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitScopeTensorInitializationInFunction(self):
+
+    @function.defun
+    def tensor_init():
+      with ops.init_scope():
+        const = constant_op.constant(2.0)
+      v = resource_variable_ops.ResourceVariable(const)
+      return v.read_value()
+
+    value = tensor_init()
+    if not context.executing_eagerly():
+      self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(self.evaluate(value), 2.0)
+
   def testDefunShapeInferenceWithCapturedResourceVariable(self):
     v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
 
@@ -397,6 +449,18 @@ class FunctionTest(test.TestCase):
       compiled = function.defun(f)
       compiled()
 
+  @test_util.run_in_graph_and_eager_modes
+  def testDefunForcesResourceVariables(self):
+
+    def variable_creator():
+      return variables.Variable(0.0).read_value()
+
+    defined = function.defun(variable_creator)
+    defined()  # Create the variable.
+    self.assertEqual(len(defined.variables), 1)
+    self.assertIsInstance(
+        defined.variables[0], resource_variable_ops.ResourceVariable)
+
   def testDefunDifferentiable(self):
     v = resource_variable_ops.ResourceVariable(1.0)
 
@@ -434,6 +498,22 @@ class FunctionTest(test.TestCase):
       op = call()
       self.assertAllEqual(sess.run(op), 2.0)
 
+  def testSymbolicGradientVariableZerosLike(self):
+    with ops.Graph().as_default():
+      v = resource_variable_ops.ResourceVariable(1.0)
+
+      @function.defun
+      def f(x, v):
+        v.read_value()
+        return x * x
+
+      x = constant_op.constant(1.0)
+      l = f(x, v)
+      _, dv = gradients_impl.gradients(l, [x, v])
+      with self.test_session():
+        v.initializer.run()
+        self.assertAllEqual(dv.eval(), 0.0)
+
   def testGraphModeManyFunctions(self):
     with context.graph_mode(), self.test_session():
 
@@ -605,17 +685,19 @@ class FunctionTest(test.TestCase):
   def testReturningIndexedSlicesWithDefun(self):
 
     def validate(indexed_slice):
+      @function.defun
       def f():
         return indexed_slice
 
-      output = function.defun(f)()
+      output = f()
       self.assertTrue(isinstance(output, ops.IndexedSlices))
       self.assertAllEqual(indexed_slice.values, output.values)
       self.assertAllEqual(indexed_slice.indices, output.indices)
       self.assertAllEqual(indexed_slice.dense_shape, output.dense_shape)
 
       self.assertEqual(
-          function.make_defun_op(f).output_shapes, indexed_slice.values.shape)
+          f.get_concrete_function().output_shapes,
+          indexed_slice.values.shape)
 
     arg = ops.IndexedSlices(
         values=constant_op.constant([1, 2]),
@@ -938,39 +1020,86 @@ class FunctionTest(test.TestCase):
       config=config_pb2.ConfigProto(device_count={'CPU': 4}))
   def testDeviceAnnotationsRespected(self):
 
-    @function.defun
     def multi_device_fn():
       with ops.device('/cpu:0'):
-        s1 = iterator_ops.Iterator.from_structure(
+        s0 = iterator_ops.Iterator.from_structure(
             (dtypes.float32,)).string_handle()
       with ops.device('/cpu:1'):
-        s2 = iterator_ops.Iterator.from_structure(
+        s1 = iterator_ops.Iterator.from_structure(
             (dtypes.float32,)).string_handle()
       with ops.device('/cpu:2'):
-        s3 = iterator_ops.Iterator.from_structure(
-            (dtypes.float32,)).string_handle()
-      with ops.device(''):
-        # TODO(akshayka): This is unfortunate and brittle. It prevents
-        # `Iterator.from_structure` from assigning the iterator op to 'cpu:0'.
-        #  Remove this hack once we have a way of obtaining metadata about
-        #  function execution.
-        s4 = iterator_ops.Iterator.from_structure(
+        s2 = iterator_ops.Iterator.from_structure(
             (dtypes.float32,)).string_handle()
-      return s1, s2, s3, s4
+      s3 = iterator_ops.Iterator.from_structure(
+          (dtypes.float32,)).string_handle()
+      return s0, s1, s2, s3
 
-    with ops.device('/cpu:3'):
-      outputs = self.evaluate(multi_device_fn())
+    defined = function.defun(multi_device_fn)
+    outputs = self.evaluate(defined())
+    self.assertEqual(len(defined._function_cache), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
-    self.assertIn(compat.as_bytes('CPU:3'), outputs[3])
 
-    with ops.device('/cpu:0'):
-      outputs = self.evaluate(multi_device_fn())
+    with ops.device('/cpu:3'):
+      outputs = self.evaluate(defined())
+    self.assertEqual(len(defined._function_cache), 2)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
-    self.assertIn(compat.as_bytes('CPU:0'), outputs[3])
+    self.assertIn(compat.as_bytes('CPU:3'), outputs[3])
+
+    # This should retrieve the call-site-device agnostic function
+    defined()
+    self.assertEqual(len(defined._function_cache), 2)
+
+    # And this should retrieve the function created for '/cpu:3'
+    with ops.device('/cpu:3'):
+      defined()
+    self.assertEqual(len(defined._function_cache), 2)
+
+  @test_util.run_in_graph_and_eager_modes(
+      config=config_pb2.ConfigProto(device_count={'CPU': 2}))
+  def testCallingGraphFunctionOnIncompatibleDeviceRaisesError(self):
+
+    def func():
+      return constant_op.constant(0)
+
+    defined = function.defun(func)
+    with ops.device('cpu:0'):
+      cpu_graph_function = defined.get_concrete_function()
+
+    with ops.device('cpu:0'):
+      self.assertEqual(
+          self.evaluate(cpu_graph_function()), self.evaluate(func()))
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'The current device stack does not match the device stack under '
+        'which the TensorFlow function \'.*func.*\' was created.\n'
+        'Current device stack: .*\n.*func.* device stack.*'):
+      with ops.device('cpu:1'):
+        cpu_graph_function()
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'The current device stack does not match the device stack under '
+        'which the TensorFlow function \'.*func.*\' was created.\n'
+        'Current device stack: .*\n.*func.* device stack.*'):
+      with ops.device(None):
+        cpu_graph_function()
+
+    default_graph_function = defined.get_concrete_function()
+    self.assertEqual(
+        self.evaluate(default_graph_function()), self.evaluate(func()))
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'The current device stack does not match the device stack under '
+        'which the TensorFlow function \'.*func.*\' was created.\n'
+        'Current device stack: .*\n.*func.* device stack.*'):
+      with ops.device('cpu:1'):
+        default_graph_function()
 
   def testVariablesAreTracked(self):
     v = resource_variable_ops.ResourceVariable(1.0)
@@ -999,26 +1128,31 @@ class FunctionTest(test.TestCase):
 
     defined = function.defun(func)
     defined(0, baz=20)
+
+    def cache_keys():
+      """Sanitizes cache keys of non-input metadata."""
+      return tuple(key[:3] for key in defined._function_cache)
+
     # `True` corresponds to the fact that we're executing eagerly
-    self.assertIn((0, 1, 20, True), defined._arguments_to_functions)
+    self.assertIn((0, 1, 20), cache_keys())
 
     defined(1)  # bar=1, baz=2
-    self.assertIn((1, 1, 2, True), defined._arguments_to_functions)
+    self.assertIn((1, 1, 2), cache_keys())
 
     # This matches the previous call.
     defined(foo=1)
-    self.assertEqual(len(defined._arguments_to_functions), 2)
+    self.assertEqual(len(defined._function_cache), 2)
 
     defined(1, 2, 3)
-    self.assertIn((1, 2, 3, True), defined._arguments_to_functions)
+    self.assertIn((1, 2, 3), cache_keys())
 
     # This matches the previous call.
     defined(1, bar=2, baz=3)
-    self.assertEqual(len(defined._arguments_to_functions), 3)
+    self.assertEqual(len(defined._function_cache), 3)
 
     # This matches the previous call.
     defined(1, baz=3, bar=2)
-    self.assertEqual(len(defined._arguments_to_functions), 3)
+    self.assertEqual(len(defined._function_cache), 3)
 
   def testFunctoolsPartialUnwrappedCorrectly(self):
 
@@ -1044,7 +1178,7 @@ class FunctionTest(test.TestCase):
     defined = function.defun(foo, input_signature=signature)
     a = array_ops.ones([2])
     out = defined(a)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
     self.assertAllEqual(out, a)
 
     def bar(a):
@@ -1055,13 +1189,13 @@ class FunctionTest(test.TestCase):
     defined = function.defun(bar, input_signature=signature)
     a = array_ops.ones([2, 1])
     out = defined(a)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
     self.assertAllEqual(out, a)
 
     # Changing the second dimension shouldn't create a new function.
     b = array_ops.ones([2, 3])
     out = defined(b)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
     self.assertAllEqual(out, b)
 
   def testNestedInputSignatures(self):
@@ -1078,7 +1212,7 @@ class FunctionTest(test.TestCase):
     a = array_ops.ones([2, 1])
     b = array_ops.ones([1])
     out = defined([a, a], b)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
     nest.assert_same_structure(out, [[a, a], b])
     self.assertAllEqual(out[0][0], a)
     self.assertAllEqual(out[0][1], a)
@@ -1089,7 +1223,7 @@ class FunctionTest(test.TestCase):
     b = array_ops.ones([2, 5])
     c = array_ops.ones([1])
     out = defined([a, b], c)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
     nest.assert_same_structure(out, [[a, b], c])
     self.assertAllEqual(out[0][0], a)
     self.assertAllEqual(out[0][1], b)
@@ -1125,13 +1259,13 @@ class FunctionTest(test.TestCase):
     # Signatures must consist exclusively of `TensorSpec` objects.
     signature = [(2, 3), tensor_spec.TensorSpec([2, 3], dtypes.float32)]
     with self.assertRaisesRegexp(TypeError, 'Invalid input_signature.*'):
-      function.defun(foo, input_signature=signature)(1, 2)
+      function.defun(foo, input_signature=signature)
 
     # Signatures must be either lists or tuples on their outermost levels.
     signature = {'t1': tensor_spec.TensorSpec([], dtypes.float32)}
     with self.assertRaisesRegexp(TypeError, 'input_signature must be either a '
                                  'tuple or a list.*'):
-      function.defun(foo, input_signature=signature)(1, 2)
+      function.defun(foo, input_signature=signature)
 
   def testInputsIncompatibleWithSignatureRaisesError(self):
 
@@ -1185,22 +1319,22 @@ class FunctionTest(test.TestCase):
     integer = constant_op.constant(2, dtypes.int64)
 
     out1, out2 = foo(flt, integer)
-    self.assertEqual(len(foo._arguments_to_functions), 1)
+    self.assertEqual(len(foo._function_cache), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(flt=flt, integer=integer)
-    self.assertEqual(len(foo._arguments_to_functions), 1)
+    self.assertEqual(len(foo._function_cache), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(integer=integer, flt=flt)
-    self.assertEqual(len(foo._arguments_to_functions), 1)
+    self.assertEqual(len(foo._function_cache), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(flt, integer=integer)
-    self.assertEqual(len(foo._arguments_to_functions), 1)
+    self.assertEqual(len(foo._function_cache), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
@@ -1230,27 +1364,27 @@ class FunctionTest(test.TestCase):
     a = constant_op.constant(2.0)
     b = constant_op.constant([1.0, 2.0])
     one = defined(a, b)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
 
     two = defined(a=a, b=b)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
 
     three = defined(b=b, a=a)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
 
     four = defined(a, b=b)
-    self.assertEqual(len(defined._arguments_to_functions), 1)
+    self.assertEqual(len(defined._function_cache), 1)
 
     # The next call corresponds to a new input signature, hence
     # we expect another function to be defined.
     five = defined(b, a)
-    self.assertEqual(len(defined._arguments_to_functions), 2)
+    self.assertEqual(len(defined._function_cache), 2)
 
     six = defined(a=b, b=a)
-    self.assertEqual(len(defined._arguments_to_functions), 2)
+    self.assertEqual(len(defined._function_cache), 2)
 
     seven = defined(b=a, a=b)
-    self.assertEqual(len(defined._arguments_to_functions), 2)
+    self.assertEqual(len(defined._function_cache), 2)
 
     self.assertAllEqual(one, [1.0, 2.0])
     self.assertAllEqual(two, [1.0, 2.0])
@@ -1335,7 +1469,7 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(state, [0])
 
     # Whereas calling the python function directly should create a side-effect.
-    side_effecting_function.call_python_function()
+    side_effecting_function.python_function()
     self.assertAllEqual(state, [0, 0])
 
 
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
deleted file mode 100644
index 7105d2e399..0000000000
--- a/tensorflow/python/eager/graph_callable.py
+++ /dev/null
@@ -1,435 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Decorator that produces a callable object that executes a TensorFlow graph.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import contextlib
-
-from tensorflow.python.eager import context
-from tensorflow.python.eager import function
-from tensorflow.python.eager import tape
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops as tf_ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.util import nest
-from tensorflow.python.util import tf_decorator
-from tensorflow.python.util import tf_inspect
-
-
-def _default_initializer(name, shape, dtype):
-  """The default initializer for variables."""
-  # pylint: disable=protected-access
-  store = variable_scope._get_default_variable_store()
-  initializer = store._get_default_initializer(name, shape=shape, dtype=dtype)
-  # pylint: enable=protected-access
-  return initializer[0]
-
-
-class _CapturedVariable(object):
-  """Variable captured by graph_callable.
-
-  Internal to the implementation of graph_callable. Created only by
-  _VariableCapturingScope and used only to read the variable values when calling
-  the function after the variables are initialized.
-  """
-
-  def __init__(self, name, initializer, shape, dtype, trainable):
-    self.name = name
-    if initializer is None:
-      initializer = _default_initializer(name, shape, dtype)
-    initial_value = lambda: initializer(shape, dtype=dtype)
-
-    with context.eager_mode():
-      self.variable = resource_variable_ops.ResourceVariable(
-          initial_value=initial_value, name=name, dtype=dtype,
-          trainable=trainable)
-    self.shape = shape
-    self.dtype = dtype
-    self.placeholder = None
-    self.trainable = trainable
-
-  def read(self, want_gradients=True):
-    if want_gradients and self.trainable:
-      v = tape.watch_variable(self.variable)
-    else:
-      v = self.variable
-    return v.read_value()
-
-
-class _VariableCapturingScope(object):
-  """Variable-scope-like object which captures tf.get_variable calls.
-
-  This is responsible for the main difference between the initialization version
-  of a function object and the calling version of a function object.
-
-  capturing_scope replaces calls to tf.get_variable with placeholder tensors to
-  be fed the variable's current value. TODO(apassos): these placeholders should
-  instead be objects implementing a similar API to tf.Variable, for full
-  compatibility.
-
-  initializing_scope replaces calls to tf.get_variable with creation of
-  variables and initialization of their values. This allows eventual support of
-  initialized_value and friends.
-
-  TODO(apassos): once the eager mode layers API is implemented support eager
-  func-to-object as well.
-  """
-
-  def __init__(self):
-    self.variables = {}
-    self.tf_variables = {}
-
-  @contextlib.contextmanager
-  def capturing_scope(self):
-    """Context manager to capture variable creations.
-
-    Replaces variable accesses with placeholders.
-
-    Yields:
-      nothing
-    """
-    # TODO(apassos) ignoring the regularizer and partitioner here; figure out
-    # how to deal with these.
-    def _custom_getter(  # pylint: disable=missing-docstring
-        getter=None,
-        name=None,
-        shape=None,
-        dtype=dtypes.float32,
-        initializer=None,
-        regularizer=None,
-        reuse=None,
-        trainable=None,
-        collections=None,
-        caching_device=None,  # pylint: disable=redefined-outer-name
-        partitioner=None,
-        validate_shape=True,
-        use_resource=None,
-        aggregation=variable_scope.VariableAggregation.NONE,
-        synchronization=variable_scope.VariableSynchronization.AUTO):
-      del getter, regularizer, partitioner, validate_shape, use_resource, dtype
-      del collections, initializer, trainable, reuse, caching_device, shape
-      del aggregation, synchronization
-      assert name in self.variables
-      v = self.variables[name]
-      return v.variable
-
-    scope = variable_scope.get_variable_scope()
-    with variable_scope.variable_scope(scope, custom_getter=_custom_getter):
-      yield
-
-  @contextlib.contextmanager
-  def initializing_scope(self):
-    """Context manager to capture variable creations.
-
-    Forcibly initializes all created variables.
-
-    Yields:
-      nothing
-    """
-    # TODO(apassos) ignoring the regularizer and partitioner here; figure out
-    # how to deal with these.
-    def _custom_getter(  # pylint: disable=missing-docstring
-        getter=None,
-        name=None,
-        shape=None,
-        dtype=dtypes.float32,
-        initializer=None,
-        regularizer=None,
-        reuse=None,
-        trainable=None,
-        collections=None,
-        caching_device=None,  # pylint: disable=redefined-outer-name
-        partitioner=None,
-        validate_shape=True,
-        use_resource=None,
-        aggregation=variable_scope.VariableAggregation.NONE,
-        synchronization=variable_scope.VariableSynchronization.AUTO):
-      del getter, regularizer, collections, caching_device, partitioner
-      del use_resource, validate_shape, aggregation, synchronization
-      if name in self.tf_variables:
-        if reuse:
-          return self.tf_variables[name].initialized_value()
-        else:
-          raise ValueError("Specified reuse=%s but tried to reuse variables."
-                           % reuse)
-      # TODO(apassos): ensure this is on the same device as above
-      v = _CapturedVariable(name, initializer, shape, dtype, trainable)
-      self.variables[name] = v
-
-      graph_mode_resource = v.variable.handle
-      if initializer is None:
-        initializer = _default_initializer(name, shape, dtype)
-      resource_variable_ops.shape_safe_assign_variable_handle(
-          graph_mode_resource, v.variable.shape, initializer(shape, dtype))
-      return v.variable
-
-    scope = variable_scope.get_variable_scope()
-    with variable_scope.variable_scope(scope, custom_getter=_custom_getter):
-      yield
-
-
-class _InitializingFunctionObject(object):
-  """Responsible for deciding which version of func-to-object to call.
-
-  call_fn is the version which calls the function with the current values of the
-  variables and init_fn is the version which calls the function to initialize
-  all variables.
-
-  TODO(apassos): figure out a way to support initializing only _some_
-  variables. This requires a way to pull out a variable's initialization code
-  from the graph, which might not be possible in general.
-  """
-
-  def __init__(self, call_fn, init_fn, shape_and_dtypes):
-    self._init_fn = init_fn
-    self._call_fn = call_fn
-    self.shape_and_dtypes = shape_and_dtypes
-    self.flattened_shapes = [tensor_shape.as_shape(sd.shape) for sd in
-                             nest.flatten(self.shape_and_dtypes)]
-
-  @property
-  def variables(self):
-    return self._call_fn.variables
-
-  def __call__(self, *args):
-    nest.assert_same_structure(self.shape_and_dtypes, args, check_types=False)
-    if not all([
-        shape.is_compatible_with(arg.shape)
-        for shape, arg in zip(self.flattened_shapes, nest.flatten(args))
-    ]):
-      raise ValueError(
-          "Declared shapes do not match argument shapes: Expected %s, found %s."
-          % (self.flattened_shapes, [arg.shape for arg in nest.flatten(args)]))
-
-    initialized = [resource_variable_ops.var_is_initialized_op(
-        v.handle).numpy() for v in self._call_fn.variables]
-    if all(x for x in initialized):
-      for v in self._call_fn.variables:
-        if v.trainable:
-          tape.watch_variable(v)
-      return self._call_fn(*args)
-    elif all(not x for x in initialized):
-      return self._init_fn(*args)
-    else:
-      raise ValueError("Some, but not all, variables are initialized.")
-
-
-def _get_graph_callable_inputs(shape_and_dtypes):
-  """Maps specified shape_and_dtypes to graph inputs."""
-  ret = []
-  for x in shape_and_dtypes:
-    if isinstance(x, ShapeAndDtype):
-      ret.append(array_ops.placeholder(x.dtype, x.shape))
-    elif isinstance(x, (tuple, list)):
-      ret.append(_get_graph_callable_inputs(x))
-    else:
-      raise errors.InvalidArgumentError(
-          None, None, "Expected the argument to @graph_callable to be a "
-          "(possibly nested) list or tuple of ShapeAndDtype objects, "
-          "but got an object of type: %s" % type(x))
-
-  return tuple(ret) if isinstance(shape_and_dtypes, tuple) else ret
-
-
-def _graph_callable_internal(func, shape_and_dtypes):
-  """Defines and returns a template version of func.
-
-  Under the hood we make two function objects, each wrapping a different version
-  of the graph-mode code. One version immediately runs variable initialization
-  before making the variable's Tensors available for use, while the other
-  version replaces the Variables with placeholders which become function
-  arguments and get the current variable's value.
-
-  Limitations in (2) and (4) are because this does not implement a graph-mode
-  Variable class which has a convert_to_tensor(as_ref=True) method and a
-  initialized_value method. This is fixable.
-
-  Args:
-    func: The tfe Python function to compile.
-    shape_and_dtypes: A possibly nested list or tuple of ShapeAndDtype objects.
-
-  Raises:
-    ValueError: If any one of func's outputs is not a Tensor.
-
-  Returns:
-    Callable graph object.
-  """
-  container = tf_ops.get_default_graph()._container  # pylint: disable=protected-access
-  graph_key = tf_ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  with context.graph_mode():
-    # This graph will store both the initialization and the call version of the
-    # wrapped function. It will later be used by the backprop code to build the
-    # backprop graph, if necessary.
-    tmp_graph = function.CapturingGraph()
-    # Inherit the graph key from the original graph to ensure optimizers don't
-    # misbehave.
-    tmp_graph._container = container  # pylint: disable=protected-access
-    tmp_graph._graph_key = graph_key  # pylint: disable=protected-access
-    with tmp_graph.as_default():
-      # Placeholders for the non-variable inputs.
-      func_inputs = _get_graph_callable_inputs(shape_and_dtypes)
-      func_num_args = len(tf_inspect.getfullargspec(func).args)
-      if len(func_inputs) != func_num_args:
-        raise TypeError("The number of arguments accepted by the decorated "
-                        "function `%s` (%d) must match the number of "
-                        "ShapeAndDtype objects passed to the graph_callable() "
-                        "decorator (%d)." %
-                        (func.__name__, func_num_args, len(func_inputs)))
-
-      # First call the function to generate a graph which can initialize all
-      # variables. As a side-effect this will populate the variable capturing
-      # scope's view of which variables exist.
-      variable_captures = _VariableCapturingScope()
-      with variable_captures.initializing_scope(
-          ), function.AutomaticControlDependencies() as a:
-        func_outputs = func(*func_inputs)
-        outputs_list = nest.flatten(func_outputs)
-        for i, x in enumerate(outputs_list):
-          if x is not None:
-            outputs_list[i] = a.mark_as_return(x)
-      if len(outputs_list) == 1 and outputs_list[0] is None:
-        outputs_list = []
-      output_shapes = [x.shape for x in outputs_list]
-      if not all(isinstance(x, tf_ops.Tensor) for x in outputs_list):
-        raise ValueError("Found non-tensor output in %s" % str(outputs_list))
-      initializing_operations = tmp_graph.get_operations()
-
-      # Call the function again, now replacing usages of variables with
-      # placeholders. This assumes the variable capturing scope created above
-      # knows about all variables.
-      tmp_graph.clear_resource_control_flow_state()
-      with variable_captures.capturing_scope(
-          ), function.AutomaticControlDependencies() as a:
-        captured_outputs = func(*func_inputs)
-      captured_outlist = nest.flatten(captured_outputs)
-      for i, x in enumerate(captured_outlist):
-        if x is not None:
-          captured_outlist[i] = a.mark_as_return(x)
-      capturing_operations = tmp_graph.get_operations()[
-          len(initializing_operations):]
-
-  sorted_variables = sorted(variable_captures.variables.values(),
-                            key=lambda x: x.name)
-
-  extra_inputs = tmp_graph.captures.keys()
-  extra_placeholders = tmp_graph.captures.values()
-
-  flat_inputs = [x for x in nest.flatten(func_inputs)
-                 if isinstance(x, tf_ops.Tensor)]
-  placeholder_inputs = flat_inputs+ list(extra_placeholders)
-
-  func_def_outputs = [x for x in outputs_list if isinstance(x, tf_ops.Tensor)]
-  initialization_name = function._inference_name(func.__name__)  # pylint: disable=protected-access
-  # TODO(ashankar): Oh lord, forgive me for this lint travesty.
-  # Also, what about the gradient registry of these functions? Those need to be
-  # addressed as well.
-  for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    function._register(f._c_func.func)  # pylint: disable=protected-access
-  initializer_function = function.GraphModeFunction(
-      initialization_name,
-      placeholder_inputs,
-      extra_inputs,
-      tmp_graph,
-      initializing_operations,
-      func_def_outputs,
-      func_outputs,
-      output_shapes)
-
-  capture_func_def_outputs = [
-      x for x in captured_outlist if isinstance(x, tf_ops.Tensor)]
-  captured_function_name = function._inference_name(func.__name__)  # pylint: disable=protected-access
-  captured_function = function.GraphModeFunction(
-      captured_function_name,
-      placeholder_inputs,
-      extra_inputs,
-      tmp_graph,
-      capturing_operations,
-      capture_func_def_outputs,
-      captured_outputs,
-      output_shapes,
-      variables=[x.variable for x in sorted_variables])
-
-  return _InitializingFunctionObject(captured_function, initializer_function,
-                                     shape_and_dtypes)
-
-
-class ShapeAndDtype(object):
-  """Data type that packages together shape and type information.
-
-  Used for arguments to graph callables. See graph_callable() for an example.
-  """
-
-  def __init__(self, shape, dtype):
-    self.shape = shape
-    self.dtype = dtype
-
-
-def graph_callable(shape_and_dtypes):
-  """Decorator that produces a callable that executes a TensorFlow graph.
-
-  When applied on a function that constructs a TensorFlow graph, this decorator
-  produces a callable object that:
-
-  1. Executes the graph when invoked. The first call will initialize any
-     variables defined in the graph.
-
-  2. Provides a .variables() method to return the list of TensorFlow variables
-     defined in the graph.
-
-  Note that the wrapped function is not allowed to change the values of the
-  variables, just use them.
-
-  The return value of the wrapped function must be one of the following:
-  (1) None,  (2) a Tensor, or (3) a possibly nested sequence of Tensors.
-
-  Example:
-
-  ```python
-  @tfe.graph_callable([tfe.ShapeAndDtype(shape(), dtype=dtypes.float32)])
-  def foo(x):
-    v = tf.get_variable('v', initializer=tf.ones_initializer(), shape=())
-    return v + x
-
-  ret = foo(tfe.Tensor(2.0))  # `ret` here is a Tensor with value 3.0.
-
-  foo.variables[0].assign(7.0)  # Modify the value of variable `v`.
-  ret = foo(tfe.Tensor(2.0))  # `ret` here now is a Tensor with value 9.0.
-  ```
-  Args:
-    shape_and_dtypes: A possibly nested list or tuple of ShapeAndDtype objects
-      that specifies shape and type information for each of the callable's
-      arguments. The length of this list must be equal to the number of
-      arguments accepted by the wrapped function.
-
-  Returns:
-    A callable graph object.
-  """
-  # TODO(alive,apassos): support initialized_value and friends from tf.Variable.
-  assert context.executing_eagerly(), (
-      "graph_callable can only be used when Eager execution is enabled.")
-  def decorator(func):
-    return tf_decorator.make_decorator(func,
-                                       _graph_callable_internal(
-                                           func, shape_and_dtypes))
-
-  return decorator
diff --git a/tensorflow/python/eager/graph_callable_test.py b/tensorflow/python/eager/graph_callable_test.py
deleted file mode 100644
index b9e6ca2a93..0000000000
--- a/tensorflow/python/eager/graph_callable_test.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.eager import backprop
-from tensorflow.python.eager import graph_callable
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-
-
-class GraphCallableTest(test.TestCase):
-
-  def testBasic(self):
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)])
-    def my_function(x):
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.zeros_initializer(), shape=())
-      return v + x
-
-    self.assertEqual(
-        2, my_function(constant_op.constant(2, dtype=dtypes.float32)).numpy())
-
-    my_function.variables[0].assign(1.)
-    self.assertEqual(
-        3, my_function(constant_op.constant(2, dtype=dtypes.float32)).numpy())
-
-  def testFunctionWithoutReturnValue(self):
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)])
-    def my_function(x):
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.zeros_initializer(), shape=())
-      v.assign(x)
-
-    my_function(constant_op.constant(4, dtype=dtypes.float32))
-    self.assertAllEqual(4, my_function.variables[0].read_value())
-
-  def testFunctionWithoutReturnValueAndArgs(self):
-
-    @graph_callable.graph_callable([])
-    def my_function():
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.zeros_initializer(), shape=())
-      v.assign(4)
-
-    my_function()
-    self.assertAllEqual(4, my_function.variables[0].read_value())
-
-  def testVariableAPI(self):
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)])
-    def my_function(x):
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.zeros_initializer(), shape=())
-      return v.read_value() + x
-
-    self.assertEqual(
-        2, my_function(constant_op.constant(2, dtype=dtypes.float32)).numpy())
-
-    my_function.variables[0].assign(1.)
-    self.assertEqual(
-        3, my_function(constant_op.constant(2, dtype=dtypes.float32)).numpy())
-
-  def testTensorShape(self):
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(1), dtype=dtypes.float32)])
-    def my_function(x):
-      _ = x.get_shape()
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.zeros_initializer(), shape=[x.shape[0]])
-      self.assertEqual(v.shape[0], x.shape[0])
-      return v + x
-
-    self.assertEqual([2.],
-                     my_function(
-                         constant_op.constant([2.],
-                                              dtype=dtypes.float32)).numpy())
-
-  def testUpdatesAreOrdered(self):
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)])
-    def my_function(x):
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.zeros_initializer(), shape=())
-      v.assign(x + 1)
-      v.assign(v * x)
-      return v.read_value()
-
-    self.assertAllEqual(my_function(constant_op.constant(2.0)), 6.0)
-
-  def testEmptyInitializer(self):
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(1), dtype=dtypes.float32)])
-    def my_function(x):
-      v = variable_scope.get_variable("v", shape=[1])
-      return x + 0 * v
-
-    self.assertEqual([2.],
-                     my_function(
-                         constant_op.constant([2.],
-                                              dtype=dtypes.float32)).numpy())
-
-  def testMismatchingNumArgs(self):
-    # pylint: disable=anomalous-backslash-in-string
-    with self.assertRaisesRegexp(TypeError,
-                                 "The number of arguments accepted by the "
-                                 "decorated function `my_function` \(2\) must "
-                                 "match the number of ShapeAndDtype objects "
-                                 "passed to the graph_callable\(\) decorator "
-                                 "\(1\)."):
-      @graph_callable.graph_callable([
-          graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)])
-      def my_function(x, y):  # pylint: disable=unused-variable
-        return x + y
-    # pylint: enable=anomalous-backslash-in-string
-
-  def testPureFunction(self):
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.int32)])
-    def f(x):
-      return math_ops.add(x, constant_op.constant(3))
-
-    self.assertAllEqual(5, f(constant_op.constant(2)))
-
-  def testNestedFunction(self):
-    # TensorFlow function (which is what would be used in TensorFlow graph
-    # construction).
-    @function.Defun(dtypes.int32, dtypes.int32)
-    def add(a, b):
-      return math_ops.add(a, b)
-
-    # A graph_callable that will invoke the TensorFlow function.
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.int32)])
-    def add_one(x):
-      return add(x, 1)
-
-    self.assertAllEqual(3, add_one(constant_op.constant(2)))
-
-  # TODO(ashankar): Make this work.
-  # The problem is that the two graph_callables (for add_one and add_two)
-  # are both trying to register the FunctionDef corresponding to "add".
-  def DISABLED_testRepeatedUseOfSubFunction(self):
-
-    @function.Defun(dtypes.int32, dtypes.int32)
-    def add(a, b):
-      return math_ops.add(a, b)
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.int32)])
-    def add_one(x):
-      return add(x, 1)
-
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.int32)])
-    def add_two(x):
-      return add(x, 2)
-
-    two = constant_op.constant(2)
-    self.assertAllEqual(3, add_one(two))
-    self.assertAllEqual(4, add_two(two))
-
-  def testNestedSequenceInputs(self):
-    sd = graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)
-    @graph_callable.graph_callable([[sd, tuple([sd, sd]), sd]])
-    def my_op(inputs):
-      a, b, c = inputs
-      e, f = b
-      v = variable_scope.get_variable(
-          "my_v", initializer=init_ops.zeros_initializer(), shape=())
-      return [a + a + v, tuple([e + e, f + f]), c + c], a + e + f + c + v
-
-    inputs = [constant_op.constant(1.),
-              [constant_op.constant(2.), constant_op.constant(3.)],
-              constant_op.constant(4.)]
-    ret = my_op(inputs)
-    self.assertEqual(len(ret), 2.)
-    self.assertAllEqual(ret[1], 10.)
-
-    my_op.variables[0].assign(1.)
-    ret = my_op(inputs)
-    self.assertAllEqual(ret[1], 11.)
-
-  def testVariableShapeIsTensorShape(self):
-    @graph_callable.graph_callable([])
-    def my_function():
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.zeros_initializer(), shape=())
-      self.assertIsInstance(v.get_shape(), tensor_shape.TensorShape)
-
-    my_function()
-
-  def testIncorrectlyShapedInputs(self):
-    @graph_callable.graph_callable(
-        [graph_callable.ShapeAndDtype(shape=(3), dtype=dtypes.float32)])
-    def my_function(x):
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.zeros_initializer(), shape=())
-      return v + x
-
-    with self.assertRaises(ValueError):
-      my_function([1, 2])
-
-    self.assertTrue(([1, 2, 3] == my_function(
-        constant_op.constant([1, 2, 3], dtype=dtypes.float32)).numpy()).all())
-
-  def testGradients(self):
-    @graph_callable.graph_callable([])
-    def my_function():
-      v = variable_scope.get_variable(
-          "v", initializer=init_ops.constant_initializer(3.), shape=())
-      return v * v
-
-    grad_fn = backprop.implicit_grad(my_function)
-    grads_and_vars = list(zip(*grad_fn()))
-    self.assertAllEqual(6., grads_and_vars[0][0])
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 15d2ccf9d2..c12bf89f8f 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -800,9 +800,6 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   EagerTensorType = &_EagerTensorType;
   Py_INCREF(EagerTensorType);
 #endif
-  // We disable instance based attribute lookup. Its not clear if these
-  // dictionaries are correctly initialized in the first place.
-  EagerTensorType->tp_dictoffset = 0;
   return reinterpret_cast<PyObject*>(EagerTensorType);
 }
 
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 8b423f76de..ef7c217190 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -404,18 +404,21 @@ class _EnsembleGrower(object):
     training_ops.append(grow_op)
   """
 
-  def __init__(self, tree_ensemble, tree_hparams):
+  def __init__(self, tree_ensemble, tree_hparams, feature_ids_list):
     """Initializes a grower object.
 
     Args:
       tree_ensemble: A TreeEnsemble variable.
       tree_hparams: TODO. collections.namedtuple for hyper parameters.
+      feature_ids_list: a list of lists of feature ids for each bucket size.
+
     Raises:
       ValueError: when pruning mode is invalid or pruning is used and no tree
       complexity is set.
     """
     self._tree_ensemble = tree_ensemble
     self._tree_hparams = tree_hparams
+    self._feature_ids_list = feature_ids_list
     # pylint: disable=protected-access
     self._pruning_mode_parsed = boosted_trees_ops.PruningMode.from_str(
         tree_hparams.pruning_mode)
@@ -440,14 +443,12 @@ class _EnsembleGrower(object):
     """
 
   @abc.abstractmethod
-  def grow_tree(self, stats_summaries_list, feature_ids_list,
-                last_layer_nodes_range):
+  def grow_tree(self, stats_summaries_list, last_layer_nodes_range):
     """Grows a tree, if ready, based on provided statistics.
 
     Args:
       stats_summaries_list: List of stats summary tensors, representing sums of
         gradients and hessians for each feature bucket.
-      feature_ids_list: a list of lists of feature ids for each bucket size.
       last_layer_nodes_range: A tensor representing ids of the nodes in the
         current layer, to be split.
 
@@ -455,6 +456,10 @@ class _EnsembleGrower(object):
       An op for growing a tree.
     """
 
+  def chief_init_op(self):
+    """Ops that chief needs to run to initialize the state."""
+    return control_flow_ops.no_op()
+
   #  ============= Helper methods ===========
 
   def _center_bias_fn(self, center_bias_var, mean_gradients, mean_hessians):
@@ -468,7 +473,7 @@ class _EnsembleGrower(object):
     return center_bias_var.assign(continue_centering)
 
   def _grow_tree_from_stats_summaries(self, stats_summaries_list,
-                                      feature_ids_list, last_layer_nodes_range):
+                                      last_layer_nodes_range):
     """Updates ensemble based on the best gains from stats summaries."""
     node_ids_per_feature = []
     gains_list = []
@@ -476,11 +481,11 @@ class _EnsembleGrower(object):
     left_node_contribs_list = []
     right_node_contribs_list = []
     all_feature_ids = []
-    assert len(stats_summaries_list) == len(feature_ids_list)
+    assert len(stats_summaries_list) == len(self._feature_ids_list)
 
     max_splits = _get_max_splits(self._tree_hparams)
 
-    for i, feature_ids in enumerate(feature_ids_list):
+    for i, feature_ids in enumerate(self._feature_ids_list):
       (numeric_node_ids_per_feature, numeric_gains_list,
        numeric_thresholds_list, numeric_left_node_contribs_list,
        numeric_right_node_contribs_list) = (
@@ -516,12 +521,13 @@ class _EnsembleGrower(object):
 
 
 class _InMemoryEnsembleGrower(_EnsembleGrower):
-  """A base class for ensemble growers."""
+  """An in-memory ensemble grower."""
 
-  def __init__(self, tree_ensemble, tree_hparams):
+  def __init__(self, tree_ensemble, tree_hparams, feature_ids_list):
 
     super(_InMemoryEnsembleGrower, self).__init__(
-        tree_ensemble=tree_ensemble, tree_hparams=tree_hparams)
+        tree_ensemble=tree_ensemble, tree_hparams=tree_hparams,
+        feature_ids_list=feature_ids_list)
 
   def center_bias(self, center_bias_var, gradients, hessians):
     # For in memory, we already have a full batch of gradients and hessians,
@@ -531,83 +537,98 @@ class _InMemoryEnsembleGrower(_EnsembleGrower):
     mean_heassians = array_ops.expand_dims(math_ops.reduce_mean(hessians, 0), 0)
     return self._center_bias_fn(center_bias_var, mean_gradients, mean_heassians)
 
-  def grow_tree(self, stats_summaries_list, feature_ids_list,
-                last_layer_nodes_range):
+  def grow_tree(self, stats_summaries_list, last_layer_nodes_range):
     # For in memory, we already have full data in one batch, so we can grow the
     # tree immediately.
     return self._grow_tree_from_stats_summaries(
-        stats_summaries_list, feature_ids_list, last_layer_nodes_range)
+        stats_summaries_list, last_layer_nodes_range)
 
 
 class _AccumulatorEnsembleGrower(_EnsembleGrower):
-  """A base class for ensemble growers."""
+  """An accumulator based ensemble grower."""
 
   def __init__(self, tree_ensemble, tree_hparams, stamp_token,
-               n_batches_per_layer, bucket_size_list, is_chief):
+               n_batches_per_layer, bucket_size_list, is_chief, center_bias,
+               feature_ids_list):
     super(_AccumulatorEnsembleGrower, self).__init__(
-        tree_ensemble=tree_ensemble, tree_hparams=tree_hparams)
+        tree_ensemble=tree_ensemble, tree_hparams=tree_hparams,
+        feature_ids_list=feature_ids_list)
     self._stamp_token = stamp_token
     self._n_batches_per_layer = n_batches_per_layer
     self._bucket_size_list = bucket_size_list
     self._is_chief = is_chief
+    self._growing_accumulators = []
+    self._chief_init_ops = []
+    max_splits = _get_max_splits(self._tree_hparams)
+    for i, feature_ids in enumerate(self._feature_ids_list):
+      accumulator = data_flow_ops.ConditionalAccumulator(
+          dtype=dtypes.float32,
+          # The stats consist of grads and hessians (the last dimension).
+          shape=[len(feature_ids), max_splits, self._bucket_size_list[i], 2],
+          shared_name='numeric_stats_summary_accumulator_' + str(i))
+      self._chief_init_ops.append(
+          accumulator.set_global_step(self._stamp_token))
+      self._growing_accumulators.append(accumulator)
+    self._center_bias = center_bias
+    if center_bias:
+      self._bias_accumulator = data_flow_ops.ConditionalAccumulator(
+          dtype=dtypes.float32,
+          # The stats consist of grads and hessians means only.
+          # TODO(nponomareva): this will change for a multiclass
+          shape=[2, 1],
+          shared_name='bias_accumulator')
+      self._chief_init_ops.append(
+          self._bias_accumulator.set_global_step(self._stamp_token))
 
   def center_bias(self, center_bias_var, gradients, hessians):
     # For not in memory situation, we need to accumulate enough of batches first
     # before proceeding with centering bias.
 
     # Create an accumulator.
+    if not self._center_bias:
+      raise RuntimeError('center_bias called but bias centering is disabled.')
     bias_dependencies = []
-    bias_accumulator = data_flow_ops.ConditionalAccumulator(
-        dtype=dtypes.float32,
-        # The stats consist of grads and hessians means only.
-        # TODO(nponomareva): this will change for a multiclass
-        shape=[2, 1],
-        shared_name='bias_accumulator')
-
     grads_and_hess = array_ops.stack([gradients, hessians], axis=0)
     grads_and_hess = math_ops.reduce_mean(grads_and_hess, axis=1)
 
-    apply_grad = bias_accumulator.apply_grad(grads_and_hess, self._stamp_token)
+    apply_grad = self._bias_accumulator.apply_grad(
+        grads_and_hess, self._stamp_token)
     bias_dependencies.append(apply_grad)
 
     # Center bias if enough batches were processed.
     with ops.control_dependencies(bias_dependencies):
       if not self._is_chief:
         return control_flow_ops.no_op()
+      def _set_accumulators_stamp():
+        return control_flow_ops.group(
+            [acc.set_global_step(self._stamp_token + 1) for acc in
+             self._growing_accumulators])
 
       def center_bias_from_accumulator():
-        accumulated = array_ops.unstack(bias_accumulator.take_grad(1), axis=0)
-        return self._center_bias_fn(center_bias_var,
-                                    array_ops.expand_dims(accumulated[0], 0),
-                                    array_ops.expand_dims(accumulated[1], 0))
+        accumulated = array_ops.unstack(self._bias_accumulator.take_grad(1),
+                                        axis=0)
+        center_bias_op = self._center_bias_fn(
+            center_bias_var,
+            array_ops.expand_dims(accumulated[0], 0),
+            array_ops.expand_dims(accumulated[1], 0))
+        with ops.control_dependencies([center_bias_op]):
+          return control_flow_ops.cond(center_bias_var,
+                                       control_flow_ops.no_op,
+                                       _set_accumulators_stamp)
 
       center_bias_op = control_flow_ops.cond(
-          math_ops.greater_equal(bias_accumulator.num_accumulated(),
+          math_ops.greater_equal(self._bias_accumulator.num_accumulated(),
                                  self._n_batches_per_layer),
           center_bias_from_accumulator,
           control_flow_ops.no_op,
           name='wait_until_n_batches_for_bias_accumulated')
       return center_bias_op
 
-  def grow_tree(self, stats_summaries_list, feature_ids_list,
-                last_layer_nodes_range):
-    # For not in memory situation, we need to accumulate enough of batches first
-    # before proceeding with building a tree layer.
-    max_splits = _get_max_splits(self._tree_hparams)
-
-    # Prepare accumulators.
-    accumulators = []
+  def grow_tree(self, stats_summaries_list, last_layer_nodes_range):
     dependencies = []
-    for i, feature_ids in enumerate(feature_ids_list):
+    for i in range(len(self._feature_ids_list)):
       stats_summaries = stats_summaries_list[i]
-      accumulator = data_flow_ops.ConditionalAccumulator(
-          dtype=dtypes.float32,
-          # The stats consist of grads and hessians (the last dimension).
-          shape=[len(feature_ids), max_splits, self._bucket_size_list[i], 2],
-          shared_name='numeric_stats_summary_accumulator_' + str(i))
-      accumulators.append(accumulator)
-
-      apply_grad = accumulator.apply_grad(
+      apply_grad = self._growing_accumulators[i].apply_grad(
           array_ops.stack(stats_summaries, axis=0), self._stamp_token)
       dependencies.append(apply_grad)
 
@@ -617,7 +638,8 @@ class _AccumulatorEnsembleGrower(_EnsembleGrower):
         return control_flow_ops.no_op()
 
       min_accumulated = math_ops.reduce_min(
-          array_ops.stack([acc.num_accumulated() for acc in accumulators]))
+          array_ops.stack([acc.num_accumulated() for acc in
+                           self._growing_accumulators]))
 
       def grow_tree_from_accumulated_summaries_fn():
         """Updates tree with the best layer from accumulated summaries."""
@@ -625,10 +647,11 @@ class _AccumulatorEnsembleGrower(_EnsembleGrower):
         stats_summaries_list = []
         stats_summaries_list = [
             array_ops.unstack(accumulator.take_grad(1), axis=0)
-            for accumulator in accumulators
+            for accumulator in self._growing_accumulators
         ]
         grow_op = self._grow_tree_from_stats_summaries(
-            stats_summaries_list, feature_ids_list, last_layer_nodes_range)
+            stats_summaries_list, last_layer_nodes_range
+        )
         return grow_op
 
       grow_model = control_flow_ops.cond(
@@ -638,6 +661,10 @@ class _AccumulatorEnsembleGrower(_EnsembleGrower):
           name='wait_until_n_batches_accumulated')
       return grow_model
 
+  def chief_init_op(self):
+    """Ops that chief needs to run to initialize the state."""
+    return control_flow_ops.group(self._chief_init_ops)
+
 
 def _bt_model_fn(
     features,
@@ -683,29 +710,50 @@ def _bt_model_fn(
   Raises:
     ValueError: mode or params are invalid, or features has the wrong type.
   """
-  is_single_machine = (config.num_worker_replicas <= 1)
   sorted_feature_columns = sorted(feature_columns, key=lambda tc: tc.name)
-  center_bias = tree_hparams.center_bias
-
-  if train_in_memory:
-    assert n_batches_per_layer == 1, (
-        'When train_in_memory is enabled, input_fn should return the entire '
-        'dataset as a single batch, and n_batches_per_layer should be set as '
-        '1.')
-    if (not config.is_chief or config.num_worker_replicas > 1 or
-        config.num_ps_replicas > 0):
-      raise ValueError('train_in_memory is supported only for '
-                       'non-distributed training.')
-  worker_device = control_flow_ops.no_op().device
-  train_op = []
   with ops.name_scope(name) as name:
     # Prepare.
     global_step = training_util.get_or_create_global_step()
     bucket_size_list, feature_ids_list = _group_features_by_num_buckets(
         sorted_feature_columns)
+    # Create Ensemble resources.
+    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
+
+    # Create logits.
+    if mode != model_fn.ModeKeys.TRAIN:
+      input_feature_list = _get_transformed_features(features,
+                                                     sorted_feature_columns)
+      logits = boosted_trees_ops.predict(
+          # For non-TRAIN mode, ensemble doesn't change after initialization,
+          # so no local copy is needed; using tree_ensemble directly.
+          tree_ensemble_handle=tree_ensemble.resource_handle,
+          bucketized_features=input_feature_list,
+          logits_dimension=head.logits_dimension)
+      return head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=control_flow_ops.no_op,
+          logits=logits)
+
+    # ============== Training graph ==============
+    center_bias = tree_hparams.center_bias
+    is_single_machine = (config.num_worker_replicas <= 1)
+
+    if train_in_memory:
+      assert n_batches_per_layer == 1, (
+          'When train_in_memory is enabled, input_fn should return the entire '
+          'dataset as a single batch, and n_batches_per_layer should be set as '
+          '1.')
+      if (not config.is_chief or config.num_worker_replicas > 1 or
+          config.num_ps_replicas > 0):
+        raise ValueError('train_in_memory is supported only for '
+                         'non-distributed training.')
+    worker_device = control_flow_ops.no_op().device
+    train_op = []
     # Extract input features and set up cache for training.
     training_state_cache = None
-    if mode == model_fn.ModeKeys.TRAIN and train_in_memory:
+    if train_in_memory:
       # cache transformed features as well for in-memory training.
       batch_size = array_ops.shape(labels)[0]
       input_feature_list, input_cache_op = (
@@ -717,65 +765,62 @@ def _bt_model_fn(
     else:
       input_feature_list = _get_transformed_features(features,
                                                      sorted_feature_columns)
-      if mode == model_fn.ModeKeys.TRAIN and example_id_column_name:
+      if example_id_column_name:
         example_ids = features[example_id_column_name]
         training_state_cache = _CacheTrainingStatesUsingHashTable(
             example_ids, head.logits_dimension)
+    if training_state_cache:
+      cached_tree_ids, cached_node_ids, cached_logits = (
+          training_state_cache.lookup())
+    else:
+      # Always start from the beginning when no cache is set up.
+      batch_size = array_ops.shape(labels)[0]
+      cached_tree_ids, cached_node_ids, cached_logits = (
+          array_ops.zeros([batch_size], dtype=dtypes.int32),
+          _DUMMY_NODE_ID * array_ops.ones([batch_size], dtype=dtypes.int32),
+          array_ops.zeros(
+              [batch_size, head.logits_dimension], dtype=dtypes.float32))
 
-    # Create Ensemble resources.
-    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-    # Variable that determines whether bias centering is needed.
-    center_bias_var = variable_scope.variable(
-        initial_value=center_bias, name='center_bias_needed', trainable=False)
-    # Create logits.
-    if mode != model_fn.ModeKeys.TRAIN:
-      logits = boosted_trees_ops.predict(
-          # For non-TRAIN mode, ensemble doesn't change after initialization,
-          # so no local copy is needed; using tree_ensemble directly.
-          tree_ensemble_handle=tree_ensemble.resource_handle,
+    if is_single_machine:
+      local_tree_ensemble = tree_ensemble
+      ensemble_reload = control_flow_ops.no_op()
+    else:
+      # Have a local copy of ensemble for the distributed setting.
+      with ops.device(worker_device):
+        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+            name=name + '_local', is_local=True)
+      # TODO(soroush): Do partial updates if this becomes a bottleneck.
+      ensemble_reload = local_tree_ensemble.deserialize(
+          *tree_ensemble.serialize())
+    with ops.control_dependencies([ensemble_reload]):
+      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
+       last_layer_nodes_range) = local_tree_ensemble.get_states()
+      partial_logits, tree_ids, node_ids = boosted_trees_ops.training_predict(
+          tree_ensemble_handle=local_tree_ensemble.resource_handle,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
           bucketized_features=input_feature_list,
           logits_dimension=head.logits_dimension)
+    logits = cached_logits + partial_logits
+
+    if train_in_memory:
+      grower = _InMemoryEnsembleGrower(tree_ensemble, tree_hparams,
+                                       feature_ids_list=feature_ids_list)
     else:
-      if is_single_machine:
-        local_tree_ensemble = tree_ensemble
-        ensemble_reload = control_flow_ops.no_op()
-      else:
-        # Have a local copy of ensemble for the distributed setting.
-        with ops.device(worker_device):
-          local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
-              name=name + '_local', is_local=True)
-        # TODO(soroush): Do partial updates if this becomes a bottleneck.
-        ensemble_reload = local_tree_ensemble.deserialize(
-            *tree_ensemble.serialize())
+      grower = _AccumulatorEnsembleGrower(tree_ensemble, tree_hparams,
+                                          stamp_token, n_batches_per_layer,
+                                          bucket_size_list, config.is_chief,
+                                          center_bias=center_bias,
+                                          feature_ids_list=feature_ids_list)
 
-      if training_state_cache:
-        cached_tree_ids, cached_node_ids, cached_logits = (
-            training_state_cache.lookup())
-      else:
-        # Always start from the beginning when no cache is set up.
-        batch_size = array_ops.shape(labels)[0]
-        cached_tree_ids, cached_node_ids, cached_logits = (
-            array_ops.zeros([batch_size], dtype=dtypes.int32),
-            _DUMMY_NODE_ID * array_ops.ones([batch_size], dtype=dtypes.int32),
-            array_ops.zeros(
-                [batch_size, head.logits_dimension], dtype=dtypes.float32))
-
-      with ops.control_dependencies([ensemble_reload]):
-        (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
-         last_layer_nodes_range) = local_tree_ensemble.get_states()
-        summary.scalar('ensemble/num_trees', num_trees)
-        summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
-        summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
-
-        partial_logits, tree_ids, node_ids = boosted_trees_ops.training_predict(
-            tree_ensemble_handle=local_tree_ensemble.resource_handle,
-            cached_tree_ids=cached_tree_ids,
-            cached_node_ids=cached_node_ids,
-            bucketized_features=input_feature_list,
-            logits_dimension=head.logits_dimension)
-
-      logits = cached_logits + partial_logits
+    summary.scalar('ensemble/num_trees', num_trees)
+    summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
+    summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
 
+    # Variable that determines whether bias centering is needed.
+    center_bias_var = variable_scope.variable(
+        initial_value=center_bias, name='center_bias_needed', trainable=False,
+        use_resource=True)
     # Create training graph.
     def _train_op_fn(loss):
       """Run one training iteration."""
@@ -814,24 +859,20 @@ def _bt_model_fn(
                 axis=0) for f in feature_ids
         ]
         stats_summaries_list.append(summaries)
-
-      if train_in_memory and is_single_machine:
-        grower = _InMemoryEnsembleGrower(tree_ensemble, tree_hparams)
+      if center_bias:
+        update_model = control_flow_ops.cond(
+            center_bias_var,
+            functools.partial(
+                grower.center_bias,
+                center_bias_var,
+                gradients,
+                hessians,
+            ),
+            functools.partial(grower.grow_tree, stats_summaries_list,
+                              last_layer_nodes_range))
       else:
-        grower = _AccumulatorEnsembleGrower(tree_ensemble, tree_hparams,
-                                            stamp_token, n_batches_per_layer,
-                                            bucket_size_list, config.is_chief)
-
-      update_model = control_flow_ops.cond(
-          center_bias_var,
-          functools.partial(
-              grower.center_bias,
-              center_bias_var,
-              gradients,
-              hessians,
-          ),
-          functools.partial(grower.grow_tree, stats_summaries_list,
-                            feature_ids_list, last_layer_nodes_range))
+        update_model = grower.grow_tree(stats_summaries_list,
+                                        last_layer_nodes_range)
       train_op.append(update_model)
 
       with ops.control_dependencies([update_model]):
@@ -846,15 +887,26 @@ def _bt_model_fn(
       labels=labels,
       train_op_fn=_train_op_fn,
       logits=logits)
-  if mode == model_fn.ModeKeys.TRAIN:
-    # Add an early stop hook.
-    estimator_spec = estimator_spec._replace(
-        training_hooks=estimator_spec.training_hooks +
-        (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
-                             tree_hparams.n_trees, tree_hparams.max_depth),))
+  # Add an early stop hook.
+  estimator_spec = estimator_spec._replace(
+      training_hooks=estimator_spec.training_hooks +
+      (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
+                           tree_hparams.n_trees, tree_hparams.max_depth),),
+      training_chief_hooks=[GrowerInitializationHook(grower.chief_init_op())] +
+      list(estimator_spec.training_chief_hooks))
   return estimator_spec
 
 
+class GrowerInitializationHook(session_run_hook.SessionRunHook):
+  """A SessionRunHook handles initialization of `_EnsembleGrower`."""
+
+  def __init__(self, init_op):
+    self._init_op = init_op
+
+  def after_create_session(self, session, coord):
+    session.run(self._init_op)
+
+
 def _create_classification_head(n_classes,
                                 weight_column=None,
                                 label_vocabulary=None):
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index ec597e4686..08026a93c5 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -173,6 +173,26 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
     self.assertAllClose(eval_res['accuracy'], 1.0)
 
+  def testTrainTwiceAndEvaluateBinaryClassifier(self):
+    input_fn = _make_train_input_fn(is_classification=True)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=5,
+        max_depth=10)
+
+    num_steps = 2
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    est.train(input_fn, steps=num_steps)
+
+    self._assert_checkpoint(
+        est.model_dir, global_step=num_steps * 2,
+        finalized_trees=0, attempted_layers=4)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+
   def testInferBinaryClassifier(self):
     train_input_fn = _make_train_input_fn(is_classification=True)
     predict_input_fn = numpy_io.numpy_input_fn(
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index da9a64c2bc..06593f9520 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -335,8 +335,8 @@ def _check_dense_labels_match_logits_and_reshape(
               'Expected labels dimension=%s.  Received %s. '
               'Suggested Fix:'
               'If your classifier expects one-hot encoding label,'
-              'check your n_classes argument to the estimator'
-              'and/or the shape of your label.'
+              'check your n_classes argument to the estimator '
+              'and/or the shape of your label. '
               'Otherwise, check the shape of your label.' %
               (expected_labels_dimension, dim1))
       expected_labels_shape = array_ops.concat(
diff --git a/tensorflow/python/estimator/canned/prediction_keys.py b/tensorflow/python/estimator/canned/prediction_keys.py
index 16890ec09a..daa275b46b 100644
--- a/tensorflow/python/estimator/canned/prediction_keys.py
+++ b/tensorflow/python/estimator/canned/prediction_keys.py
@@ -32,3 +32,4 @@ class PredictionKeys(object):
   LOGITS = 'logits'
   PREDICTIONS = 'predictions'
   PROBABILITIES = 'probabilities'
+  TOP_K = 'top_k'
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index ee3c3bba7b..3849188c58 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -120,7 +120,8 @@ class Estimator(object):
                warm_start_from=None):
     """Constructs an `Estimator` instance.
 
-    See @{$estimators} for more information. To warm-start an `Estimator`:
+    See [estimators](https://tensorflow.org/guide/estimators) for more information.
+    To warm-start an `Estimator`:
 
     ```python
     estimator = tf.estimator.DNNClassifier(
@@ -152,9 +153,9 @@ class Estimator(object):
           * `params`: Optional `dict` of hyperparameters.  Will receive what
                  is passed to Estimator in `params` parameter. This allows
                  to configure Estimators from hyper parameter tuning.
-          * `config`: Optional configuration object. Will receive what is passed
-                 to Estimator in `config` parameter, or the default `config`.
-                 Allows updating things in your `model_fn` based on
+          * `config`: Optional `estimator.RunConfig` object. Will receive what
+                 is passed to Estimator as its `config` parameter, or a default
+                 value. Allows setting up things in your `model_fn` based on
                  configuration such as `num_ps_replicas`, or `model_dir`.
 
         * Returns:
@@ -166,7 +167,7 @@ class Estimator(object):
         path will be resolved. If `None`, the model_dir in `config` will be used
         if set. If both are set, they must be same. If both are `None`, a
         temporary directory will be used.
-      config: Configuration object.
+      config: `estimator.RunConfig` configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
               Keys are names of parameters, values are basic python types.
       warm_start_from: Optional string filepath to a checkpoint or SavedModel to
@@ -184,8 +185,8 @@ class Estimator(object):
     """
     Estimator._assert_members_are_not_overridden(self)
 
-    config = maybe_overwrite_model_dir_and_session_config(config, model_dir)
-    self._config = config
+    self._config = maybe_overwrite_model_dir_and_session_config(config,
+                                                                model_dir)
 
     # The distribute field contains an instance of DistributionStrategy.
     self._train_distribution = self._config.train_distribute
@@ -285,8 +286,10 @@ class Estimator(object):
 
     Args:
       input_fn: A function that provides input data for training as minibatches.
-        See @{$premade_estimators#create_input_functions} for more information.
-        The function should construct and return one of the following:  * A
+        See [Premade
+        Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
+        the following:  * A
         `tf.data.Dataset` object: Outputs of `Dataset` object must be a tuple
         `(features, labels)` with same constraints as below. * A tuple
         `(features, labels)`: Where `features` is a `tf.Tensor` or a dictionary
@@ -321,6 +324,14 @@ class Estimator(object):
       ValueError: If both `steps` and `max_steps` are not `None`.
       ValueError: If either `steps` or `max_steps <= 0`.
     """
+    if self.config.task_type in (run_config.TaskType.EVALUATOR,
+                                 run_config.TaskType.PS):
+      raise ValueError(
+          'Train has been called wrong configuration. Please use '
+          'tf.estimator.train_and_evaluate which calls propper API according '
+          'to given configuration. Current configuration: {}.'.format(
+              self.config))
+
     with context.graph_mode():
       if (steps is not None) and (max_steps is not None):
         raise ValueError('Can not provide both steps and max_steps.')
@@ -394,7 +405,8 @@ class Estimator(object):
 
     Args:
       input_fn: A function that constructs the input data for evaluation. See
-        @{$premade_estimators#create_input_functions} for more information. The
+        [Premade Estimators](https://tensorflow.org/guide/premade#create_input_functions}
+        for more information. The
         function should construct and return one of the following:  * A
         `tf.data.Dataset` object: Outputs of `Dataset` object must be a tuple
         `(features, labels)` with same constraints as below. * A tuple
@@ -419,7 +431,11 @@ class Estimator(object):
     Returns:
       A dict containing the evaluation metrics specified in `model_fn` keyed by
       name, as well as an entry `global_step` which contains the value of the
-      global step for which this evaluation was performed.
+      global step for which this evaluation was performed. For canned
+      estimators, the dict contains the `loss` (mean loss per mini-batch) and
+      the `average_loss` (mean loss per sample). Canned classifiers also return
+      the `accuracy`. Canned regressors also return the `label/mean` and the
+      `prediction/mean`.
 
     Raises:
       ValueError: If `steps <= 0`.
@@ -450,9 +466,7 @@ class Estimator(object):
             output_dir=self.eval_dir(name))
 
       with ops.Graph().as_default():
-        # TODO(priyag): Support distributed eval on TPUs.
-        if (self._eval_distribution
-            and self._eval_distribution.__class__.__name__ != 'TPUStrategy'):
+        if self._eval_distribution:
           with self._eval_distribution.scope():
             return _evaluate()
         else:
@@ -478,8 +492,9 @@ class Estimator(object):
       input_fn: A function that constructs the features. Prediction continues
         until `input_fn` raises an end-of-input exception
         (`tf.errors.OutOfRangeError` or `StopIteration`).
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
+        See [Premade
+        Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
         the following:
 
           * A `tf.data.Dataset` object: Outputs of `Dataset` object must have
@@ -568,14 +583,10 @@ class Estimator(object):
       return
 
     allowed_overrides = set([
-        '_call_input_fn', '_call_model_fn',
-        '_convert_train_steps_to_hooks', '_convert_eval_steps_to_hooks',
-        '_create_global_step', '_create_and_assert_global_step',
+        '_create_and_assert_global_step',
         '_tf_api_names', '_tf_api_names_v1', '_estimator_api_names',
         '_estimator_api_names_v1', '_estimator_api_constants',
         '_estimator_api_constants_v1',
-        '_validate_features_in_predict_input',
-        '_add_meta_graph_for_mode'
     ])
     estimator_members = set([m for m in Estimator.__dict__.keys()
                              if not m.startswith('__')])
@@ -599,8 +610,7 @@ class Estimator(object):
     """Exports inference graph as a `SavedModel` into the given dir.
 
     For a detailed guide, see
-    @{$saved_model#using_savedmodel_with_estimators$Using SavedModel with
-    Estimators}.
+    [Using SavedModel with Estimators](https://tensorflow.org/guide/saved_model#using_savedmodel_with_estimators).
 
     This method builds a new graph by first calling the
     `serving_input_receiver_fn` to obtain feature `Tensor`s, and then calling
@@ -1024,16 +1034,21 @@ class Estimator(object):
                       'QueueRunner. That means predict yields forever. '
                       'This is probably a mistake.')
 
-  def _get_features_and_labels_from_input_fn(self, input_fn, mode,
-                                             distribution=None):
-    """Extracts the `features` and labels from return values of `input_fn`."""
+  def _get_iterator_from_input_fn(self, input_fn, mode, distribution=None):
     if distribution is not None:
       result = distribution.distribute_dataset(
           lambda: self._call_input_fn(input_fn, mode))
     else:
       result = self._call_input_fn(input_fn, mode)
 
-    return estimator_util.parse_input_fn_result(result)
+    iterator = result.make_initializable_iterator()
+    input_hooks = [estimator_util._DatasetInitializerHook(iterator)]  # pylint: disable=protected-access
+    return iterator, input_hooks
+
+  def _get_features_and_labels_from_input_fn(self, input_fn, mode):
+    """Extracts the `features` and labels from return values of `input_fn`."""
+    return estimator_util.parse_input_fn_result(
+        self._call_input_fn(input_fn, mode))
 
   def _extract_batch_length(self, preds_evaluated):
     """Extracts batch length of predictions."""
@@ -1226,32 +1241,24 @@ class Estimator(object):
       # We want to create the iterations variable outside the distribution scope
       # as that is just stored on the host and mainly used to drive the loop
       # and doesn't need to be a Mirrored/Device variable.
-      steps_per_run_variable = training.get_or_create_steps_per_run_variable()
+      if is_tpu_strategy:
+        steps_per_run_variable = training.get_or_create_steps_per_run_variable()
       with self._train_distribution.scope():
         random_seed.set_random_seed(self._config.tf_random_seed)
+        iterator, input_hooks = self._get_iterator_from_input_fn(
+            input_fn, model_fn_lib.ModeKeys.TRAIN, self._train_distribution)
+        worker_hooks.extend(input_hooks)
+        global_step_tensor = self._create_and_assert_global_step(g)
+        # we want to add to the global collection in the main thread not the
+        # tower threads.
+        ops.add_to_collection(
+            training_util.GLOBAL_STEP_READ_KEY,
+            self._train_distribution.read_var(global_step_tensor))
 
         if is_tpu_strategy:
-          # Create the iterator for run_on_dataset function
-          # TODO(sourabhbajaj): refactor this out to call a function on the
-          # strategy
-          dataset = self._train_distribution.distribute_dataset(
-              lambda: self._call_input_fn(input_fn,  # pylint: disable=g-long-lambda
-                                          model_fn_lib.ModeKeys.TRAIN))
-          iterator = dataset.make_initializable_iterator()
-          worker_hooks.append(
-              estimator_util._DatasetInitializerHook(iterator))  # pylint: disable=protected-access
-
-          global_step_tensor = self._create_and_assert_global_step(g)
-          # we want to add to the global collection in the main thread not the
-          # tower threads.
-          ops.add_to_collection(
-              training_util.GLOBAL_STEP_READ_KEY,
-              self._train_distribution.read_var(global_step_tensor))
-
           # Create a step_fn from the train_op of grouped_estimator_spec
-          def step_fn(ctx, inputs):
+          def step_fn(ctx, features, labels):
             """A single step that is passed to run_on_dataset."""
-            features, labels = inputs
             estimator_spec = self._train_distribution.call_for_each_tower(
                 self._call_model_fn,
                 features,
@@ -1272,26 +1279,22 @@ class Estimator(object):
               step_fn, iterator, iterations=steps_per_run_variable,
               initial_loop_values={'loss': initial_training_loss})
           distributed_train_op = ctx.run_op
-          tpu_result = ctx.last_step_outputs
+          loss = ctx.last_step_outputs['loss']
           grouped_estimator_spec = ctx.non_tensor_outputs['estimator_spec']
         else:
-          features, labels, input_hooks = (
-              self._get_features_and_labels_from_input_fn(
-                  input_fn, model_fn_lib.ModeKeys.TRAIN,
-                  self._train_distribution))
-          worker_hooks.extend(input_hooks)
-          global_step_tensor = self._create_and_assert_global_step(g)
-          # we want to add to the global collection in the main thread not the
-          # tower threads.
-          ops.add_to_collection(
-              training_util.GLOBAL_STEP_READ_KEY,
-              self._train_distribution.read_var(global_step_tensor))
+          features, labels = iterator.get_next()
           grouped_estimator_spec = self._train_distribution.call_for_each_tower(
               self._call_model_fn,
               features,
               labels,  # although this will be None it seems
               model_fn_lib.ModeKeys.TRAIN,
               self.config)
+          loss = self._train_distribution.unwrap(
+              self._train_distribution.reduce(
+                  distribute_lib.get_loss_reduction(),
+                  grouped_estimator_spec.loss,
+                  destinations='/device:CPU:0'))[0]
+          distributed_train_op = grouped_estimator_spec.train_op
 
         scaffold = _combine_distributed_scaffold(
             grouped_estimator_spec.scaffold, self._train_distribution)
@@ -1305,21 +1308,10 @@ class Estimator(object):
             grouped_estimator_spec.training_hooks)
         training_chief_hooks = get_hooks_from_the_first_device(
             grouped_estimator_spec.training_chief_hooks)
-
-        # TODO(sourabhbajaj): Merge the two code paths and clean up the code
-        if is_tpu_strategy:
-          loss = tpu_result['loss']
-          worker_hooks.append(
-              estimator_util.StrategyInitFinalizeHook(
-                  self._train_distribution.initialize,
-                  self._train_distribution.finalize))
-        else:
-          loss = self._train_distribution.unwrap(
-              self._train_distribution.reduce(
-                  distribute_lib.get_loss_reduction(),
-                  grouped_estimator_spec.loss,
-                  destinations='/device:CPU:0'))[0]
-          distributed_train_op = grouped_estimator_spec.train_op
+        worker_hooks.append(
+            estimator_util.StrategyInitFinalizeHook(
+                self._train_distribution.initialize,
+                self._train_distribution.finalize))
 
         estimator_spec = model_fn_lib.EstimatorSpec(
             mode=grouped_estimator_spec.mode,
@@ -1420,31 +1412,18 @@ class Estimator(object):
     """Builds the graph and related hooks to run evaluation."""
     random_seed.set_random_seed(self._config.tf_random_seed)
     self._create_and_assert_global_step(ops.get_default_graph())
-    features, labels, input_hooks = (
-        self._get_features_and_labels_from_input_fn(
-            input_fn, model_fn_lib.ModeKeys.EVAL, self._eval_distribution))
 
     if self._eval_distribution:
-      (loss_metric, scaffold, evaluation_hooks, eval_metric_ops) = (
-          self._call_model_fn_eval_distributed(features, labels, self.config))
+      (scaffold, evaluation_hooks, input_hooks, update_op, eval_dict) = (
+          self._call_model_fn_eval_distributed(input_fn, self.config))
     else:
-      (loss_metric, scaffold, evaluation_hooks, eval_metric_ops) = (
-          self._call_model_fn_eval(features, labels, self.config))
+      (scaffold, evaluation_hooks, input_hooks, update_op, eval_dict) = (
+          self._call_model_fn_eval(input_fn, self.config))
 
     global_step_tensor = training_util.get_global_step(ops.get_default_graph())
     # Call to warm_start has to be after model_fn is called.
     self._maybe_warm_start(checkpoint_path)
 
-    if model_fn_lib.LOSS_METRIC_KEY in eval_metric_ops:
-      raise ValueError(
-          'Metric with name "%s" is not allowed, because Estimator ' %
-          (model_fn_lib.LOSS_METRIC_KEY) +
-          'already defines a default metric with the same name.')
-    eval_metric_ops[model_fn_lib.LOSS_METRIC_KEY] = loss_metric
-
-    update_op, eval_dict = _extract_metric_update_ops(eval_metric_ops,
-                                                      self._eval_distribution)
-
     if ops.GraphKeys.GLOBAL_STEP in eval_dict:
       raise ValueError(
           'Metric with name `global_step` is not allowed, because Estimator '
@@ -1469,26 +1448,70 @@ class Estimator(object):
 
     return scaffold, update_op, eval_dict, all_hooks
 
-  def _call_model_fn_eval(self, features, labels, config):
+  def _call_model_fn_eval(self, input_fn, config):
+    """Call model_fn for evaluation and handle return values."""
+    features, labels, input_hooks = self._get_features_and_labels_from_input_fn(
+        input_fn, model_fn_lib.ModeKeys.EVAL)
+
     estimator_spec = self._call_model_fn(
         features, labels, model_fn_lib.ModeKeys.EVAL, config)
-    loss_metric = metrics_lib.mean(estimator_spec.loss)
-    return (loss_metric, estimator_spec.scaffold,
-            estimator_spec.evaluation_hooks, estimator_spec.eval_metric_ops)
+    eval_metric_ops = _verify_and_create_loss_metric(
+        estimator_spec.eval_metric_ops, estimator_spec.loss)
+    update_op, eval_dict = _extract_metric_update_ops(eval_metric_ops)
+    return (estimator_spec.scaffold, estimator_spec.evaluation_hooks,
+            input_hooks, update_op, eval_dict)
 
-  def _call_model_fn_eval_distributed(self, features, labels, config):
+  def _call_model_fn_eval_distributed(self, input_fn, config):
     """Call model_fn in distribution mode and handle return values."""
-    grouped_estimator_spec = self._eval_distribution.call_for_each_tower(
-        self._call_model_fn, features, labels,
-        model_fn_lib.ModeKeys.EVAL, config)
+
+    iterator, input_hooks = self._get_iterator_from_input_fn(
+        input_fn, model_fn_lib.ModeKeys.EVAL, self._eval_distribution)
+
+    is_tpu_strategy = (
+        self._eval_distribution.__class__.__name__ == 'TPUStrategy')
+
+    if is_tpu_strategy:
+      def step_fn(ctx, features, labels):
+        """Runs one step of the eval computation and captures outputs."""
+        estimator_spec = self._eval_distribution.call_for_each_tower(
+            self._call_model_fn, features, labels, model_fn_lib.ModeKeys.EVAL,
+            config)
+        eval_metric_ops = _verify_and_create_loss_metric(
+            estimator_spec.eval_metric_ops, estimator_spec.loss,
+            self._eval_distribution)
+        update_op, eval_dict = _extract_metric_update_ops(
+            eval_metric_ops, self._eval_distribution)
+        ctx.set_non_tensor_output(name='estimator_spec', output=estimator_spec)
+        ctx.set_non_tensor_output(name='eval_dict', output=eval_dict)
+        return update_op
+
+      # TODO(priyag): Fix eval step hook to account for steps_per_run.
+      ctx = self._eval_distribution.run_steps_on_dataset(
+          step_fn, iterator, iterations=self._eval_distribution.steps_per_run)
+      update_op = ctx.run_op
+      eval_dict = ctx.non_tensor_outputs['eval_dict']
+      grouped_estimator_spec = ctx.non_tensor_outputs['estimator_spec']
+    else:
+      features, labels = iterator.get_next()
+      grouped_estimator_spec = self._eval_distribution.call_for_each_tower(
+          self._call_model_fn, features, labels,
+          model_fn_lib.ModeKeys.EVAL, config)
+      eval_metric_ops = _verify_and_create_loss_metric(
+          grouped_estimator_spec.eval_metric_ops, grouped_estimator_spec.loss,
+          self._eval_distribution)
+      update_op, eval_dict = _extract_metric_update_ops(
+          eval_metric_ops, self._eval_distribution)
+
     scaffold = _combine_distributed_scaffold(
         grouped_estimator_spec.scaffold, self._eval_distribution)
     evaluation_hooks = self._eval_distribution.unwrap(
         grouped_estimator_spec.evaluation_hooks)[0]
-    loss_metric = self._eval_distribution.call_for_each_tower(
-        metrics_lib.mean, grouped_estimator_spec.loss)
-    return (loss_metric, scaffold,
-            evaluation_hooks, grouped_estimator_spec.eval_metric_ops)
+    evaluation_hooks = evaluation_hooks + (
+        estimator_util.StrategyInitFinalizeHook(
+            self._eval_distribution.initialize,
+            self._eval_distribution.finalize),)
+
+    return (scaffold, evaluation_hooks, input_hooks, update_op, eval_dict)
 
   def _evaluate_run(self, checkpoint_path, scaffold, update_op, eval_dict,
                     all_hooks, output_dir):
@@ -1524,6 +1547,23 @@ class Estimator(object):
       warm_starting_util.warm_start(*self._warm_start_settings)
 
 
+def _verify_and_create_loss_metric(eval_metric_ops, loss, distribution=None):
+  """Creates a metric for loss and throws an error if one already exists."""
+  if model_fn_lib.LOSS_METRIC_KEY in eval_metric_ops:
+    raise ValueError(
+        'Metric with name "%s" is not allowed, because Estimator ' %
+        (model_fn_lib.LOSS_METRIC_KEY) +
+        'already defines a default metric with the same name.')
+
+  if distribution is None:
+    loss_metric = metrics_lib.mean(loss)
+  else:
+    loss_metric = distribution.call_for_each_tower(
+        metrics_lib.mean, loss)
+  eval_metric_ops[model_fn_lib.LOSS_METRIC_KEY] = loss_metric
+  return eval_metric_ops
+
+
 def maybe_overwrite_model_dir_and_session_config(config, model_dir):
   """Overwrite estimator config by `model_dir` and `session_config` if needed.
 
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index e3f22d9010..d316742a83 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -58,6 +58,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.ops.random_ops import random_uniform
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
@@ -158,16 +159,7 @@ class EstimatorInheritanceConstraintTest(test.TestCase):
       def __init__(self):
         super(_Estimator, self).__init__(model_fn=dummy_model_fn)
 
-      def _call_input_fn(self, input_fn, mode):
-        return input_fn()
-
-      def _create_global_step(self, graph):
-        pass
-
-      def _convert_train_steps_to_hooks(self, steps, max_steps):
-        pass
-
-      def _convert_eval_steps_to_hooks(self, steps):
+      def _tf_api_names(self):
         pass
 
     _Estimator()
@@ -473,6 +465,29 @@ class EstimatorTrainTest(test.TestCase):
     est.train(InputFn(), steps=1)
     self.assertEqual(1, input_fn_call_count[0])
 
+  def test_nested_input_fn(self):
+    expected_params = {'batch_size': 10}
+
+    def _input_fn():
+      dataset_features = dataset_ops.Dataset.from_tensor_slices(
+          (random_uniform([4]),
+           random_uniform([4, 100], maxval=100, dtype=dtypes.int32)))
+      dataset_labels = dataset_ops.Dataset.from_tensor_slices(
+          random_uniform([4, 10]))
+      dataset = dataset_ops.Dataset.zip((dataset_features, dataset_labels))
+      dataset = dataset.repeat(-1)
+      iterator = dataset.make_initializable_iterator()
+      return iterator.get_next()
+
+    def _model_fn(features, labels, mode, params, config):
+      del params, config
+      return model_fn_global_step_incrementer(features, labels, mode)
+
+    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
+    est = estimator.Estimator(
+        model_fn=_model_fn, params=expected_params, config=expected_config)
+    est.train(_input_fn, steps=4)
+
   def test_input_fn_args(self):
     expected_mode = model_fn_lib.ModeKeys.TRAIN
     expected_params = {'batch_size': 10}
@@ -940,6 +955,19 @@ class EstimatorTrainTest(test.TestCase):
     est = estimator.Estimator(model_fn=_model_fn)
     est.train(dummy_input_fn, steps=1)
 
+  def test_config_should_not_be_evaluator_or_ps(self):
+
+    class FakeEvaluatorConfig(run_config.RunConfig):
+
+      @property
+      def task_type(self):
+        return run_config.TaskType.EVALUATOR
+
+    est = estimator.Estimator(
+        model_fn=dummy_model_fn, config=FakeEvaluatorConfig())
+    with self.assertRaisesRegexp(ValueError, 'train_and_evaluate'):
+      est.train(dummy_input_fn, steps=1)
+
 
 def _model_fn_with_eval_metric_ops(features, labels, mode, params):
   _, _ = features, labels
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 3d171f7811..55aace5fa9 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -217,6 +217,29 @@ class TensorServingInputReceiver(
         receiver_tensors_alternatives=receiver.receiver_tensors_alternatives)
 
 
+class UnsupervisedInputReceiver(ServingInputReceiver):
+  """A return type for a training_input_receiver_fn or eval_input_receiver_fn.
+
+  This differs from SupervisedInputReceiver in that it does not require a set
+  of labels.
+
+  The expected return values are:
+    features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
+      `SparseTensor`, specifying the features to be passed to the model.
+    receiver_tensors: A `Tensor`, `SparseTensor`, or dict of string to `Tensor`
+      or `SparseTensor`, specifying input nodes where this receiver expects to
+      be fed by default.  Typically, this is a single placeholder expecting
+      serialized `tf.Example` protos.
+  """
+
+  def __new__(cls, features, receiver_tensors):
+    return super(UnsupervisedInputReceiver, cls).__new__(
+        cls,
+        features=features,
+        receiver_tensors=receiver_tensors,
+        receiver_tensors_alternatives=None)
+
+
 class SupervisedInputReceiver(
     collections.namedtuple('SupervisedInputReceiver',
                            ['features', 'labels', 'receiver_tensors'])):
@@ -288,13 +311,33 @@ def build_parsing_serving_input_receiver_fn(feature_spec,
 
 
 def _placeholder_from_tensor(t, default_batch_size=None):
+  """Creates a placeholder that matches the dtype and shape of passed tensor.
+
+  Args:
+    t: Tensor or EagerTensor
+    default_batch_size: the number of query examples expected per batch.
+        Leave unset for variable batch size (recommended).
+
+  Returns:
+    Placeholder that matches the passed tensor.
+  """
   batch_shape = tensor_shape.TensorShape([default_batch_size])
   shape = batch_shape.concatenate(t.get_shape()[1:])
 
   # Reuse the feature tensor's op name (t.op.name) for the placeholder,
   # excluding the index from the tensor's name (t.name):
   # t.name = "%s:%d" % (t.op.name, t._value_index)
-  return array_ops.placeholder(dtype=t.dtype, shape=shape, name=t.op.name)
+  try:
+    name = t.op.name
+  except AttributeError:
+    # In Eager mode, tensors don't have ops or names, and while they do have
+    # IDs, those are not maintained across runs. The name here is used
+    # primarily for debugging, and is not critical to the placeholder.
+    # So, in order to make this Eager-compatible, continue with an empty
+    # name if none is available.
+    name = None
+
+  return array_ops.placeholder(dtype=t.dtype, shape=shape, name=name)
 
 
 def _placeholders_from_receiver_tensors_dict(input_vals,
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index 1d475adb43..3eed1ab163 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -163,6 +163,29 @@ class ServingInputReceiverTest(test_util.TensorFlowTestCase):
       _ = export.ServingInputReceiver(feature, receiver_tensor)
 
 
+class UnsupervisedInputReceiverTest(test_util.TensorFlowTestCase):
+
+  # Since this is basically a wrapper around ServingInputReceiver, we only
+  # have a simple sanity check to ensure that it works.
+
+  def test_unsupervised_input_receiver_constructor(self):
+    """Tests that no errors are raised when input is expected."""
+    features = {
+        "feature0":
+            constant_op.constant([0]),
+        u"feature1":
+            constant_op.constant([1]),
+        "feature2":
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+    }
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+    export.UnsupervisedInputReceiver(features, receiver_tensors)
+
+
 class SupervisedInputReceiverTest(test_util.TensorFlowTestCase):
 
   def test_input_receiver_constructor(self):
@@ -393,6 +416,7 @@ class ExportTest(test_util.TensorFlowTestCase):
         tensor_shape.unknown_shape(),
         v.receiver_tensors["feature_2"].shape)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_build_raw_serving_input_receiver_fn(self):
     features = {"feature_1": constant_op.constant(["hello"]),
                 "feature_2": constant_op.constant([42])}
@@ -411,6 +435,7 @@ class ExportTest(test_util.TensorFlowTestCase):
           dtypes.int32,
           serving_input_receiver.receiver_tensors["feature_2"].dtype)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_build_raw_supervised_input_receiver_fn(self):
     features = {"feature_1": constant_op.constant(["hello"]),
                 "feature_2": constant_op.constant([42])}
@@ -431,6 +456,7 @@ class ExportTest(test_util.TensorFlowTestCase):
       self.assertEqual(
           dtypes.int32, input_receiver.receiver_tensors["feature_2"].dtype)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_build_raw_supervised_input_receiver_fn_raw_tensors(self):
     features = {"feature_1": constant_op.constant(["hello"]),
                 "feature_2": constant_op.constant([42])}
@@ -454,6 +480,7 @@ class ExportTest(test_util.TensorFlowTestCase):
       self.assertEqual(set(["input", "label"]),
                        set(input_receiver.receiver_tensors.keys()))
 
+  @test_util.run_in_graph_and_eager_modes
   def test_build_raw_supervised_input_receiver_fn_batch_size(self):
     features = {"feature_1": constant_op.constant(["hello"]),
                 "feature_2": constant_op.constant([42])}
@@ -466,6 +493,7 @@ class ExportTest(test_util.TensorFlowTestCase):
       self.assertEqual([10], input_receiver.receiver_tensors["feature_1"].shape)
       self.assertEqual([10], input_receiver.features["feature_1"].shape)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_build_raw_supervised_input_receiver_fn_overlapping_keys(self):
     features = {"feature_1": constant_op.constant(["hello"]),
                 "feature_2": constant_op.constant([42])}
@@ -474,6 +502,7 @@ class ExportTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       export.build_raw_supervised_input_receiver_fn(features, labels)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_build_supervised_input_receiver_fn_from_input_fn(self):
     def dummy_input_fn():
       return ({"x": constant_op.constant([[1], [1]]),
@@ -491,6 +520,7 @@ class ExportTest(test_util.TensorFlowTestCase):
       self.assertEqual(set(["x", "y", "label"]),
                        set(input_receiver.receiver_tensors.keys()))
 
+  @test_util.run_in_graph_and_eager_modes
   def test_build_supervised_input_receiver_fn_from_input_fn_args(self):
     def dummy_input_fn(feature_key="x"):
       return ({feature_key: constant_op.constant([[1], [1]]),
diff --git a/tensorflow/python/estimator/exporter_test.py b/tensorflow/python/estimator/exporter_test.py
index c4b006955c..fcccfbde7a 100644
--- a/tensorflow/python/estimator/exporter_test.py
+++ b/tensorflow/python/estimator/exporter_test.py
@@ -323,6 +323,43 @@ class LatestExporterTest(test.TestCase):
     self.assertTrue(gfile.Exists(export_dir_3))
     self.assertTrue(gfile.Exists(export_dir_4))
 
+  def test_garbage_collect_exports_with_trailing_delimiter(self):
+    export_dir_base = tempfile.mkdtemp() + "export/"
+    gfile.MkDir(export_dir_base)
+    export_dir_1 = _create_test_export_dir(export_dir_base)
+    export_dir_2 = _create_test_export_dir(export_dir_base)
+    export_dir_3 = _create_test_export_dir(export_dir_base)
+    export_dir_4 = _create_test_export_dir(export_dir_base)
+
+    self.assertTrue(gfile.Exists(export_dir_1))
+    self.assertTrue(gfile.Exists(export_dir_2))
+    self.assertTrue(gfile.Exists(export_dir_3))
+    self.assertTrue(gfile.Exists(export_dir_4))
+
+    def _serving_input_receiver_fn():
+      return array_ops.constant([1]), None
+
+    exporter = exporter_lib.LatestExporter(
+        name="latest_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        exports_to_keep=1)
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    # Garbage collect all but the most recent 2 exports,
+    # where recency is determined based on the timestamp directory names.
+    with test.mock.patch.object(gfile, "ListDirectory") as mock_list_directory:
+      mock_list_directory.return_value = [
+          os.path.basename(export_dir_1) + b"/",
+          os.path.basename(export_dir_2) + b"/",
+          os.path.basename(export_dir_3) + b"/",
+          os.path.basename(export_dir_4) + b"/",
+          ]
+      exporter.export(estimator, export_dir_base, None, None, False)
+
+    self.assertFalse(gfile.Exists(export_dir_1))
+    self.assertFalse(gfile.Exists(export_dir_2))
+    self.assertFalse(gfile.Exists(export_dir_3))
+    self.assertTrue(gfile.Exists(export_dir_4))
+
 
 def _create_test_export_dir(export_dir_base):
   export_dir = _get_timestamped_export_dir(export_dir_base)
diff --git a/tensorflow/python/estimator/gc.py b/tensorflow/python/estimator/gc.py
index 9f8a463ec1..03ad33dd6b 100644
--- a/tensorflow/python/estimator/gc.py
+++ b/tensorflow/python/estimator/gc.py
@@ -201,9 +201,11 @@ def _get_paths(base_dir, parser):
   raw_paths = gfile.ListDirectory(base_dir)
   paths = []
   for r in raw_paths:
-    p = parser(Path(os.path.join(compat.as_str_any(base_dir),
-                                 compat.as_str_any(r)),
-                    None))
+    # ListDirectory() return paths with "/" at the last if base_dir was GCS URL
+    r = compat.as_str_any(r)
+    if r[-1] == '/':
+      r = r[0:len(r)-1]
+    p = parser(Path(os.path.join(compat.as_str_any(base_dir), r), None))
     if p:
       paths.append(p)
   return sorted(paths)
diff --git a/tensorflow/python/estimator/gc_test.py b/tensorflow/python/estimator/gc_test.py
index 2cbdd511d1..53c3d4ca2a 100644
--- a/tensorflow/python/estimator/gc_test.py
+++ b/tensorflow/python/estimator/gc_test.py
@@ -140,6 +140,17 @@ class GcTest(test_util.TensorFlowTestCase):
       gfile.MakeDirs(os.path.join(compat.as_str_any(base_dir), "42"))
       gc._get_paths(base_dir, _create_parser(base_dir))
 
+  def testGcsDirWithSeparator(self):
+    base_dir = "gs://bucket/foo"
+    with test.mock.patch.object(gfile, "ListDirectory") as mock_list_directory:
+      # gfile.ListDirectory returns directory names with separator '/'
+      mock_list_directory.return_value = ["0/", "1/"]
+      self.assertEqual(
+          gc._get_paths(base_dir, _create_parser(base_dir)),
+          [
+              gc.Path(os.path.join(base_dir, "0"), 0),
+              gc.Path(os.path.join(base_dir, "1"), 1)
+          ])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py
index a5f07fea3b..6361c6acc1 100644
--- a/tensorflow/python/estimator/keras.py
+++ b/tensorflow/python/estimator/keras.py
@@ -33,9 +33,6 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import models
 from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine.base_layer import Layer
-from tensorflow.python.keras.engine.network import Network
-from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
@@ -43,12 +40,10 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import data_structures
 
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
@@ -92,184 +87,78 @@ def _any_weight_initialized(keras_model):
   return False
 
 
-def _create_ordered_io(keras_model, estimator_io, is_input=True):
-  """Create a list of tensors from IO dictionary based on Keras IO order.
+def _convert_estimator_io_to_keras(keras_model, features, labels):
+  """Converts estimator features and labels to keras input and target tensors.
 
   Args:
-    keras_model: An instance of compiled keras model.
-    estimator_io: The features or labels (dict or plain array) from model_fn.
-    is_input: True if dictionary is for inputs.
+    keras_model: a compiled `tf.keras.Model` instance, used to determine the
+      order of the returned lists.
+    features: Dict of tensors or `None`.
+    labels: Dict of tensors, a single tensor, or `None`.
 
   Returns:
-    A list of tensors based on Keras IO order.
-
-  Raises:
-    ValueError: if dictionary keys cannot be found in Keras model input_names
-      or output_names.
-  """
-  if isinstance(estimator_io, (list, tuple)):
-    # Case currently not supported by most built-in input_fn,
-    # but it's good to have for sanity
-    return [_convert_tensor(x) for x in estimator_io]
-  elif isinstance(estimator_io, dict):
-    if is_input:
-      if keras_model._is_graph_network:
-        keras_io_names = keras_model.input_names
-      else:
-        keras_io_names = [
-            'input_%d' % i for i in range(1, len(estimator_io) + 1)]
-    else:
-      if keras_model._is_graph_network:
-        keras_io_names = keras_model.output_names
-      else:
-        keras_io_names = [
-            'output_%d' % i for i in range(1, len(estimator_io) + 1)]
-
-    for key in estimator_io:
-      if key not in keras_io_names:
-        raise ValueError(
-            'Cannot find %s with name "%s" in Keras Model. '
-            'It needs to match one '
-            'of the following: %s' % ('input' if is_input else 'output', key,
-                                      ', '.join(keras_io_names)))
-    tensors = [_convert_tensor(estimator_io[io_name])
-               for io_name in keras_io_names]
-    return tensors
-  else:
-    # Plain array.
-    return _convert_tensor(estimator_io)
-
-
-def _in_place_subclassed_model_reset(model):
-  """Substitute for model cloning that works for subclassed models.
-
-  Subclassed models cannot be cloned because their topology is not serializable.
-  To "instantiate" an identical model in a new TF graph, we reuse the original
-  model object, but we clear its state.
-
-  After calling this function on a model instance, you can use the model
-  instance as if it were a model clone (in particular you can use it in a new
-  graph).
-
-  This method clears the state of the input model. It is thus destructive.
-  However the original state can be restored fully by calling
-  `_in_place_subclassed_model_state_restoration`.
-
-  Args:
-    model: Instance of a Keras model created via subclassing.
-
-  Raises:
-    ValueError: In case the model uses a subclassed model as inner layer.
+    Tuple of (
+      list of input tensors or `None`,
+      list of target tensors or `None`)
+    The order of tensors is determined by the order set in the keras model.
   """
-  assert not model._is_graph_network  # Only makes sense for subclassed networks
-  # Retrieve all layers tracked by the model as well as their attribute names
-  attributes_cache = {}
-  for name in dir(model):
-    try:
-      value = getattr(model, name)
-    except (AttributeError, ValueError, TypeError):
-      continue
-    if isinstance(value, Layer):
-      attributes_cache[name] = value
-      assert value in model._layers
-    elif isinstance(value, (list, tuple)) and name not in ('layers', '_layers'):
-      # Handle case: list/tuple of layers (also tracked by the Network API).
-      if value and all(isinstance(val, Layer) for val in value):
-        raise ValueError('We do not support the use of list-of-layers '
-                         'attributes in subclassed models used with '
-                         '`model_to_estimator` at this time. Found list '
-                         'model: %s' % name)
-
-  # Replace layers on the model with fresh layers
-  layers_to_names = {value: key for key, value in attributes_cache.items()}
-  original_layers = model._layers[:]
-  model._layers = data_structures.NoDependency([])
-  for layer in original_layers:  # We preserve layer order.
-    config = layer.get_config()
-    # This will not work for nested subclassed models used as layers.
-    # This would be theoretically possible to support, but would add complexity.
-    # Only do it if users complain.
-    if isinstance(layer, Network) and not layer._is_graph_network:
-      raise ValueError('We do not support the use of nested subclassed models '
-                       'in `model_to_estimator` at this time. Found nested '
-                       'model: %s' % layer)
-    fresh_layer = layer.__class__.from_config(config)
-    name = layers_to_names[layer]
-    setattr(model, name, fresh_layer)
-
-  # Cache original model build attributes (in addition to layers)
-  if (not hasattr(model, '_original_attributes_cache') or
-      model._original_attributes_cache is None):
-    if model.built:
-      attributes_to_cache = [
-          'inputs',
-          'outputs',
-          '_feed_outputs',
-          '_feed_output_names',
-          '_feed_output_shapes',
-          '_feed_loss_fns',
-          'loss_weights_list',
-          'targets',
-          '_feed_targets',
-          'sample_weight_modes',
-          'weighted_metrics',
-          'metrics_names',
-          'metrics_tensors',
-          'metrics_updates',
-          'stateful_metric_names',
-          'total_loss',
-          'sample_weights',
-          '_feed_sample_weights',
-          'train_function',
-          'test_function',
-          'predict_function',
-          '_collected_trainable_weights',
-          '_feed_inputs',
-          '_feed_input_names',
-          '_feed_input_shapes',
-          'optimizer',
-      ]
-      for name in attributes_to_cache:
-        attributes_cache[name] = getattr(model, name)
-  model._original_attributes_cache = data_structures.NoDependency(
-      attributes_cache)
-  # Reset built state
-  model.built = False
-  model.inputs = None
-  model.outputs = None
-
-
-def _in_place_subclassed_model_state_restoration(model):
-  """Restores the original state of a model after it was "reset".
-
-  This undoes this action of `_in_place_subclassed_model_reset`.
 
-  Args:
-    model: Instance of a Keras model created via subclassing, on which
-      `_in_place_subclassed_model_reset` was previously called.
-  """
-  assert not model._is_graph_network
-  # Restore layers and build attributes
-  if (hasattr(model, '_original_attributes_cache') and
-      model._original_attributes_cache is not None):
-    # Models have sticky attribute assignment, so we want to be careful to add
-    # back the previous attributes and track Layers by their original names
-    # without adding dependencies on "utility" attributes which Models exempt
-    # when they're constructed.
-    model._layers = data_structures.NoDependency([])
-    for name, value in model._original_attributes_cache.items():
-      if not isinstance(value, checkpointable.CheckpointableBase):
-        # If this value is not already checkpointable, it's probably that way
-        # for a reason; we don't want to start tracking data structures that the
-        # original Model didn't.
-        value = data_structures.NoDependency(value)
-      setattr(model, name, value)
-    model._original_attributes_cache = None
-  else:
-    # Restore to the state of a never-called model.
-    model.built = False
-    model.inputs = None
-    model.outputs = None
+  def _to_ordered_tensor_list(obj, key_order, obj_name, order_name):
+    """Convert obj to an ordered list of tensors.
+
+    Args:
+      obj: List, dict, or single tensor. May be `None`.
+      key_order: List of strings with the order to return (used if obj is a
+        dict).
+      obj_name: String name of object (e.g. "features" or "labels")
+      order_name: String name of the key order (e.g. "inputs" or "outputs")
+
+    Returns:
+      List of tensors, or `None`
+
+    Raises:
+      KeyError: If obj has invalid keys.
+    """
+    if obj is None:
+      return None
+    elif isinstance(obj, (list, tuple)):
+      return [_convert_tensor(x) for x in obj]
+    elif isinstance(obj, dict):
+      # Ensure that the obj keys and keys in key_order are exactly the same.
+      different_keys = set(obj.keys()) ^ set(key_order)
+
+      if different_keys:
+        raise KeyError(
+            'The dictionary passed into {obj_name} does not have the expected '
+            '{order_name} keys defined in the keras model.'
+            '\n\tExpected keys: {order_keys}'
+            '\n\t{obj_name} keys: {obj_keys}'
+            '\n\tDifference: {different_keys}'.format(
+                order_name=order_name, order_keys=set(key_order),
+                obj_name=obj_name, obj_keys=set(obj.keys()),
+                different_keys=different_keys))
+
+      return [_convert_tensor(obj[key]) for key in key_order]
+    else:  # Assume obj is a tensor.
+      return [_convert_tensor(obj)]
+
+  input_names = None
+  output_names = None
+  if isinstance(features, dict):
+    input_names = (
+        keras_model.input_names if keras_model._is_graph_network else
+        ['input_%d' % i for i in range(1, len(features) + 1)])
+  if isinstance(labels, dict):
+    output_names = (
+        keras_model.output_names if keras_model._is_graph_network else
+        ['output_%d' % i for i in range(1, len(labels) + 1)])
+
+  input_tensors = _to_ordered_tensor_list(
+      features, input_names, 'features', 'inputs')
+  target_tensors = _to_ordered_tensor_list(
+      labels, output_names, 'labels', 'outputs')
+
+  return input_tensors, target_tensors
 
 
 def _clone_and_build_model(mode,
@@ -289,61 +178,14 @@ def _clone_and_build_model(mode,
   Returns:
     The newly built model.
   """
-  # Set to True during training, False for inference.
+  # Set to True during training, False for inference or testing.
   K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN)
-
-  # Get list of inputs.
-  if features is None:
-    input_tensors = None
-  else:
-    input_tensors = _create_ordered_io(keras_model,
-                                       estimator_io=features,
-                                       is_input=True)
-  # Get list of outputs.
-  if labels is None:
-    target_tensors = None
-  elif isinstance(labels, dict):
-    target_tensors = _create_ordered_io(keras_model,
-                                        estimator_io=labels,
-                                        is_input=False)
-  else:
-    target_tensors = [
-        _convert_tensor(labels)
-    ]
-
-  if keras_model._is_graph_network:
-    if custom_objects:
-      with CustomObjectScope(custom_objects):
-        model = models.clone_model(keras_model, input_tensors=input_tensors)
-    else:
-      model = models.clone_model(keras_model, input_tensors=input_tensors)
-  else:
-    model = keras_model
-    _in_place_subclassed_model_reset(model)
-    if input_tensors is not None:
-      model._set_inputs(input_tensors)
-
-  # Compile/Build model
-  if mode is model_fn_lib.ModeKeys.PREDICT:
-    if isinstance(model, models.Sequential):
-      model.build()
-  else:
-    if isinstance(keras_model.optimizer, optimizers.TFOptimizer):
-      optimizer = keras_model.optimizer
-    else:
-      optimizer_config = keras_model.optimizer.get_config()
-      optimizer = keras_model.optimizer.__class__.from_config(optimizer_config)
-    optimizer.iterations = training_util.get_or_create_global_step()
-
-    model.compile(
-        optimizer,
-        keras_model.loss,
-        metrics=keras_model.metrics,
-        loss_weights=keras_model.loss_weights,
-        sample_weight_mode=keras_model.sample_weight_mode,
-        weighted_metrics=keras_model.weighted_metrics,
-        target_tensors=target_tensors)
-  return model
+  input_tensors, target_tensors = _convert_estimator_io_to_keras(
+      keras_model, features, labels)
+  return models.clone_and_build_model(
+      keras_model, input_tensors, target_tensors, custom_objects,
+      compile_clone=(mode != model_fn_lib.ModeKeys.PREDICT),
+      in_place_reset=(not keras_model._is_graph_network))
 
 
 def _create_keras_model_fn(keras_model, custom_objects=None):
@@ -361,7 +203,7 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
     """model_fn for keras Estimator."""
     # Raise an error when users use DistributionStrategy with native Keras
     # optimizers. Currently we only support native TensorFlow optimizers.
-    if distribute_lib.has_distribution_strategy() and \
+    if distribution_strategy_context.has_distribution_strategy() and \
         not isinstance(keras_model.optimizer,
                        (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
       raise ValueError('Only TensorFlow native optimizers are supported with '
@@ -373,7 +215,7 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
     # We need to make sure that the output names of the last layer in the model
     # is the same for each of the cloned models. This is required for mirrored
     # strategy when we call regroup.
-    if distribute_lib.has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       for name in model.output_names:
         name = re.compile(r'_\d$').sub('', name)
         model_output_names.append(name)
@@ -423,7 +265,7 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
     if not model._is_graph_network:
       # Reset model state to original state,
       # to avoid `model_fn` being destructive for the initial model argument.
-      _in_place_subclassed_model_state_restoration(keras_model)
+      models.in_place_subclassed_model_state_restoration(keras_model)
     return model_fn_lib.EstimatorSpec(
         mode=mode,
         predictions=predictions,
@@ -487,8 +329,9 @@ def model_to_estimator(keras_model=None,
                        config=None):
   """Constructs an `Estimator` instance from given keras model.
 
-  For usage example, please see
-  @{$guide/estimators$creating_estimators_from_keras_models}.
+  For usage example, please see:
+  [Creating estimators from Keras
+  Models](https://tensorflow.org/guide/estimators#model_to_estimator).
 
   Args:
     keras_model: A compiled Keras model object. This argument is mutually
diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py
index 332e385726..290c4604ce 100644
--- a/tensorflow/python/estimator/keras_test.py
+++ b/tensorflow/python/estimator/keras_test.py
@@ -184,12 +184,14 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     gfile.MakeDirs(self._base_dir)
     self._config = run_config_lib.RunConfig(
         tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
+    super(TestKerasEstimator, self).setUp()
 
   def tearDown(self):
     # Make sure nothing is stuck in limbo.
     writer_cache.FileWriterCache.clear()
     if os.path.isdir(self._base_dir):
       gfile.DeleteRecursively(self._base_dir)
+    super(TestKerasEstimator, self).tearDown()
 
   def test_train(self):
     for model_type in ['sequential', 'functional']:
@@ -511,19 +513,19 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       input_dict = {'input_1': x_train}
       output_dict = {'invalid_output_name': y_train}
       return input_dict, output_dict
-
     model = simple_functional_model()
     model.compile(
         loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
     with self.test_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=model, config=self._config)
-
     with self.test_session():
-      with self.assertRaises(ValueError):
+      with self.assertRaisesRegexp(KeyError,
+                                   'Difference: .*invalid_input_name'):
         est_keras.train(input_fn=invald_input_name_input_fn, steps=100)
 
-      with self.assertRaises(ValueError):
+      with self.assertRaisesRegexp(KeyError,
+                                   'Difference: .*invalid_output_name'):
         est_keras.train(input_fn=invald_output_name_input_fn, steps=100)
 
   def test_custom_objects(self):
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index 9db9ccd01d..007970bef7 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -141,7 +141,7 @@ class EstimatorSpec(
         prediction.
       predictions: Predictions `Tensor` or dict of `Tensor`.
       loss: Training loss `Tensor`. Must be either scalar, or with shape `[1]`.
-      train_op: Op for the training step.
+      train_op: Op to run one training step.
       eval_metric_ops: Dict of metric results keyed by name. The values of the
         dict are the results of calling a metric function, namely a
         `(metric_tensor, update_op)` tuple. `metric_tensor` should be evaluated
diff --git a/tensorflow/python/estimator/model_fn_test.py b/tensorflow/python/estimator/model_fn_test.py
index 08e41fd414..b6f1b16a22 100644
--- a/tensorflow/python/estimator/model_fn_test.py
+++ b/tensorflow/python/estimator/model_fn_test.py
@@ -48,7 +48,7 @@ class EstimatorSpecTrainTest(test.TestCase):
 
   def testRequiredArgumentsSet(self):
     """Tests that no errors are raised when all required arguments are set."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.TRAIN,
           loss=constant_op.constant(1.),
@@ -56,7 +56,7 @@ class EstimatorSpecTrainTest(test.TestCase):
 
   def testAllArgumentsSet(self):
     """Tests that no errors are raised when all arguments are set."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       predictions = {'loss': loss}
       classes = constant_op.constant('hello')
@@ -77,7 +77,7 @@ class EstimatorSpecTrainTest(test.TestCase):
 
   def testLossNumber(self):
     """Tests that error is raised when loss is a number (not Tensor)."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(TypeError, 'loss must be Tensor'):
         model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.TRAIN,
@@ -86,20 +86,20 @@ class EstimatorSpecTrainTest(test.TestCase):
 
   def testLoss1DTensor(self):
     """Tests that no errors are raised when loss is 1D tensor."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.TRAIN,
           loss=constant_op.constant([1.]),
           train_op=control_flow_ops.no_op())
 
   def testLossMissing(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(ValueError, 'Missing loss'):
         model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.TRAIN, train_op=control_flow_ops.no_op())
 
   def testLossNotScalar(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(ValueError, 'Loss must be scalar'):
         model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.TRAIN,
@@ -107,7 +107,7 @@ class EstimatorSpecTrainTest(test.TestCase):
             train_op=control_flow_ops.no_op())
 
   def testLossSparseTensor(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = sparse_tensor.SparseTensor(
           indices=[[0]],
           values=[0.],
@@ -121,7 +121,7 @@ class EstimatorSpecTrainTest(test.TestCase):
   def testLossFromDifferentGraph(self):
     with ops.Graph().as_default():
       loss = constant_op.constant(1.)
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           ValueError, 'must be from the default graph'):
         model_fn.EstimatorSpec(
@@ -130,13 +130,13 @@ class EstimatorSpecTrainTest(test.TestCase):
             train_op=control_flow_ops.no_op())
 
   def testTrainOpMissing(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(ValueError, 'Missing train_op'):
         model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.TRAIN, loss=constant_op.constant(1.))
 
   def testTrainOpNotOperationAndTensor(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(TypeError,
                                    'train_op must be Operation or Tensor'):
         model_fn.EstimatorSpec(
@@ -147,7 +147,7 @@ class EstimatorSpecTrainTest(test.TestCase):
   def testTrainOpFromDifferentGraph(self):
     with ops.Graph().as_default():
       train_op = control_flow_ops.no_op()
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           ValueError, 'must be from the default graph'):
         model_fn.EstimatorSpec(
@@ -156,7 +156,7 @@ class EstimatorSpecTrainTest(test.TestCase):
             train_op=train_op)
 
   def testTrainingChiefHookInvalid(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           TypeError, 'All hooks must be SessionRunHook instances'):
         model_fn.EstimatorSpec(
@@ -166,7 +166,7 @@ class EstimatorSpecTrainTest(test.TestCase):
             training_chief_hooks=[_InvalidHook()])
 
   def testTrainingHookInvalid(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           TypeError, 'All hooks must be SessionRunHook instances'):
         model_fn.EstimatorSpec(
@@ -176,7 +176,7 @@ class EstimatorSpecTrainTest(test.TestCase):
             training_hooks=[_InvalidHook()])
 
   def testScaffoldInvalid(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           TypeError, r'scaffold must be tf\.train\.Scaffold'):
         model_fn.EstimatorSpec(
@@ -186,7 +186,7 @@ class EstimatorSpecTrainTest(test.TestCase):
             scaffold=_InvalidScaffold())
 
   def testReturnDefaultScaffold(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       estimator_spec = model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.TRAIN,
           loss=constant_op.constant(1.),
@@ -199,7 +199,7 @@ class EstimatorSpecEvalTest(test.TestCase):
 
   def testRequiredArgumentsSet(self):
     """Tests that no errors are raised when all required arguments are set."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.EVAL,
@@ -208,7 +208,7 @@ class EstimatorSpecEvalTest(test.TestCase):
 
   def testAllArgumentsSet(self):
     """Tests that no errors are raised when all arguments are set."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       predictions = {'loss': loss}
       classes = constant_op.constant('hello')
@@ -227,7 +227,7 @@ class EstimatorSpecEvalTest(test.TestCase):
           evaluation_hooks=[_FakeHook()])
 
   def testEvaluationHookInvalid(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           TypeError, 'All hooks must be SessionRunHook instances'):
         model_fn.EstimatorSpec(
@@ -237,7 +237,7 @@ class EstimatorSpecEvalTest(test.TestCase):
 
   def testTupleMetric(self):
     """Tests that no errors are raised when a metric is tuple-valued."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.EVAL,
@@ -248,7 +248,7 @@ class EstimatorSpecEvalTest(test.TestCase):
 
   def testLoss1DTensor(self):
     """Tests that no errors are raised when loss is 1D tensor."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant([1.])
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.EVAL,
@@ -257,7 +257,7 @@ class EstimatorSpecEvalTest(test.TestCase):
 
   def testLossNumber(self):
     """Tests that error is raised when loss is a number (not Tensor)."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(TypeError, 'loss must be Tensor'):
         model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
@@ -265,14 +265,14 @@ class EstimatorSpecEvalTest(test.TestCase):
             loss=1.)
 
   def testLossMissing(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(ValueError, 'Missing loss'):
         model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions={'loss': constant_op.constant(1.)})
 
   def testLossNotScalar(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant([1., 2.])
       with self.assertRaisesRegexp(ValueError, 'Loss must be scalar'):
         model_fn.EstimatorSpec(
@@ -281,7 +281,7 @@ class EstimatorSpecEvalTest(test.TestCase):
             loss=loss)
 
   def testLossSparseTensor(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = sparse_tensor.SparseTensor(
           indices=[[0]],
           values=[0.],
@@ -296,7 +296,7 @@ class EstimatorSpecEvalTest(test.TestCase):
   def testLossFromDifferentGraph(self):
     with ops.Graph().as_default():
       loss = constant_op.constant(1.)
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           ValueError, 'must be from the default graph'):
         model_fn.EstimatorSpec(
@@ -305,7 +305,7 @@ class EstimatorSpecEvalTest(test.TestCase):
             loss=loss)
 
   def testReplaceRaisesConstructorChecks(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       spec = model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.EVAL, predictions={'loss': loss}, loss=loss)
@@ -313,7 +313,7 @@ class EstimatorSpecEvalTest(test.TestCase):
         spec._replace(loss=constant_op.constant([1., 2.]))
 
   def testReplaceDoesReplace(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       spec = model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.EVAL, predictions={'loss': loss}, loss=loss)
@@ -321,7 +321,7 @@ class EstimatorSpecEvalTest(test.TestCase):
       self.assertEqual(['m'], list(new_spec.predictions.keys()))
 
   def testReplaceNotAllowModeChange(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       spec = model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.EVAL, predictions={'loss': loss}, loss=loss)
@@ -331,13 +331,13 @@ class EstimatorSpecEvalTest(test.TestCase):
         spec._replace(mode=model_fn.ModeKeys.TRAIN)
 
   def testPredictionsMissingIsOkay(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.EVAL, loss=constant_op.constant(1.))
 
   def testPredictionsTensor(self):
     """Tests that no error is raised when predictions is Tensor (not dict)."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.EVAL,
@@ -345,7 +345,7 @@ class EstimatorSpecEvalTest(test.TestCase):
           loss=loss)
 
   def testPredictionsNumber(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           TypeError, r'predictions\[number\] must be Tensor'):
         model_fn.EstimatorSpec(
@@ -354,7 +354,7 @@ class EstimatorSpecEvalTest(test.TestCase):
             loss=constant_op.constant(1.))
 
   def testPredictionsSparseTensor(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = {
           'sparse': sparse_tensor.SparseTensor(
               indices=[[0]],
@@ -370,7 +370,7 @@ class EstimatorSpecEvalTest(test.TestCase):
   def testPredictionsFromDifferentGraph(self):
     with ops.Graph().as_default():
       predictions = {'loss': constant_op.constant(1.)}
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           ValueError, 'must be from the default graph'):
         model_fn.EstimatorSpec(
@@ -379,7 +379,7 @@ class EstimatorSpecEvalTest(test.TestCase):
             loss=constant_op.constant(1.))
 
   def testEvalMetricOpsNoDict(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       with self.assertRaisesRegexp(
           TypeError, 'eval_metric_ops must be a dict'):
@@ -390,7 +390,7 @@ class EstimatorSpecEvalTest(test.TestCase):
             eval_metric_ops=loss)
 
   def testEvalMetricOpsNoTuple(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       with self.assertRaisesRegexp(
           TypeError,
@@ -403,7 +403,7 @@ class EstimatorSpecEvalTest(test.TestCase):
             eval_metric_ops={'loss': loss})
 
   def testEvalMetricOpsNoTensorOrOperation(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       with self.assertRaisesRegexp(TypeError, 'must be Operation or Tensor'):
         model_fn.EstimatorSpec(
@@ -413,7 +413,7 @@ class EstimatorSpecEvalTest(test.TestCase):
             eval_metric_ops={'loss': ('NonTensor', loss)})
 
   def testEvalMetricNestedNoTensorOrOperation(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       with self.assertRaisesRegexp(TypeError, 'must be Operation or Tensor'):
         model_fn.EstimatorSpec(
@@ -427,7 +427,7 @@ class EstimatorSpecEvalTest(test.TestCase):
     with ops.Graph().as_default():
       eval_metric_ops = {
           'loss': (control_flow_ops.no_op(), constant_op.constant(1.))}
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       with self.assertRaisesRegexp(
           ValueError, 'must be from the default graph'):
@@ -443,14 +443,14 @@ class EstimatorSpecInferTest(test.TestCase):
 
   def testRequiredArgumentsSet(self):
     """Tests that no errors are raised when all required arguments are set."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.PREDICT,
           predictions={'loss': constant_op.constant(1.)})
 
   def testAllArgumentsSet(self):
     """Tests that no errors are raised when all arguments are set."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       loss = constant_op.constant(1.)
       predictions = {'loss': loss}
       classes = constant_op.constant('hello')
@@ -470,7 +470,7 @@ class EstimatorSpecInferTest(test.TestCase):
           prediction_hooks=[_FakeHook()])
 
   def testPredictionHookInvalid(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           TypeError, 'All hooks must be SessionRunHook instances'):
         model_fn.EstimatorSpec(
@@ -479,25 +479,25 @@ class EstimatorSpecInferTest(test.TestCase):
             prediction_hooks=[_InvalidHook()])
 
   def testPredictionsMissing(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(ValueError, 'Missing predictions'):
         model_fn.EstimatorSpec(mode=model_fn.ModeKeys.PREDICT)
 
   def testPredictionsTensor(self):
     """Tests that no error is raised when predictions is Tensor (not dict)."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       model_fn.EstimatorSpec(
           mode=model_fn.ModeKeys.PREDICT, predictions=constant_op.constant(1.))
 
   def testPredictionsNumber(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       with self.assertRaisesRegexp(
           TypeError, r'predictions\[number\] must be Tensor'):
         model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.PREDICT, predictions={'number': 1.})
 
   def testPredictionsSparseTensor(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = {
           'sparse': sparse_tensor.SparseTensor(
               indices=[[0]],
@@ -509,7 +509,7 @@ class EstimatorSpecInferTest(test.TestCase):
             mode=model_fn.ModeKeys.PREDICT, predictions=predictions)
 
   def testExportOutputsNoDict(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = {'loss': constant_op.constant(1.)}
       classes = constant_op.constant('hello')
       with self.assertRaisesRegexp(
@@ -520,7 +520,7 @@ class EstimatorSpecInferTest(test.TestCase):
             export_outputs=export_output.ClassificationOutput(classes=classes))
 
   def testExportOutputsValueNotExportOutput(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = {'loss': constant_op.constant(1.)}
       with self.assertRaisesRegexp(
           TypeError,
@@ -533,7 +533,7 @@ class EstimatorSpecInferTest(test.TestCase):
             export_outputs={'head_name': predictions})
 
   def testExportOutputsSingleheadMissingDefault(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = {'loss': constant_op.constant(1.)}
       output_1 = constant_op.constant([1.])
       regression_output = export_output.RegressionOutput(value=output_1)
@@ -552,7 +552,7 @@ class EstimatorSpecInferTest(test.TestCase):
       self.assertEqual(expected_export_outputs, estimator_spec.export_outputs)
 
   def testExportOutputsMultiheadWithDefault(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = {'loss': constant_op.constant(1.)}
       output_1 = constant_op.constant([1.])
       output_2 = constant_op.constant(['2'])
@@ -571,7 +571,7 @@ class EstimatorSpecInferTest(test.TestCase):
       self.assertEqual(export_outputs, estimator_spec.export_outputs)
 
   def testExportOutputsMultiheadMissingDefault(self):
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = {'loss': constant_op.constant(1.)}
       output_1 = constant_op.constant([1.])
       output_2 = constant_op.constant(['2'])
@@ -594,13 +594,13 @@ class EstimatorSpecInferTest(test.TestCase):
 
   def testDefaultExportOutputCreated(self):
     """Ensure that a default PredictOutput is created for export."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = constant_op.constant(1.)
       self._assertDefaultExportOutputForPredictions(predictions)
 
   def testDefaultExportOutputCreatedDict(self):
     """Ensure that a default PredictOutput is created for export for dicts."""
-    with ops.Graph().as_default(), self.test_session():
+    with ops.Graph().as_default(), self.cached_session():
       predictions = {'loss': constant_op.constant(1.),
                      'score': constant_op.constant(10.)}
       self._assertDefaultExportOutputForPredictions(predictions)
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 220c3e58ca..b1ca207b62 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -26,6 +26,7 @@ import six
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.distribute import estimator_training as distribute_coordinator_training
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat_internal
@@ -51,6 +52,7 @@ _DEFAULT_REPLACEABLE_LIST = [
     'device_fn',
     'protocol',
     'eval_distribute',
+    'experimental_distribute',
 ]
 
 _SAVE_CKPT_ERR = (
@@ -331,7 +333,8 @@ class RunConfig(object):
                train_distribute=None,
                device_fn=None,
                protocol=None,
-               eval_distribute=None):
+               eval_distribute=None,
+               experimental_distribute=None):
     """Constructs a RunConfig.
 
     All distributed training related properties `cluster_spec`, `is_chief`,
@@ -458,7 +461,8 @@ class RunConfig(object):
       train_distribute: An optional instance of
         `tf.contrib.distribute.DistributionStrategy`. If specified,
         then Estimator will distribute the user's model during training,
-        according to the policy specified by that strategy.
+        according to the policy specified by that strategy. Setting
+        `experimental_distribute.train_distribute` is preferred.
       device_fn: A callable invoked for every `Operation` that takes the
         `Operation` and returns the device string. If `None`, defaults to
         the device function returned by `tf.train.replica_device_setter`
@@ -468,7 +472,13 @@ class RunConfig(object):
       eval_distribute: An optional instance of
         `tf.contrib.distribute.DistributionStrategy`. If specified,
         then Estimator will distribute the user's model during evaluation,
-        according to the policy specified by that strategy.
+        according to the policy specified by that strategy. Setting
+        `experimental_distribute.eval_distribute` is preferred.
+      experimental_distribute: an optional
+        `tf.contrib.distribute.DistributeConfig` object specifying
+        DistributionStrategy-related configuration. The `train_distribute` and
+        `eval_distribute` can be passed as parameters to `RunConfig` or set in
+        `experimental_distribute` but not both.
 
     Raises:
       ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
@@ -508,11 +518,15 @@ class RunConfig(object):
         train_distribute=train_distribute,
         device_fn=device_fn,
         protocol=protocol,
-        eval_distribute=eval_distribute)
+        eval_distribute=eval_distribute,
+        experimental_distribute=experimental_distribute)
 
-    self._init_distributed_setting_from_environment_var(tf_config)
-
-    self._maybe_overwrite_session_config_for_distributed_training()
+    if train_distribute or eval_distribute or experimental_distribute:
+      logging.info('Initializing RunConfig with distribution strategies.')
+      distribute_coordinator_training.init_run_config(self, tf_config)
+    else:
+      self._init_distributed_setting_from_environment_var(tf_config)
+      self._maybe_overwrite_session_config_for_distributed_training()
 
   def _maybe_overwrite_session_config_for_distributed_training(self):
     """Overwrites the session_config for distributed training.
@@ -810,6 +824,7 @@ class RunConfig(object):
       - `device_fn`,
       - `protocol`.
       - `eval_distribute`,
+      - `experimental_distribute`,
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index bb1305767f..240be5dabe 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -26,6 +26,7 @@ import time
 import six
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import estimator_training as distribute_coordinator_training
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import exporter as exporter_lib
 from tensorflow.python.estimator import run_config as run_config_lib
@@ -129,8 +130,8 @@ class TrainSpec(
 
     Args:
       input_fn: A function that provides input data for training as minibatches.
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
+        See [Premade Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
         the following:
           * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
             tuple (features, labels) with same constraints as below.
@@ -193,8 +194,8 @@ class EvalSpec(
 
     Args:
       input_fn: A function that constructs the input data for evaluation.
-        See @{$premade_estimators#create_input_functions} for more
-        information. The function should construct and return one of
+        See [Premade Estimators](https://tensorflow.org/api_guides/premade_estimators#create_input_functions)
+        for more information. The function should construct and return one of
         the following:
           * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
             tuple (features, labels) with same constraints as below.
@@ -274,8 +275,10 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   evaluation `input_fn`, steps, etc.
 
   This utility function provides consistent behavior for both local
-  (non-distributed) and distributed configurations. Currently, the only
-  supported distributed training configuration is between-graph replication.
+  (non-distributed) and distributed configurations. The default distribution
+  configuration is parameter server-based between-graph replication. For other
+  types of distribution configurations such as all-reduce training, please use
+  [DistributionStrategies](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute).  # pylint: disable=line-too-long
 
   Overfitting: In order to avoid overfitting, it is recommended to set up the
   training `input_fn` to shuffle the training data properly.
@@ -426,6 +429,11 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   }'
   ```
 
+  When `distribute` or `experimental_distribute.train_distribute` and
+  `experimental_distribute.remote_cluster` is set, this method will start a
+  client running on the current host which connects to the `remote_cluster` for
+  training and evaluation.
+
   Args:
     estimator: An `Estimator` instance to train and evaluate.
     train_spec: A `TrainSpec` instance to specify the training specification.
@@ -444,8 +452,16 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
 
   executor = _TrainingExecutor(
       estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
-
   config = estimator.config
+
+  # If `distribute_coordinator_mode` is set and running in distributed
+  # environment, we run `train_and_evaluate` via distribute coordinator.
+  if distribute_coordinator_training.should_run_distribute_coordinator(config):
+    logging.info('Running `train_and_evaluate` with Distribute Coordinator.')
+    distribute_coordinator_training.train_and_evaluate(
+        estimator, train_spec, eval_spec, _TrainingExecutor)
+    return
+
   if (config.task_type == run_config_lib.TaskType.EVALUATOR and
       config.task_id > 0):
     raise ValueError(
@@ -837,6 +853,13 @@ class _TrainingExecutor(object):
     if difference > 0:
       logging.info('Waiting %f secs before starting next eval run.', difference)
       time.sleep(difference)
+    elif (throttle_secs == 0 and
+          eval_result.status != _EvalStatus.EVALUATED):
+      # Prints a user-actionable warning to avoid unnecessary load on evaluator.
+      logging.warning(
+          'EvalSpec.throttle_secs is set as 0. This might overload the job '
+          'before finding (next) new checkpoint. Please consider to increase '
+          'it.')
 
     return (eval_result, should_early_stop)
 
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
index dc106c7d3b..7d46917a6f 100644
--- a/tensorflow/python/estimator/training_test.py
+++ b/tensorflow/python/estimator/training_test.py
@@ -83,6 +83,9 @@ _INVALID_EVAL_LISTENER_MSG = 'must have type `_ContinuousEvalListener`'
 _INVALID_CONFIG_FOR_STD_SERVER_MSG = 'Could not start server; .*TF_CONFIG'
 _INVALID_LOCAL_TASK_WITH_CLUSTER = '`task.type` in TF_CONFIG cannot be `local`'
 _INVALID_TASK_TYPE = '`estimator.config` must have task_type set.'
+_INPROPER_THROTTL_SECS = (
+    'EvalSpec.throttle_secs is set as 0.*Please consider to increase')
+
 # The message should NOT have 'local' word as part of it. As (?!word) is looking
 # ahead, so, the $ (ending) check is required; otherwise, it will match
 # partially and return successuful.
@@ -1281,7 +1284,7 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     ]
 
     eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
+        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=2)
 
     executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
     with test.mock.patch.object(logging, 'warning') as mock_log:
@@ -1295,6 +1298,34 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     # successuful evaluation)
     self.assertEqual(2, mock_log.call_count)
 
+  def test_warning_if_throttle_secs_is_zero(self):
+    training_max_step = 200
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.evaluate.side_effect = [
+        {_GLOBAL_STEP_KEY: training_max_step}
+    ]
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec.max_steps = training_max_step
+
+    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
+
+    # We need to make the first one invalid, so it will check the
+    # throttle_secs=0.
+    mock_est.latest_checkpoint.side_effect = [None, 'path']
+
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
+
+    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      executor.run_evaluator()
+
+    # First ckpt is invalid.
+    self.assertEqual(2, mock_est.latest_checkpoint.call_count)
+    self.assertEqual(1, mock_est.evaluate.call_count)
+
+    self.assertRegexpMatches(str(mock_log.call_args), _INPROPER_THROTTL_SECS)
+
   def test_continuous_eval_listener_eval_result(self):
     training_max_step = 200
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
diff --git a/tensorflow/python/estimator/util_test.py b/tensorflow/python/estimator/util_test.py
index d7e0610779..d440c454dc 100644
--- a/tensorflow/python/estimator/util_test.py
+++ b/tensorflow/python/estimator/util_test.py
@@ -39,7 +39,7 @@ class UtilTest(test.TestCase):
 
     features, labels, hooks = util.parse_input_fn_result(_input_fn())
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       vals = sess.run([features, labels])
 
     self.assertAllEqual(vals[0], np.arange(100))
@@ -67,7 +67,7 @@ class UtilTest(test.TestCase):
 
     features, labels, hooks = util.parse_input_fn_result(_input_fn())
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       vals = sess.run([features])
 
     self.assertAllEqual(vals[0], np.arange(100))
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 6be930be87..9b482237ab 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -262,7 +262,7 @@ class NumericColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([price]))
     self.assertIn('price', features)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
   def test_parse_example_with_default_value(self):
@@ -284,7 +284,7 @@ class NumericColumnTest(test.TestCase):
                     no_data.SerializeToString()],
         features=fc.make_parse_example_spec([price]))
     self.assertIn('price', features)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[20., 110.], [11., 11.]], features['price'].eval())
 
   def test_normalizer_fn_must_be_callable(self):
@@ -298,7 +298,7 @@ class NumericColumnTest(test.TestCase):
 
     price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price])
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
 
   def test_get_dense_tensor(self):
@@ -433,7 +433,7 @@ class BucketizedColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([bucketized_price]))
     self.assertIn('price', features)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
   def test_transform_feature(self):
@@ -700,7 +700,7 @@ class HashedCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -719,7 +719,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     output = outputs[hashed_sparse]
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [6, 4, 1]
-    with self.test_session():
+    with self.cached_session():
       self.assertEqual(dtypes.int64, output.values.dtype)
       self.assertAllEqual(expected_values, output.values.eval())
       self.assertAllEqual(wire_tensor.indices.eval(), output.indices.eval())
@@ -775,7 +775,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     output = builder.get(hashed_sparse)
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [3, 7, 5]
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(expected_values, output.values.eval())
 
   def test_int32_64_is_compatible(self):
@@ -789,7 +789,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     output = builder.get(hashed_sparse)
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [3, 7, 5]
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(expected_values, output.values.eval())
 
   def test_get_sparse_tensors(self):
@@ -984,7 +984,7 @@ class CrossedColumnTest(test.TestCase):
         features=fc.make_parse_example_spec([price_cross_wire]))
     self.assertIn('price', features)
     self.assertIn('wire', features)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
       wire_sparse = features['wire']
       self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval())
@@ -1007,7 +1007,7 @@ class CrossedColumnTest(test.TestCase):
     }
     outputs = _transform_features(features, [price_cross_wire])
     output = outputs[price_cross_wire]
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       output_val = sess.run(output)
       self.assertAllEqual(
           [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
@@ -2747,6 +2747,62 @@ class FunctionalInputLayerTest(test.TestCase):
                             variables_lib.Variable)
       self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
 
+  def test_fills_cols_to_vars_shared_embedding(self):
+    # Provide 5 DenseColumn's to input_layer: a NumericColumn, a
+    # BucketizedColumn, an EmbeddingColumn, two SharedEmbeddingColumns. The
+    # EmbeddingColumn creates a Variable and the two SharedEmbeddingColumns
+    # shared one variable.
+    price1 = fc.numeric_column('price1')
+    dense_feature = fc.numeric_column('dense_feature')
+    dense_feature_bucketized = fc.bucketized_column(
+        dense_feature, boundaries=[0.])
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[3.], [4.]],
+          'dense_feature': [[-1.], [4.]],
+          'sparse_feature': [['a'], ['x']],
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      cols_to_vars = {}
+      all_cols = [
+          price1, dense_feature_bucketized, some_embedding_column,
+          shared_embedding_a, shared_embedding_b
+      ]
+      fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars)
+      self.assertItemsEqual(list(cols_to_vars.keys()), all_cols)
+      self.assertEqual(0, len(cols_to_vars[price1]))
+      self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
+      self.assertEqual(1, len(cols_to_vars[some_embedding_column]))
+      self.assertEqual(1, len(cols_to_vars[shared_embedding_a]))
+      # This is a bug in the current implementation and should be fixed in the
+      # new one.
+      self.assertEqual(0, len(cols_to_vars[shared_embedding_b]))
+      self.assertIsInstance(cols_to_vars[some_embedding_column][0],
+                            variables_lib.Variable)
+      self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
+      self.assertIsInstance(cols_to_vars[shared_embedding_a][0],
+                            variables_lib.Variable)
+      self.assertAllEqual(cols_to_vars[shared_embedding_a][0].shape, [3, 2])
+
   def test_fills_cols_to_vars_partitioned_variables(self):
     price1 = fc.numeric_column('price1')
     dense_feature = fc.numeric_column('dense_feature')
@@ -2772,6 +2828,10 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertEqual(0, len(cols_to_vars[price1]))
       self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
       self.assertEqual(3, len(cols_to_vars[some_embedding_column]))
+      self.assertEqual(
+          'input_from_feature_columns/input_layer/sparse_feature_embedding/'
+          'embedding_weights/part_0:0',
+          cols_to_vars[some_embedding_column][0].name)
       self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [2, 10])
       self.assertAllEqual(cols_to_vars[some_embedding_column][1].shape, [2, 10])
       self.assertAllEqual(cols_to_vars[some_embedding_column][2].shape, [1, 10])
@@ -3262,7 +3322,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
     with self.assertRaisesRegexp(errors.OpError, 'file_does_not_exist'):
-      with self.test_session():
+      with self.cached_session():
         lookup_ops.tables_initializer().run()
 
   def test_invalid_vocabulary_size(self):
@@ -3286,7 +3346,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
     with self.assertRaisesRegexp(errors.OpError, 'Invalid vocab_size'):
-      with self.test_session():
+      with self.cached_session():
         lookup_ops.tables_initializer().run()
 
   def test_invalid_num_oov_buckets(self):
@@ -3350,7 +3410,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -3775,7 +3835,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -3797,7 +3857,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -4096,7 +4156,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -4365,7 +4425,7 @@ class IndicatorColumnTest(test.TestCase):
         fc.categorical_column_with_hash_bucket('animal', 4))
     builder = _LazyBuilder({'animal': ['fox', 'fox']})
     output = builder.get(animal)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
 
   def test_2D_shape_succeeds(self):
@@ -4380,7 +4440,7 @@ class IndicatorColumnTest(test.TestCase):
                 dense_shape=[2, 1])
     })
     output = builder.get(animal)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
 
   def test_multi_hot(self):
@@ -4393,7 +4453,7 @@ class IndicatorColumnTest(test.TestCase):
                 indices=[[0, 0], [0, 1]], values=[1, 1], dense_shape=[1, 2])
     })
     output = builder.get(animal)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
 
   def test_multi_hot2(self):
@@ -4405,7 +4465,7 @@ class IndicatorColumnTest(test.TestCase):
                 indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
     })
     output = builder.get(animal)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
 
   def test_deep_copy(self):
@@ -4430,7 +4490,7 @@ class IndicatorColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a_indicator]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -4641,7 +4701,7 @@ class EmbeddingColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a_embedded]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -5407,7 +5467,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         features=fc.make_parse_example_spec([a_embedded, b_embedded]))
     self.assertIn('aaa', features)
     self.assertIn('bbb', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -5544,20 +5604,6 @@ class SharedEmbeddingColumnTest(test.TestCase):
       self.assertIsNone(partition_info)
       return embedding_values
 
-    # Expected lookup result, using combiner='mean'.
-    expected_lookups_a = (
-        # example 0:
-        (7., 11.),  # ids [2], embedding = [7, 11]
-        # example 1:
-        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
-    )
-    expected_lookups_b = (
-        # example 0:
-        (1., 2.),  # ids [0], embedding = [1, 2]
-        # example 1:
-        (0., 0.),  # ids [], embedding = [0, 0]
-    )
-
     # Build columns.
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
@@ -5990,7 +6036,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         features=fc.make_parse_example_spec([a_weighted]))
     self.assertIn('aaa', features)
     self.assertIn('weights', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index b6bf516286..aa66ed77e9 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -142,6 +142,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -155,7 +156,6 @@ from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -164,67 +164,148 @@ from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
 
 
-def _internal_input_layer(features,
-                          feature_columns,
-                          weight_collections=None,
-                          trainable=True,
-                          cols_to_vars=None,
-                          scope=None):
-  """See input_layer. `scope` is a name or variable scope to use."""
+class StateManager(object):
+  """Manages the state associated with FeatureColumns.
 
-  feature_columns = fc_old._normalize_feature_columns(feature_columns)  # pylint: disable=protected-access
-  for column in feature_columns:
-    if not isinstance(column, fc_old._DenseColumn):  # pylint: disable=protected-access
-      raise ValueError(
-          'Items of feature_columns must be a _DenseColumn. '
-          'You can wrap a categorical column with an '
-          'embedding_column or indicator_column. Given: {}'.format(column))
-  weight_collections = list(weight_collections or [])
-  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
-  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
-
-  # a non-None `scope` can allow for variable reuse, when, e.g., this function
-  # is wrapped by a `make_template`.
-  with variable_scope.variable_scope(
-      scope, default_name='input_layer', values=features.values()):
-    builder = fc_old._LazyBuilder(features)  # pylint: disable=protected-access
-    output_tensors = []
-    ordered_columns = []
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
-        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-            builder,
-            weight_collections=weight_collections,
-            trainable=trainable)
-        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
-        batch_size = array_ops.shape(tensor)[0]
-        output_tensors.append(
-            array_ops.reshape(tensor, shape=(batch_size, num_elements)))
-        if cols_to_vars is not None:
-          # Retrieve any variables created (some _DenseColumn's don't create
-          # variables, in which case an empty list is returned).
-          cols_to_vars[column] = ops.get_collection(
-              ops.GraphKeys.GLOBAL_VARIABLES,
-              scope=variable_scope.get_variable_scope().name)
-    _verify_static_batch_size_equality(output_tensors, ordered_columns)
-    return array_ops.concat(output_tensors, 1)
+  Some `FeatureColumn`s create variables or resources to assist their
+  computation. The `StateManager` is responsible for creating and storing these
+  objects since `FeatureColumn`s are supposed to be stateless configuration
+  only.
+  """
+
+  def create_variable(self,
+                      feature_column,
+                      name,
+                      shape,
+                      dtype=None,
+                      trainable=True,
+                      initializer=None):
+    """Creates a new variable.
+
+    Args:
+      feature_column: A `FeatureColumn` object this variable corresponds to.
+      name: variable name.
+      shape: variable shape.
+      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+      trainable: Whether this variable is trainable or not.
+      initializer: initializer instance (callable).
+
+    Returns:
+      The created variable.
+    """
+    del feature_column, name, shape, dtype, trainable, initializer
+    raise NotImplementedError('StateManager.create_variable')
+
+  def add_variable(self, feature_column, var):
+    """Adds an existing variable to the state.
+
+    Args:
+      feature_column: A `FeatureColumn` object to associate this variable with.
+      var: The variable.
+    """
+    del feature_column, var
+    raise NotImplementedError('StateManager.add_variable')
+
+  def get_variable(self, feature_column, name):
+    """Returns an existing variable.
+
+    Args:
+      feature_column: A `FeatureColumn` object this variable corresponds to.
+      name: variable name.
+    """
+    del feature_column, name
+    raise NotImplementedError('StateManager.get_var')
+
+  def add_resource(self, feature_column, name, resource):
+    """Creates a new resource.
+
+    Resources can be things such as tables etc.
+
+    Args:
+      feature_column: A `FeatureColumn` object this resource corresponds to.
+      name: Name of the resource.
+      resource: The resource.
+
+    Returns:
+      The created resource.
+    """
+    del feature_column, name, resource
+    raise NotImplementedError('StateManager.add_resource')
 
+  def get_resource(self, feature_column, name):
+    """Returns an already created resource.
 
-def input_layer(features,
-                feature_columns,
-                weight_collections=None,
-                trainable=True,
-                cols_to_vars=None):
-  """Returns a dense `Tensor` as input layer based on given `feature_columns`.
+    Resources can be things such as tables etc.
+
+    Args:
+      feature_column: A `FeatureColumn` object this variable corresponds to.
+      name: Name of the resource.
+    """
+    del feature_column, name
+    raise NotImplementedError('StateManager.get_resource')
+
+
+class _InputLayerStateManager(StateManager):
+  """Manages the state of InputLayer."""
+
+  def __init__(self, layer, feature_columns, trainable):
+    """Creates an _InputLayerStateManager object.
+
+    Args:
+      layer: The input layer this state manager is associated with.
+      feature_columns: List of feature columns for the input layer
+      trainable: Whether by default, variables created are trainable or not.
+    """
+    self._trainable = trainable
+    self._layer = layer
+    self._cols_to_vars_map = {}
+    self._cols_to_names_map = {}
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      self._cols_to_vars_map[column] = {}
+      base_name = column.name
+      if isinstance(column, SharedEmbeddingColumn):
+        base_name = column.shared_collection_name
+      with variable_scope.variable_scope(base_name) as vs:
+        self._cols_to_names_map[column] = _strip_leading_slashes(vs.name)
+
+  def create_variable(self,
+                      feature_column,
+                      name,
+                      shape,
+                      dtype=None,
+                      trainable=True,
+                      initializer=None):
+    if name in self._cols_to_vars_map[feature_column]:
+      raise ValueError('Variable already exists.')
+    with variable_scope.variable_scope(self._cols_to_names_map[feature_column]):
+      var = self._layer.add_variable(
+          name=name,
+          shape=shape,
+          dtype=dtype,
+          initializer=initializer,
+          trainable=self._trainable and trainable,
+          # TODO(rohanj): Get rid of this hack once we have a mechanism for
+          # specifying a default partitioner for an entire layer. In that case,
+          # the default getter for Layers should work.
+          getter=variable_scope.get_variable)
+      self._cols_to_vars_map[feature_column][name] = var
+      return var
+
+  def get_variable(self, feature_column, name):
+    if name in self._cols_to_vars_map[feature_column]:
+      return self._cols_to_vars_map[feature_column][name]
+    raise ValueError('Variable does not exist.')
+
+
+class FeatureLayer(Layer):
+  """A layer that produces a dense `Tensor` based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
   At the first layer of the model, this column oriented data should be converted
   to a single `Tensor`.
 
+  This layer can be called multiple times with different features.
+
   Example:
 
   ```python
@@ -233,105 +314,122 @@ def input_layer(features,
       categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
   columns = [price, keywords_embedded, ...]
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  dense_tensor = input_layer(features, columns)
+  feature_layer = FeatureLayer(columns)
+  dense_tensor = feature_layer(features)
   for units in [128, 64, 32]:
     dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
-  prediction = tf.layers.dense(dense_tensor, 1)
-  ```
-
-  Args:
-    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
-      keys. For example `numeric_column('price')` will look at 'price' key in
-      this dict. Values can be a `SparseTensor` or a `Tensor` depends on
-      corresponding `_FeatureColumn`.
-    feature_columns: An iterable containing the FeatureColumns to use as inputs
-      to your model. All items should be instances of classes derived from
-      `_DenseColumn` such as `numeric_column`, `embedding_column`,
-      `bucketized_column`, `indicator_column`. If you have categorical features,
-      you can wrap them with an `embedding_column` or `indicator_column`.
-    weight_collections: A list of collection names to which the Variable will be
-      added. Note that variables will also be added to collections
-      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
-    trainable: If `True` also add the variable to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
-      mapping from `_FeatureColumn` to list of `Variable`s.  For example, after
-      the call, we might have cols_to_vars =
-      {_EmbeddingColumn(
-        categorical_column=_HashedCategoricalColumn(
-          key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
-        dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10),
-                        <tf.Variable 'some_variable:1' shape=(5, 10)]}
-      If a column creates no variables, its value will be an empty list.
-
-  Returns:
-    A `Tensor` which represents input layer of a model. Its shape
-    is (batch_size, first_layer_dimension) and its dtype is `float32`.
-    first_layer_dimension is determined based on given `feature_columns`.
-
-  Raises:
-    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
-  """
-  return _internal_input_layer(features, feature_columns, weight_collections,
-                               trainable, cols_to_vars)
-
-
-# TODO(akshayka): InputLayer should be a subclass of Layer, and it
-# should implement the logic in input_layer using Layer's build-and-call
-# paradigm; input_layer should create an instance of InputLayer and
-# return the result of invoking its apply method, just as functional layers do.
-class InputLayer(object):
-  """An object-oriented version of `input_layer` that reuses variables."""
+  prediction = tf.layers.dense(dense_tensor, 1)."""
 
   def __init__(self,
                feature_columns,
-               weight_collections=None,
                trainable=True,
-               cols_to_vars=None):
-    """See `input_layer`."""
+               name=None,
+               shared_state_manager=None,
+               **kwargs):
+    """Constructs a FeatureLayer.
 
-    self._feature_columns = feature_columns
-    self._weight_collections = weight_collections
-    self._trainable = trainable
-    self._cols_to_vars = cols_to_vars
-    self._input_layer_template = template.make_template(
-        'feature_column_input_layer',
-        _internal_input_layer,
-        create_scope_now_=True)
-    self._scope = self._input_layer_template.variable_scope
-
-  def __call__(self, features):
-    return self._input_layer_template(
-        features=features,
-        feature_columns=self._feature_columns,
-        weight_collections=self._weight_collections,
-        trainable=self._trainable,
-        cols_to_vars=None,
-        scope=self._scope)
+    Args:
+      feature_columns: An iterable containing the FeatureColumns to use as
+        inputs to your model. All items should be instances of classes derived
+        from `DenseColumn` such as `numeric_column`, `embedding_column`,
+        `bucketized_column`, `indicator_column`. If you have categorical
+        features, you can wrap them with an `embedding_column` or
+        `indicator_column`.
+      trainable: If `True` also add the variable to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      name: Name to give to the FeatureLayer.
+      shared_state_manager: SharedEmbeddingStateManager that manages the state
+        of SharedEmbeddingColumns. The state of SharedEmbeddingColumns, unlike
+        regular embedding columns cannot be owned by the InputLayer itself since
+        SharedEmbeddingColumns can be shared across different InputLayers. As a
+        result users are expected to create a SharedEmbeddingStateManager object
+        which would be responsible for managing the shared state and can be
+        passed into different InputLayer objects to share state. For example,
+
+        ```python
+        sc_1, sc_2 = shared_embedding_column_v2(...)
+        sc_3, sc_4 = shared_embedding_column_v2(...)
+        ssm = SharedEmbeddingStateManager()
+        feature_layer1 = FeatureLayer([sc_1, sc_3], ...,
+                                      shared_state_manager=ssm)
+        feature_layer2 = FeatureLayer([sc_2, sc_4], ...,
+                                      shared_state_manager=ssm)
+        ```
+        now input_layer1 and input_layer2 will share variables across. If
+        sharing is not desired, one can create 2 separate
+        SharedEmbeddingStateManager objects
+
+        ```python
+        ssm1 = SharedEmbeddingStateManager()
+        ssm2 = SharedEmbeddingStateManager()
+        feature_layer1 = FeatureLayer([sc_1, sc_3], ...,
+                                      shared_state_manager=ssm1)
+        feature_layer2 = FeatureLayer([sc_2, sc_4], ...,
+                                      shared_state_manager=ssm2)
+        ```
+      **kwargs: Keyword arguments to construct a layer.
 
-  @property
-  def non_trainable_variables(self):
-    return self._input_layer_template.non_trainable_variables
+    Raises:
+      ValueError: if an item in `feature_columns` is not a `DenseColumn`.
+    """
+    super(FeatureLayer, self).__init__(name=name, trainable=trainable, **kwargs)
 
-  @property
-  def non_trainable_weights(self):
-    return self._input_layer_template.non_trainable_weights
+    self._feature_columns = _normalize_feature_columns(feature_columns)
+    self._state_manager = _InputLayerStateManager(self, self._feature_columns,
+                                                  self.trainable)
+    self._shared_state_manager = shared_state_manager
+    for column in sorted(self._feature_columns, key=lambda x: x.name):
+      if not isinstance(column, DenseColumn):
+        raise ValueError(
+            'Items of feature_columns must be a DenseColumn. '
+            'You can wrap a categorical column with an '
+            'embedding_column or indicator_column. Given: {}'.format(column))
 
-  @property
-  def trainable_variables(self):
-    return self._input_layer_template.trainable_variables
+  def build(self, _):
+    for column in sorted(self._feature_columns, key=lambda x: x.name):
+      if isinstance(column, SharedEmbeddingColumn):
+        column.create_state(self._shared_state_manager)
+      else:
+        with variable_scope.variable_scope(None, default_name=self.name):
+          column.create_state(self._state_manager)
+      super(FeatureLayer, self).build(None)
 
-  @property
-  def trainable_weights(self):
-    return self._input_layer_template.trainable_weights
+  def call(self, features, cols_to_output_tensors=None):
+    """Returns a dense tensor corresponding to the `feature_columns`.
 
-  @property
-  def variables(self):
-    return self._input_layer_template.variables
+    Args:
+      features: A mapping from key to tensors. `FeatureColumn`s look up via
+        these keys. For example `numeric_column('price')` will look at 'price'
+        key in this dict. Values can be a `SparseTensor` or a `Tensor` depends
+        on corresponding `FeatureColumn`.
+      cols_to_output_tensors: If not `None`, this will be filled with a dict
+        mapping feature columns to output tensors created.
 
-  @property
-  def weights(self):
-    return self._input_layer_template.weights
+    Returns:
+      A `Tensor` which represents input layer of a model. Its shape
+      is (batch_size, first_layer_dimension) and its dtype is `float32`.
+      first_layer_dimension is determined based on given `feature_columns`.
+    """
+    transformation_cache = FeatureTransformationCache(features)
+    output_tensors = []
+    ordered_columns = []
+    for column in sorted(self._feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      if isinstance(column, SharedEmbeddingColumn):
+        tensor = column.get_dense_tensor(transformation_cache,
+                                         self._shared_state_manager)
+      else:
+        tensor = column.get_dense_tensor(transformation_cache,
+                                         self._state_manager)
+      num_elements = column.variable_shape.num_elements()
+      batch_size = array_ops.shape(tensor)[0]
+      tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
+      output_tensors.append(tensor)
+      if cols_to_output_tensors is not None:
+        cols_to_output_tensors[column] = tensor
+
+    _verify_static_batch_size_equality(output_tensors, ordered_columns)
+    return array_ops.concat(output_tensors, 1)
 
 
 def linear_model(features,
@@ -565,12 +663,15 @@ class _BiasLayer(base.Layer):
     return self._bias_variable
 
 
-def _get_expanded_variable_list(variable):
-  if (isinstance(variable, variables.Variable) or
-      resource_variable_ops.is_resource_variable(variable)):
-    return [variable]  # Single variable case.
-  else:  # Must be a PartitionedVariable, so convert into a list.
-    return list(variable)
+def _get_expanded_variable_list(var_list):
+  returned_list = []
+  for variable in var_list:
+    if (isinstance(variable, variables.Variable) or
+        resource_variable_ops.is_resource_variable(variable)):
+      returned_list.append(variable)  # Single variable case.
+    else:  # Must be a PartitionedVariable, so convert into a list.
+      returned_list.extend(list(variable))
+  return returned_list
 
 
 def _strip_leading_slashes(name):
@@ -661,7 +762,7 @@ class _LinearModel(training.Model):
               scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
           name='weighted_sum')
       bias = self._bias_layer.variables[0]
-      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
+      self._cols_to_vars['bias'] = _get_expanded_variable_list([bias])
     return predictions
 
   def _add_layers(self, layers):
@@ -877,10 +978,15 @@ def embedding_column(
       trainable=trainable)
 
 
-def shared_embedding_columns(
-    categorical_columns, dimension, combiner='mean', initializer=None,
-    shared_embedding_collection_name=None, ckpt_to_load_from=None,
-    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
+def shared_embedding_columns_v2(categorical_columns,
+                                dimension,
+                                combiner='mean',
+                                initializer=None,
+                                shared_embedding_collection_name=None,
+                                ckpt_to_load_from=None,
+                                tensor_name_in_ckpt=None,
+                                max_norm=None,
+                                trainable=True):
   """List of dense columns that convert from sparse, categorical input.
 
   This is similar to `embedding_column`, except that it produces a list of
@@ -1803,51 +1909,6 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
       keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
 
 
-class StateManager(object):
-  """Manages the state associated with FeatureColumns.
-
-  Some `FeatureColumn`s create variables or resources to assist their
-  computation. The `StateManager` is responsible for creating and storing these
-  objects since `FeatureColumn`s are supposed to be stateless configuration
-  only.
-  """
-
-  def get_variable(self,
-                   feature_column,
-                   name,
-                   shape,
-                   dtype=None,
-                   initializer=None):
-    """Creates a new variable or returns an existing one.
-
-    Args:
-      feature_column: A `FeatureColumn` object this variable corresponds to.
-      name: variable name.
-      shape: variable shape.
-      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
-      initializer: initializer instance (callable).
-
-    Returns:
-      The variable.
-    """
-    raise NotImplementedError('StateManager.get_variable')
-
-  def get_resource(self, feature_column, name, resource_creator):
-    """Creates a new resource or returns an existing one.
-
-    Resources can be things such as tables etc.
-
-    Args:
-      feature_column: A `FeatureColumn` object this variable corresponds to.
-      name: Name of the resource.
-      resource_creator: A callable that can create the resource.
-
-    Returns:
-      The resource.
-    """
-    raise NotImplementedError('StateManager.get_resource')
-
-
 class FeatureColumn(object):
   """Represents a feature column abstraction.
 
@@ -2550,6 +2611,17 @@ class EmbeddingColumn(
     """See `DenseColumn` base class."""
     return tensor_shape.vector(self.dimension)
 
+  def create_state(self, state_manager):
+    """Creates the embedding lookup variable."""
+    embedding_shape = (self.categorical_column.num_buckets, self.dimension)
+    state_manager.create_variable(
+        self,
+        name='embedding_weights',
+        shape=embedding_shape,
+        dtype=dtypes.float32,
+        trainable=self.trainable,
+        initializer=self.initializer)
+
   def _get_dense_tensor_internal(self, transformation_cache, state_manager):
     """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
@@ -2558,13 +2630,8 @@ class EmbeddingColumn(
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    embedding_shape = (self.categorical_column.num_buckets, self.dimension)
     embedding_weights = state_manager.get_variable(
-        self,
-        name='embedding_weights',
-        shape=embedding_shape,
-        dtype=dtypes.float32,
-        initializer=self.initializer)
+        self, name='embedding_weights')
 
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
@@ -2637,6 +2704,68 @@ def _get_graph_for_variable(var):
     return var.graph
 
 
+class SharedEmbeddingStateManager(Layer):
+  """A state manager that handle the state of shared embedding columns.
+
+  This can handle multiple sets of columns that share variables."""
+
+  def __init__(self, trainable=True, name=None, **kwargs):
+    """Constructs a `SharedEmbeddingStateManager`.
+
+    Args:
+      trainable: If true, variables created are trainable.
+      name: Name of the State Manager.
+      **kwargs: Keyword arguments.
+    """
+    super(SharedEmbeddingStateManager, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+    self._var_dict = {}
+
+  def create_variable(self,
+                      name,
+                      shape,
+                      dtype=None,
+                      trainable=True,
+                      initializer=None):
+    """Creates a variable.
+
+    Makes sure only one var is created per `shared_collection_name`. `name` is
+    ignored here as the variable is named `shared_collection_name` instead.
+
+    Args:
+      name: Name of the variable. Not used.
+      shape: Variable shape.
+      dtype: Variable type.
+      trainable: If variable created should be trainable or not.
+      initializer: Variable initializer.
+
+    Returns:
+      A variable or partitioned variable.
+    """
+    if name in self._var_dict:
+      var = self._var_dict[name]
+      return var
+    with variable_scope.variable_scope(
+        self.name, reuse=variable_scope.AUTO_REUSE):
+      var = self.add_variable(
+          name=name,
+          shape=shape,
+          dtype=dtype,
+          trainable=self.trainable and trainable,
+          initializer=initializer,
+          # TODO(rohanj): Get rid of this hack once we have a mechanism for
+          # specifying a default partitioner for an entire layer. In that case,
+          # the default getter for Layers should work.
+          getter=variable_scope.get_variable)
+    self._var_dict[name] = var
+    return var
+
+  def get_variable(self, feature_column, name):
+    if name not in self._var_dict:
+      raise ValueError('Variable name: {} not recognized.'.format(name))
+    return self._var_dict[name]
+
+
 class SharedEmbeddingColumn(
     DenseColumn, SequenceDenseColumn,
     collections.namedtuple(
@@ -2675,6 +2804,16 @@ class SharedEmbeddingColumn(
     """See `DenseColumn` base class."""
     return tensor_shape.vector(self.dimension)
 
+  def create_state(self, state_manager):
+    """Creates the shared embedding lookup variable."""
+    embedding_shape = (self.categorical_column.num_buckets, self.dimension)
+    state_manager.create_variable(
+        name=self.shared_collection_name,
+        shape=embedding_shape,
+        dtype=dtypes.float32,
+        trainable=self.trainable,
+        initializer=self.initializer)
+
   def _get_dense_tensor_internal(self, transformation_cache, state_manager):
     """Private method that follows the signature of _get_dense_tensor."""
     # This method is called from a variable_scope with name _var_scope_name,
@@ -2687,13 +2826,8 @@ class SharedEmbeddingColumn(
       sparse_ids = sparse_tensors.id_tensor
       sparse_weights = sparse_tensors.weight_tensor
 
-      embedding_shape = (self.categorical_column.num_buckets, self.dimension)
       embedding_weights = state_manager.get_variable(
-          self,
-          name='embedding_weights',
-          shape=embedding_shape,
-          dtype=dtypes.float32,
-          initializer=self.initializer)
+          self, name=self.shared_collection_name)
 
       if self.ckpt_to_load_from is not None:
         to_restore = embedding_weights
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 80a9d5d40e..6b343ecf3e 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -33,12 +33,12 @@ from tensorflow.python.eager import context
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column.feature_column_v2 import _LinearModel
+from tensorflow.python.feature_column.feature_column_v2 import _transform_features
 from tensorflow.python.feature_column.feature_column_v2 import FeatureColumn
+from tensorflow.python.feature_column.feature_column_v2 import FeatureLayer
 from tensorflow.python.feature_column.feature_column_v2 import FeatureTransformationCache
-from tensorflow.python.feature_column.feature_column_v2 import InputLayer
 from tensorflow.python.feature_column.feature_column_v2 import StateManager
-from tensorflow.python.feature_column.feature_column_v2 import _LinearModel
-from tensorflow.python.feature_column.feature_column_v2 import _transform_features
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -269,7 +269,7 @@ class NumericColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([price]))
     self.assertIn('price', features)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
   def test_parse_example_with_default_value(self):
@@ -291,7 +291,7 @@ class NumericColumnTest(test.TestCase):
                     no_data.SerializeToString()],
         features=fc.make_parse_example_spec([price]))
     self.assertIn('price', features)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[20., 110.], [11., 11.]], features['price'].eval())
 
   def test_normalizer_fn_must_be_callable(self):
@@ -305,7 +305,7 @@ class NumericColumnTest(test.TestCase):
 
     price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price], None)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
 
   def test_get_dense_tensor(self):
@@ -439,7 +439,7 @@ class BucketizedColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([bucketized_price]))
     self.assertIn('price', features)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
   def test_transform_feature(self):
@@ -717,7 +717,7 @@ class HashedCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -736,7 +736,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     output = outputs[hashed_sparse]
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [6, 4, 1]
-    with self.test_session():
+    with self.cached_session():
       self.assertEqual(dtypes.int64, output.values.dtype)
       self.assertAllEqual(expected_values, output.values.eval())
       self.assertAllEqual(wire_tensor.indices.eval(), output.indices.eval())
@@ -792,7 +792,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     output = transformation_cache.get(hashed_sparse, None)
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [3, 7, 5]
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(expected_values, output.values.eval())
 
   def test_int32_64_is_compatible(self):
@@ -806,7 +806,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     output = transformation_cache.get(hashed_sparse, None)
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [3, 7, 5]
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(expected_values, output.values.eval())
 
   def test_get_sparse_tensors(self):
@@ -824,22 +824,6 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(
         transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor)
 
-  def DISABLED_test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_hash_bucket('aaa', 10)
-    inputs = sparse_tensor.SparseTensor(
-        values=['omar', 'stringer', 'marlo'],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    column._get_sparse_tensors(
-        FeatureTransformationCache({
-            'aaa': inputs
-        }),
-        weight_collections=('my_weights',))
-
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
-    self.assertItemsEqual([], ops.get_collection('my_weights'))
-
   def test_get_sparse_tensors_dense_input(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
     transformation_cache = FeatureTransformationCache({
@@ -1000,7 +984,7 @@ class CrossedColumnTest(test.TestCase):
         features=fc.make_parse_example_spec([price_cross_wire]))
     self.assertIn('price', features)
     self.assertIn('wire', features)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
       wire_sparse = features['wire']
       self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval())
@@ -1023,7 +1007,7 @@ class CrossedColumnTest(test.TestCase):
     }
     outputs = _transform_features(features, [price_cross_wire], None)
     output = outputs[price_cross_wire]
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       output_val = sess.run(output)
       self.assertAllEqual(
           [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
@@ -2640,13 +2624,13 @@ class _LinearModelTest(test.TestCase):
         sess.run(net, feed_dict={features['price']: np.array(1)})
 
 
-class InputLayerTest(test.TestCase):
+class FeatureLayerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def test_retrieving_input(self):
     features = {'a': [0.]}
-    input_layer = InputLayer(fc_old.numeric_column('a'))
-    inputs = self.evaluate(input_layer(features))
+    feature_layer = FeatureLayer(fc.numeric_column('a'))
+    inputs = self.evaluate(feature_layer(features))
     self.assertAllClose([[0.]], inputs)
 
   def test_reuses_variables(self):
@@ -2657,7 +2641,7 @@ class InputLayerTest(test.TestCase):
           dense_shape=(3, 3))
 
       # Create feature columns (categorical and embedding).
-      categorical_column = fc_old.categorical_column_with_identity(
+      categorical_column = fc.categorical_column_with_identity(
           key='a', num_buckets=3)
       embedding_dimension = 2
       def _embedding_column_initializer(shape, dtype, partition_info):
@@ -2670,16 +2654,16 @@ class InputLayerTest(test.TestCase):
             (1, 1))  # id 2
         return embedding_values
 
-      embedding_column = fc_old.embedding_column(
+      embedding_column = fc.embedding_column(
           categorical_column,
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
 
-      input_layer = InputLayer([embedding_column])
+      feature_layer = FeatureLayer([embedding_column])
       features = {'a': sparse_input}
 
-      inputs = input_layer(features)
-      variables = input_layer.variables
+      inputs = feature_layer(features)
+      variables = feature_layer.variables
 
       # Sanity check: test that the inputs are correct.
       self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
@@ -2687,13 +2671,13 @@ class InputLayerTest(test.TestCase):
       # Check that only one variable was created.
       self.assertEqual(1, len(variables))
 
-      # Check that invoking input_layer on the same features does not create
+      # Check that invoking feature_layer on the same features does not create
       # additional variables
-      _ = input_layer(features)
+      _ = feature_layer(features)
       self.assertEqual(1, len(variables))
-      self.assertEqual(variables[0], input_layer.variables[0])
+      self.assertEqual(variables[0], feature_layer.variables[0])
 
-  def test_feature_column_input_layer_gradient(self):
+  def test_feature_column_feature_layer_gradient(self):
     with context.eager_mode():
       sparse_input = sparse_tensor.SparseTensor(
           indices=((0, 0), (1, 0), (2, 0)),
@@ -2701,7 +2685,7 @@ class InputLayerTest(test.TestCase):
           dense_shape=(3, 3))
 
       # Create feature columns (categorical and embedding).
-      categorical_column = fc_old.categorical_column_with_identity(
+      categorical_column = fc.categorical_column_with_identity(
           key='a', num_buckets=3)
       embedding_dimension = 2
 
@@ -2715,16 +2699,16 @@ class InputLayerTest(test.TestCase):
             (1, 1))  # id 2
         return embedding_values
 
-      embedding_column = fc_old.embedding_column(
+      embedding_column = fc.embedding_column(
           categorical_column,
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
 
-      input_layer = InputLayer([embedding_column])
+      feature_layer = FeatureLayer([embedding_column])
       features = {'a': sparse_input}
 
       def scale_matrix():
-        matrix = input_layer(features)
+        matrix = feature_layer(features)
         return 2 * matrix
 
       # Sanity check: Verify that scale_matrix returns the correct output.
@@ -2739,185 +2723,139 @@ class InputLayerTest(test.TestCase):
       self.assertAllEqual([0, 1, 2], indexed_slice.indices)
       self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
 
-
-class FunctionalInputLayerTest(test.TestCase):
-
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
-      fc.input_layer(features={}, feature_columns=[])
+      FeatureLayer(feature_columns=[])(features={})
 
   def test_should_be_dense_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
-      fc.input_layer(
-          features={'a': [[0]]},
-          feature_columns=[
-              fc_old.categorical_column_with_hash_bucket('wire_cast', 4)
-          ])
+    with self.assertRaisesRegexp(ValueError, 'must be a DenseColumn'):
+      FeatureLayer(feature_columns=[
+          fc.categorical_column_with_hash_bucket('wire_cast', 4)
+      ])(
+          features={
+              'a': [[0]]
+          })
 
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
-      fc.input_layer(
-          features={'a': [[0]]},
-          feature_columns={'a': fc_old.numeric_column('a')})
+      FeatureLayer(feature_columns={'a': fc.numeric_column('a')})(
+          features={
+              'a': [[0]]
+          })
 
   def test_bare_column(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.]}
-      net = fc.input_layer(features, fc_old.numeric_column('a'))
+      net = FeatureLayer(fc.numeric_column('a'))(features)
       with _initialized_session():
         self.assertAllClose([[0.]], net.eval())
 
   def test_column_generator(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.], 'b': [1.]}
-      columns = (fc_old.numeric_column(key) for key in features)
-      net = fc.input_layer(features, columns)
+      columns = (fc.numeric_column(key) for key in features)
+      net = FeatureLayer(columns)(features)
       with _initialized_session():
         self.assertAllClose([[0., 1.]], net.eval())
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
-      fc.input_layer(
-          features={'a': [[0]]},
-          feature_columns=[
-              fc_old.numeric_column('a'),
-              fc_old.numeric_column('a')
-          ])
+      FeatureLayer(
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])(
+                               features={
+                                   'a': [[0]]
+                               })
 
   def test_one_column(self):
-    price = fc_old.numeric_column('price')
+    price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      net = fc.input_layer(features, [price])
+      net = FeatureLayer([price])(features)
       with _initialized_session():
         self.assertAllClose([[1.], [5.]], net.eval())
 
   def test_multi_dimension(self):
-    price = fc_old.numeric_column('price', shape=2)
+    price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      net = fc.input_layer(features, [price])
+      net = FeatureLayer([price])(features)
       with _initialized_session():
         self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
 
   def test_raises_if_shape_mismatch(self):
-    price = fc_old.numeric_column('price', shape=2)
+    price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
           Exception,
           r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-        fc.input_layer(features, [price])
+        FeatureLayer([price])(features)
 
   def test_reshaping(self):
-    price = fc_old.numeric_column('price', shape=[1, 2])
+    price = fc.numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
-      net = fc.input_layer(features, [price])
+      net = FeatureLayer([price])(features)
       with _initialized_session():
         self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
 
   def test_multi_column(self):
-    price1 = fc_old.numeric_column('price1', shape=2)
-    price2 = fc_old.numeric_column('price2')
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [5., 6.]],
           'price2': [[3.], [4.]]
       }
-      net = fc.input_layer(features, [price1, price2])
+      net = FeatureLayer([price1, price2])(features)
       with _initialized_session():
         self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
 
-  def test_fills_cols_to_vars(self):
-    # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
-    # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
-    # creates a Variable.
-    price1 = fc_old.numeric_column('price1')
-    dense_feature = fc_old.numeric_column('dense_feature')
-    dense_feature_bucketized = fc_old.bucketized_column(
-        dense_feature, boundaries=[0.])
-    some_sparse_column = fc_old.categorical_column_with_hash_bucket(
-        'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc_old.embedding_column(
-        some_sparse_column, dimension=10)
-    with ops.Graph().as_default():
-      features = {
-          'price1': [[3.], [4.]],
-          'dense_feature': [[-1.], [4.]],
-          'sparse_feature': [['a'], ['x']],
-      }
-      cols_to_vars = {}
-      all_cols = [price1, dense_feature_bucketized, some_embedding_column]
-      fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars)
-      self.assertItemsEqual(list(cols_to_vars.keys()), all_cols)
-      self.assertEqual(0, len(cols_to_vars[price1]))
-      self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
-      self.assertEqual(1, len(cols_to_vars[some_embedding_column]))
-      self.assertIsInstance(cols_to_vars[some_embedding_column][0],
-                            variables_lib.Variable)
-      self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
-
-  def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc_old.numeric_column('price1')
-    dense_feature = fc_old.numeric_column('dense_feature')
-    dense_feature_bucketized = fc_old.bucketized_column(
-        dense_feature, boundaries=[0.])
-    some_sparse_column = fc_old.categorical_column_with_hash_bucket(
-        'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc_old.embedding_column(
-        some_sparse_column, dimension=10)
+  def test_cols_to_output_tensors(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
-      features = {
-          'price1': [[3.], [4.]],
-          'dense_feature': [[-1.], [4.]],
-          'sparse_feature': [['a'], ['x']],
-      }
-      cols_to_vars = {}
-      all_cols = [price1, dense_feature_bucketized, some_embedding_column]
-      with variable_scope.variable_scope(
-          'input_from_feature_columns',
-          partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0)):
-        fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars)
-      self.assertItemsEqual(list(cols_to_vars.keys()), all_cols)
-      self.assertEqual(0, len(cols_to_vars[price1]))
-      self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
-      self.assertEqual(3, len(cols_to_vars[some_embedding_column]))
-      self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [2, 10])
-      self.assertAllEqual(cols_to_vars[some_embedding_column][1].shape, [2, 10])
-      self.assertAllEqual(cols_to_vars[some_embedding_column][2].shape, [1, 10])
+      cols_dict = {}
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      feature_layer = FeatureLayer([price1, price2])
+      net = feature_layer(features, cols_dict)
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], cols_dict[price1].eval())
+        self.assertAllClose([[3.], [4.]], cols_dict[price2].eval())
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
 
   def test_column_order(self):
-    price_a = fc_old.numeric_column('price_a')
-    price_b = fc_old.numeric_column('price_b')
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
     with ops.Graph().as_default():
       features = {
           'price_a': [[1.]],
           'price_b': [[3.]],
       }
-      net1 = fc.input_layer(features, [price_a, price_b])
-      net2 = fc.input_layer(features, [price_b, price_a])
+      net1 = FeatureLayer([price_a, price_b])(features)
+      net2 = FeatureLayer([price_b, price_a])(features)
       with _initialized_session():
         self.assertAllClose([[1., 3.]], net1.eval())
         self.assertAllClose([[1., 3.]], net2.eval())
 
   def test_fails_for_categorical_column(self):
-    animal = fc_old.categorical_column_with_identity('animal', num_buckets=4)
+    animal = fc.categorical_column_with_identity('animal', num_buckets=4)
     with ops.Graph().as_default():
       features = {
           'animal':
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      with self.assertRaisesRegexp(Exception, 'must be a _DenseColumn'):
-        fc.input_layer(features, [animal])
+      with self.assertRaisesRegexp(Exception, 'must be a DenseColumn'):
+        FeatureLayer([animal])(features)
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -2926,12 +2864,12 @@ class FunctionalInputLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.input_layer(features, [price1, price2])
+        FeatureLayer([price1, price2])(features)
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
-    price3 = fc_old.numeric_column('price3')
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2941,31 +2879,31 @@ class FunctionalInputLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.input_layer(features, [price1, price2, price3])
+        FeatureLayer([price1, price2, price3])(features)
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
           'price2': [[3.], [4.]]  # batchsize = 2
       }
-      net = fc.input_layer(features, [price1, price2])
+      net = FeatureLayer([price1, price2])(features)
       with _initialized_session() as sess:
         with self.assertRaisesRegexp(errors.OpError,
                                      'Dimensions of inputs should match'):
           sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc_old.numeric_column('price1')
-    price2 = fc_old.numeric_column('price2')
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
           'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
       }
-      net = fc.input_layer(features, [price1, price2])
+      net = FeatureLayer([price1, price2])(features)
       with _initialized_session() as sess:
         sess.run(
             net,
@@ -2975,9 +2913,9 @@ class FunctionalInputLayerTest(test.TestCase):
             })
 
   def test_multiple_layers_with_same_embedding_column(self):
-    some_sparse_column = fc_old.categorical_column_with_hash_bucket(
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc_old.embedding_column(
+    some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)
 
     with ops.Graph().as_default():
@@ -2985,28 +2923,30 @@ class FunctionalInputLayerTest(test.TestCase):
           'sparse_feature': [['a'], ['x']],
       }
       all_cols = [some_embedding_column]
-      fc.input_layer(features, all_cols)
-      fc.input_layer(features, all_cols)
+      FeatureLayer(all_cols)(features)
+      FeatureLayer(all_cols)(features)
       # Make sure that 2 variables get created in this case.
       self.assertEqual(2, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
       expected_var_names = [
-          'input_layer/sparse_feature_embedding/embedding_weights:0',
-          'input_layer_1/sparse_feature_embedding/embedding_weights:0'
+          'feature_layer/sparse_feature_embedding/embedding_weights:0',
+          'feature_layer_1/sparse_feature_embedding/embedding_weights:0'
       ]
       self.assertItemsEqual(
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
   def test_multiple_layers_with_same_shared_embedding_column(self):
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc_old.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
+    shared_state_manager = fc.SharedEmbeddingStateManager(
+        name='shared_feature_layer')
 
     with ops.Graph().as_default():
       features = {
@@ -3022,27 +2962,33 @@ class FunctionalInputLayerTest(test.TestCase):
                   dense_shape=(2, 2)),
       }
       all_cols = [embedding_column_a, embedding_column_b]
-      fc.input_layer(features, all_cols)
-      fc.input_layer(features, all_cols)
+      FeatureLayer(
+          all_cols, shared_state_manager=shared_state_manager)(
+              features)
+      FeatureLayer(
+          all_cols, shared_state_manager=shared_state_manager)(
+              features)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
       self.assertItemsEqual(
-          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          ['shared_feature_layer/aaa_bbb_shared_embedding:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
   def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc_old.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     all_cols = [embedding_column_a, embedding_column_b]
 
     with ops.Graph().as_default():
+      shared_state_manager1 = fc.SharedEmbeddingStateManager(
+          name='shared_feature_layer')
       features = {
           'aaa':
               sparse_tensor.SparseTensor(
@@ -3055,12 +3001,16 @@ class FunctionalInputLayerTest(test.TestCase):
                   values=(1, 2, 1),
                   dense_shape=(2, 2)),
       }
-      fc.input_layer(features, all_cols)
+      FeatureLayer(
+          all_cols, shared_state_manager=shared_state_manager1)(
+              features)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
 
     with ops.Graph().as_default():
+      shared_state_manager2 = fc.SharedEmbeddingStateManager(
+          name='shared_feature_layer')
       features1 = {
           'aaa':
               sparse_tensor.SparseTensor(
@@ -3074,12 +3024,14 @@ class FunctionalInputLayerTest(test.TestCase):
                   dense_shape=(2, 2)),
       }
 
-      fc.input_layer(features1, all_cols)
+      FeatureLayer(
+          all_cols, shared_state_manager=shared_state_manager2)(
+              features1)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
       self.assertItemsEqual(
-          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          ['shared_feature_layer/aaa_bbb_shared_embedding:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
   def test_with_numpy_input_fn(self):
@@ -3092,14 +3044,14 @@ class FunctionalInputLayerTest(test.TestCase):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in input_layer
-    price = fc_old.numeric_column('price')
-    body_style = fc_old.categorical_column_with_vocabulary_list(
+    # price has 1 dimension in feature_layer
+    price = fc.numeric_column('price')
+    body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    # one_hot_body_style has 3 dims in input_layer.
-    one_hot_body_style = fc_old.indicator_column(body_style)
-    # embedded_body_style has 5 dims in input_layer.
-    embedded_body_style = fc_old.embedding_column(
+    # one_hot_body_style has 3 dims in feature_layer.
+    one_hot_body_style = fc.indicator_column(body_style)
+    # embedded_body_style has 5 dims in feature_layer.
+    embedded_body_style = fc.embedding_column(
         body_style, dimension=5, initializer=_initializer)
 
     input_fn = numpy_io.numpy_input_fn(
@@ -3110,8 +3062,8 @@ class FunctionalInputLayerTest(test.TestCase):
         batch_size=2,
         shuffle=False)
     features = input_fn()
-    net = fc.input_layer(features,
-                         [price, one_hot_body_style, embedded_body_style])
+    net = FeatureLayer([price, one_hot_body_style, embedded_body_style])(
+        features)
     self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
       coord = coordinator.Coordinator()
@@ -3137,18 +3089,18 @@ class FunctionalInputLayerTest(test.TestCase):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in input_layer
-    price = fc_old.numeric_column('price')
+    # price has 1 dimension in feature_layer
+    price = fc.numeric_column('price')
 
-    # one_hot_body_style has 3 dims in input_layer.
-    body_style = fc_old.categorical_column_with_vocabulary_list(
+    # one_hot_body_style has 3 dims in feature_layer.
+    body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    one_hot_body_style = fc_old.indicator_column(body_style)
+    one_hot_body_style = fc.indicator_column(body_style)
 
-    # embedded_body_style has 5 dims in input_layer.
-    country = fc_old.categorical_column_with_vocabulary_list(
+    # embedded_body_style has 5 dims in feature_layer.
+    country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc_old.embedding_column(
+    embedded_country = fc.embedding_column(
         country, dimension=5, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
@@ -3165,8 +3117,7 @@ class FunctionalInputLayerTest(test.TestCase):
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
     self.assertEqual(1, features['country'].shape.ndims)
 
-    net = fc.input_layer(features,
-                         [price, one_hot_body_style, embedded_country])
+    net = FeatureLayer([price, one_hot_body_style, embedded_country])(features)
     self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
 
@@ -3187,18 +3138,18 @@ class FunctionalInputLayerTest(test.TestCase):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in input_layer
-    price = fc_old.numeric_column('price')
+    # price has 1 dimension in feature_layer
+    price = fc.numeric_column('price')
 
-    # one_hot_body_style has 3 dims in input_layer.
-    body_style = fc_old.categorical_column_with_vocabulary_list(
+    # one_hot_body_style has 3 dims in feature_layer.
+    body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    one_hot_body_style = fc_old.indicator_column(body_style)
+    one_hot_body_style = fc.indicator_column(body_style)
 
-    # embedded_body_style has 5 dims in input_layer.
-    country = fc_old.categorical_column_with_vocabulary_list(
+    # embedded_body_style has 5 dims in feature_layer.
+    country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc_old.embedding_column(
+    embedded_country = fc.embedding_column(
         country, dimension=2, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
@@ -3219,8 +3170,7 @@ class FunctionalInputLayerTest(test.TestCase):
         dense_shape=(2,))
     country_data = np.array([['US'], ['CA']])
 
-    net = fc.input_layer(features,
-                         [price, one_hot_body_style, embedded_country])
+    net = FeatureLayer([price, one_hot_body_style, embedded_country])(features)
     self.assertEqual(1 + 3 + 2, net.shape[1])
     with _initialized_session() as sess:
 
@@ -3237,8 +3187,8 @@ class FunctionalInputLayerTest(test.TestCase):
               }))
 
   def test_with_rank_0_feature(self):
-    # price has 1 dimension in input_layer
-    price = fc_old.numeric_column('price')
+    # price has 1 dimension in feature_layer
+    price = fc.numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -3246,13 +3196,13 @@ class FunctionalInputLayerTest(test.TestCase):
 
     # Static rank 0 should fail
     with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
-      fc.input_layer(features, [price])
+      FeatureLayer([price])(features)
 
     # Dynamic rank 0 should fail
     features = {
         'price': array_ops.placeholder(dtypes.float32),
     }
-    net = fc.input_layer(features, [price])
+    net = FeatureLayer([price])(features)
     self.assertEqual(1, net.shape[1])
     with _initialized_session() as sess:
       with self.assertRaisesOpError('Feature .* cannot have rank 0'):
@@ -3267,7 +3217,7 @@ class MakeParseExampleSpecTest(test.TestCase):
 
     @property
     def name(self):
-      return "_TestFeatureColumn"
+      return '_TestFeatureColumn'
 
     def transform_feature(self, transformation_cache, state_manager):
       pass
@@ -3427,7 +3377,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     column.get_sparse_tensors(FeatureTransformationCache({'aaa': inputs}), None)
     with self.assertRaisesRegexp(errors.OpError, 'file_does_not_exist'):
-      with self.test_session():
+      with self.cached_session():
         lookup_ops.tables_initializer().run()
 
   def test_invalid_vocabulary_size(self):
@@ -3451,7 +3401,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     column.get_sparse_tensors(FeatureTransformationCache({'aaa': inputs}), None)
     with self.assertRaisesRegexp(errors.OpError, 'Invalid vocab_size'):
-      with self.test_session():
+      with self.cached_session():
         lookup_ops.tables_initializer().run()
 
   def test_invalid_num_oov_buckets(self):
@@ -3521,7 +3471,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -3593,25 +3543,6 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
                                       dense_shape=inputs.dense_shape),
                                   id_tensor.eval())
 
-  def DISABLED_test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_vocabulary_file(
-        key='aaa',
-        vocabulary_file=self._wire_vocabulary_file_name,
-        vocabulary_size=self._wire_vocabulary_size)
-    inputs = sparse_tensor.SparseTensor(
-        values=['omar', 'stringer', 'marlo'],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    column.get_sparse_tensors(
-        FeatureTransformationCache({
-            'aaa': inputs
-        }),
-        weight_collections=('my_weights',))
-
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
-    self.assertItemsEqual([], ops.get_collection('my_weights'))
-
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -3972,7 +3903,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -3994,7 +3925,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -4043,24 +3974,6 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_tensor.eval())
 
-  def DISABLED_test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
-    inputs = sparse_tensor.SparseTensor(
-        values=['omar', 'stringer', 'marlo'],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    column.get_sparse_tensors(
-        FeatureTransformationCache({
-            'aaa': inputs
-        }),
-        weight_collections=('my_weights',))
-
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
-    self.assertItemsEqual([], ops.get_collection('my_weights'))
-
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -4311,7 +4224,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -4356,22 +4269,6 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_tensor.eval())
 
-  def DISABLED_test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(0, 1, 0),
-        dense_shape=(2, 2))
-    column.get_sparse_tensors(
-        FeatureTransformationCache({
-            'aaa': inputs
-        }),
-        weight_collections=('my_weights',))
-
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
-    self.assertItemsEqual([], ops.get_collection('my_weights'))
-
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     id_weight_pair = column.get_sparse_tensors(
@@ -4595,7 +4492,7 @@ class IndicatorColumnTest(test.TestCase):
         'animal': ['fox', 'fox']
     })
     output = transformation_cache.get(animal, None)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
 
   def test_2D_shape_succeeds(self):
@@ -4610,7 +4507,7 @@ class IndicatorColumnTest(test.TestCase):
                 dense_shape=[2, 1])
     })
     output = transformation_cache.get(animal, None)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
 
   def test_multi_hot(self):
@@ -4623,7 +4520,7 @@ class IndicatorColumnTest(test.TestCase):
                 indices=[[0, 0], [0, 1]], values=[1, 1], dense_shape=[1, 2])
     })
     output = transformation_cache.get(animal, None)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
 
   def test_multi_hot2(self):
@@ -4635,7 +4532,7 @@ class IndicatorColumnTest(test.TestCase):
                 indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
     })
     output = transformation_cache.get(animal, None)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
 
   def test_deep_copy(self):
@@ -4660,7 +4557,7 @@ class IndicatorColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a_indicator]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -4765,16 +4662,16 @@ class IndicatorColumnTest(test.TestCase):
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
         self.assertAllClose([[2. + 3.]], predictions.eval())
 
-  def test_input_layer(self):
-    animal = fc_old.indicator_column(
-        fc_old.categorical_column_with_identity('animal', num_buckets=4))
+  def test_feature_layer(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      net = fc.input_layer(features, [animal])
+      net = FeatureLayer([animal])(features)
       with _initialized_session():
         self.assertAllClose([[0., 1., 1., 0.]], net.eval())
 
@@ -4786,12 +4683,13 @@ class _TestStateManager(StateManager):
     self._all_variables = {}
     self._trainable = trainable
 
-  def get_variable(self,
-                   feature_column,
-                   name,
-                   shape,
-                   dtype=None,
-                   initializer=None):
+  def create_variable(self,
+                      feature_column,
+                      name,
+                      shape,
+                      dtype=None,
+                      trainable=True,
+                      initializer=None):
     if feature_column not in self._all_variables:
       self._all_variables[feature_column] = {}
     var_dict = self._all_variables[feature_column]
@@ -4801,11 +4699,19 @@ class _TestStateManager(StateManager):
       var = variable_scope.get_variable(
           name=name,
           shape=shape,
-          initializer=initializer,
-          trainable=self._trainable)
+          dtype=dtype,
+          trainable=self._trainable and trainable,
+          initializer=initializer)
       var_dict[name] = var
       return var
 
+  def get_variable(self, feature_column, name):
+    if feature_column not in self._all_variables:
+      raise ValueError('Do not recognize FeatureColumn.')
+    if name in self._all_variables[feature_column]:
+      return self._all_variables[feature_column][name]
+    raise ValueError('Could not find variable.')
+
 
 class EmbeddingColumnTest(test.TestCase):
 
@@ -4898,7 +4804,7 @@ class EmbeddingColumnTest(test.TestCase):
         serialized=[data.SerializeToString()],
         features=fc.make_parse_example_spec([a_embedded]))
     self.assertIn('aaa', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -4967,6 +4873,7 @@ class EmbeddingColumnTest(test.TestCase):
         categorical_column, dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
+    embedding_column.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     embedding_lookup = embedding_column.get_dense_tensor(
@@ -5028,6 +4935,7 @@ class EmbeddingColumnTest(test.TestCase):
         categorical_column, dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
+    embedding_column.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     embedding_lookup = embedding_column.get_dense_tensor(
@@ -5043,36 +4951,6 @@ class EmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(embedding_values, global_vars[0].eval())
       self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
-  def DISABLED_test_get_dense_tensor_weight_collections(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 5))
-
-    # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=3)
-    embedding_column = fc.embedding_column(categorical_column, dimension=2)
-
-    # Provide sparse input and get dense result.
-    embedding_column.get_dense_tensor(
-        FeatureTransformationCache({
-            'aaa': sparse_input
-        }),
-        weight_collections=('my_vars',))
-
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('embedding_weights:0',),
-                          tuple([v.name for v in global_vars]))
-    my_vars = ops.get_collection('my_vars')
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in my_vars]))
-
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -5117,6 +4995,7 @@ class EmbeddingColumnTest(test.TestCase):
         categorical_column, dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
+    embedding_column.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     input_indices = array_ops.placeholder(dtype=dtypes.int64)
@@ -5187,6 +5066,7 @@ class EmbeddingColumnTest(test.TestCase):
         ckpt_to_load_from=ckpt_path,
         tensor_name_in_ckpt=ckpt_tensor)
     state_manager = _TestStateManager()
+    embedding_column.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     embedding_lookup = embedding_column.get_dense_tensor(
@@ -5354,7 +5234,7 @@ class EmbeddingColumnTest(test.TestCase):
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
         self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
 
-  def test_input_layer(self):
+  def test_feature_layer(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -5392,30 +5272,29 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old.embedding_column(
+    embedding_column = fc.embedding_column(
         categorical_column,
         dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
-    input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
+    l = FeatureLayer((embedding_column,))
+    feature_layer = l({'aaa': sparse_input})
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('input_layer/aaa_embedding/embedding_weights:0',),
-        tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-    self.assertItemsEqual(
-        ('input_layer/aaa_embedding/embedding_weights:0',),
-        tuple([v.name for v in trainable_vars]))
+    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+                          tuple([v.name for v in trainable_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, feature_layer.eval())
 
-  def test_input_layer_not_trainable(self):
+  def test_feature_layer_not_trainable(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -5453,65 +5332,26 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old.embedding_column(
+    embedding_column = fc.embedding_column(
         categorical_column,
         dimension=embedding_dimension,
         initializer=_initializer,
         trainable=False)
 
     # Provide sparse input and get dense result.
-    input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
+    feature_layer = FeatureLayer((embedding_column,))({'aaa': sparse_input})
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('input_layer/aaa_embedding/embedding_weights:0',),
-        tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
     self.assertItemsEqual(
         [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
-
-
-class _TestSharedEmbeddingStateManager(StateManager):
-  """Manages the state for shared embedding columns.
-
-  This can handle multiple groups of shared embedding columns.
-  """
-
-  def __init__(self, trainable=True):
-    # Dict of shared_embedding_collection_name to a dict of variables.
-    self._all_variables = {}
-    self._trainable = trainable
-
-  def get_variable(self,
-                   feature_column,
-                   name,
-                   shape,
-                   dtype=None,
-                   initializer=None):
-    if not isinstance(feature_column, fc.SharedEmbeddingColumn):
-      raise ValueError(
-          'SharedEmbeddingStateManager can only handle SharedEmbeddingColumns. '
-          'Given type: {} '.format(type(feature_column)))
-
-    collection_name = feature_column.shared_collection_name
-    if collection_name not in self._all_variables:
-      self._all_variables[collection_name] = {}
-    var_dict = self._all_variables[collection_name]
-    if name in var_dict:
-      return var_dict[name]
-    else:
-      var = variable_scope.get_variable(
-          name=name,
-          shape=shape,
-          initializer=initializer,
-          trainable=self._trainable)
-      var_dict[name] = var
-      return var
+      self.assertAllEqual(expected_lookups, feature_layer.eval())
 
 
 class SharedEmbeddingColumnTest(test.TestCase):
@@ -5522,7 +5362,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
@@ -5560,7 +5400,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -5605,7 +5445,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    original_a, _ = fc.shared_embedding_columns(
+    original_a, _ = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -5613,7 +5453,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
         shared_embedding_collection_name='shared_embedding_collection_name',
         ckpt_to_load_from='my_ckpt',
         tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        max_norm=42.,
+        trainable=False)
     for embedding_column_a in (original_a, copy.deepcopy(original_a)):
       self.assertEqual('aaa', embedding_column_a.categorical_column.name)
       self.assertEqual(3, embedding_column_a.categorical_column.num_buckets)
@@ -5642,8 +5483,9 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc.shared_embedding_columns(
-          [categorical_column_a, categorical_column_b], dimension=2,
+      fc.shared_embedding_columns_v2(
+          [categorical_column_a, categorical_column_b],
+          dimension=2,
           initializer='not_fn')
 
   def test_incompatible_column_type(self):
@@ -5656,7 +5498,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'all categorical_columns must have the same type.*'
         'IdentityCategoricalColumn.*HashedCategoricalColumn'):
-      fc.shared_embedding_columns(
+      fc.shared_embedding_columns_v2(
           [categorical_column_a, categorical_column_b, categorical_column_c],
           dimension=2)
 
@@ -5669,11 +5511,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='bbb', num_buckets=3)
     weighted_categorical_column_b = fc.weighted_categorical_column(
         categorical_column_b, weight_feature_key='bbb_weights')
-    fc.shared_embedding_columns(
+    fc.shared_embedding_columns_v2(
         [weighted_categorical_column_a, categorical_column_b], dimension=2)
-    fc.shared_embedding_columns(
+    fc.shared_embedding_columns_v2(
         [categorical_column_a, weighted_categorical_column_b], dimension=2)
-    fc.shared_embedding_columns(
+    fc.shared_embedding_columns_v2(
         [weighted_categorical_column_a, weighted_categorical_column_b],
         dimension=2)
 
@@ -5682,8 +5524,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     b = fc.categorical_column_with_vocabulary_list(
         key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded, b_embedded = fc.shared_embedding_columns(
-        [a, b], dimension=2)
+    a_embedded, b_embedded = fc.shared_embedding_columns_v2([a, b], dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -5698,7 +5539,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         features=fc.make_parse_example_spec([a_embedded, b_embedded]))
     self.assertIn('aaa', features)
     self.assertIn('bbb', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
@@ -5717,8 +5558,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
   def test_transform_feature(self):
     a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
-    a_embedded, b_embedded = fc.shared_embedding_columns(
-        [a, b], dimension=2)
+    a_embedded, b_embedded = fc.shared_embedding_columns_v2([a, b], dimension=2)
     features = {
         'aaa': sparse_tensor.SparseTensor(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -5788,10 +5628,13 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
-    state_manager = _TestSharedEmbeddingStateManager()
+        dimension=embedding_dimension,
+        initializer=_initializer)
+    state_manager = fc.SharedEmbeddingStateManager(name='shared_feature_layer')
+    embedding_column_a.create_state(state_manager)
+    embedding_column_b.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a.get_dense_tensor(
@@ -5801,7 +5644,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('embedding_weights:0',),
+    self.assertItemsEqual(('shared_feature_layer/aaa_bbb_shared_embedding:0',),
                           tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
     with _initialized_session():
@@ -5809,58 +5652,6 @@ class SharedEmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
       self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
 
-  def DISABLED_test_get_dense_tensor_weight_collections(self):
-    # Inputs.
-    vocabulary_size = 3
-    # -1 values are ignored.
-    input_a = np.array([
-        [2, -1, -1],  # example 0, ids [2]
-        [0, 1, -1]
-    ])  # example 1, ids [0, 1]
-    input_b = np.array([
-        [0, -1, -1],  # example 0, ids [0]
-        [-1, -1, -1]
-    ])  # example 1, ids []
-    input_features = {'aaa': input_a, 'bbb': input_b}
-
-    # Embedding variable.
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 5.),  # id 1
-        (7., 11.)  # id 2
-    )
-
-    def _initializer(shape, dtype, partition_info):
-      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-      self.assertEqual(dtypes.float32, dtype)
-      self.assertIsNone(partition_info)
-      return embedding_values
-
-    # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
-        [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension,
-        initializer=_initializer)
-
-    fc.input_layer(
-        input_features, [embedding_column_a, embedding_column_b],
-        weight_collections=('my_vars',))
-
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
-        tuple(v.name for v in global_vars))
-    my_vars = ops.get_collection('my_vars')
-    self.assertItemsEqual(
-        ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
-        tuple(v.name for v in my_vars))
-
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -5903,10 +5694,13 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
-    state_manager = _TestSharedEmbeddingStateManager()
+        dimension=embedding_dimension,
+        initializer=_initializer)
+    state_manager = fc.SharedEmbeddingStateManager()
+    embedding_column_a.create_state(state_manager)
+    embedding_column_b.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a.get_dense_tensor(
@@ -6096,7 +5890,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
         self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
 
-  def _test_input_layer(self, trainable=True):
+  def _test_feature_layer(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
     sparse_input_a = sparse_tensor.SparseTensorValue(
@@ -6111,6 +5905,18 @@ class SharedEmbeddingColumnTest(test.TestCase):
         indices=((0, 0),),
         values=(0,),
         dense_shape=(2, 5))
+    sparse_input_c = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 1), (1, 1), (1, 3)),
+        values=(2, 0, 1),
+        dense_shape=(2, 5))
+    sparse_input_d = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids []
+        indices=((0, 1),),
+        values=(2,),
+        dense_shape=(2, 5))
 
     # Embedding variable.
     embedding_dimension = 2
@@ -6130,51 +5936,127 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # example 0:
         # A ids [2], embedding = [7, 11]
         # B ids [0], embedding = [1, 2]
-        (7., 11., 1., 2.),
+        # C ids [2], embedding = [7, 11]
+        # D ids [2], embedding = [7, 11]
+        (7., 11., 1., 2., 7., 11., 7., 11.),
         # example 1:
         # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
         # B ids [], embedding = [0, 0]
-        (2., 3.5, 0., 0.),
+        # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # D ids [], embedding = [0, 0]
+        (2., 3.5, 0., 0., 2., 3.5, 0., 0.),
     )
 
     # Build columns.
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc_old.shared_embedding_columns(
+    categorical_column_c = fc.categorical_column_with_identity(
+        key='ccc', num_buckets=vocabulary_size)
+    categorical_column_d = fc.categorical_column_with_identity(
+        key='ddd', num_buckets=vocabulary_size)
+
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer,
         trainable=trainable)
+    embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2(
+        [categorical_column_c, categorical_column_d],
+        dimension=embedding_dimension,
+        initializer=_initializer,
+        trainable=trainable)
+    shared_state_manager = fc.SharedEmbeddingStateManager(
+        name='shared_feature_layer')
+
+    features = {
+        'aaa': sparse_input_a,
+        'bbb': sparse_input_b,
+        'ccc': sparse_input_c,
+        'ddd': sparse_input_d
+    }
 
     # Provide sparse input and get dense result.
-    input_layer = fc.input_layer(
-        features={'aaa': sparse_input_a, 'bbb': sparse_input_b},
-        feature_columns=(embedding_column_b, embedding_column_a))
+    feature_layer = FeatureLayer(
+        feature_columns=(embedding_column_b, embedding_column_a,
+                         embedding_column_c, embedding_column_d),
+        shared_state_manager=shared_state_manager)(
+            features)
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
-        tuple([v.name for v in global_vars]))
+    self.assertItemsEqual([
+        'shared_feature_layer/aaa_bbb_shared_embedding:0',
+        'shared_feature_layer/ccc_ddd_shared_embedding:0'
+    ], tuple([v.name for v in global_vars]))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     if trainable:
-      self.assertItemsEqual(
-          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
-          tuple([v.name for v in trainable_vars]))
+      self.assertItemsEqual([
+          'shared_feature_layer/aaa_bbb_shared_embedding:0',
+          'shared_feature_layer/ccc_ddd_shared_embedding:0'
+      ], tuple([v.name for v in trainable_vars]))
     else:
       self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
     shared_embedding_vars = global_vars
     with _initialized_session():
       self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, feature_layer.eval())
+
+  def test_feature_layer(self):
+    self._test_feature_layer()
+
+  def test_feature_layer_no_trainable(self):
+    self._test_feature_layer(trainable=False)
+
 
-  def test_input_layer(self):
-    self._test_input_layer()
+class SharedEmbeddingStateManagerTest(test.TestCase):
 
-  def test_input_layer_no_trainable(self):
-    self._test_input_layer(trainable=False)
+  def test_basic(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    fc.shared_embedding_columns_v2(
+        [categorical_column_a, categorical_column_b], dimension=2)
+    shared_state_manager = fc.SharedEmbeddingStateManager(
+        name='shared_feature_layer')
+    var_a = shared_state_manager.create_variable('aaa_bbb_shared_embedding',
+                                                 [5, 10])
+    var_b = shared_state_manager.create_variable('aaa_bbb_shared_embedding',
+                                                 [5, 10])
+    self.assertEqual(var_a, var_b)
+    self.assertEqual('shared_feature_layer/aaa_bbb_shared_embedding:0',
+                     var_a.name)
+    self.assertIsInstance(var_a, variables_lib.Variable)
+
+  def test_multiple_sets(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    categorical_column_c = fc.categorical_column_with_identity(
+        key='ccc', num_buckets=3)
+    categorical_column_d = fc.categorical_column_with_identity(
+        key='ddd', num_buckets=3)
+
+    fc.shared_embedding_columns_v2(
+        [categorical_column_a, categorical_column_b], dimension=2)
+    fc.shared_embedding_columns_v2(
+        [categorical_column_c, categorical_column_d], dimension=2)
+    shared_state_manager = fc.SharedEmbeddingStateManager(
+        name='shared_feature_layer')
+    var_a = shared_state_manager.create_variable('aaa_bbb_shared_embedding',
+                                                 [5, 10])
+    var_c = shared_state_manager.create_variable('ccc_ddd_shared_embedding',
+                                                 [5, 10])
+    self.assertIsInstance(var_a, variables_lib.Variable)
+    self.assertIsInstance(var_c, variables_lib.Variable)
+    self.assertNotEquals(var_a, var_c)
+    self.assertEqual('shared_feature_layer/aaa_bbb_shared_embedding:0',
+                     var_a.name)
+    self.assertEqual('shared_feature_layer/ccc_ddd_shared_embedding:0',
+                     var_c.name)
 
 
 class WeightedCategoricalColumnTest(test.TestCase):
@@ -6271,7 +6153,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         features=fc.make_parse_example_spec([a_weighted]))
     self.assertIn('aaa', features)
     self.assertIn('weights', features)
-    with self.test_session():
+    with self.cached_session():
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index b3eb57d067..eca34ac26e 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Operations that generate constants.
 
-See the @{$python/constant_op$constants guide}.
+See the [constants guide](https://tensorflow.org/api_guides/python/constant_op).
 """
 
 # Must be separate from array_ops to avoid a cyclic dependency.
@@ -145,6 +145,17 @@ def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
                                                [-1. -1. -1.]]
   ```
 
+  `tf.constant` differs from `tf.fill` in a few ways:
+
+  *   `tf.constant` supports arbitrary constants, not just uniform scalar
+      Tensors like `tf.fill`.
+  *   `tf.constant` creates a `Const` node in the computation graph with the
+      exact value at graph construction time. On the other hand, `tf.fill`
+      creates an Op in the graph that is expanded at runtime.
+  *   Because `tf.constant` only embeds constant values in the graph, it does
+      not support dynamic shapes based on other runtime Tensors, whereas
+      `tf.fill` does.
+
   Args:
     value:          A constant value (or list) of output type `dtype`.
 
diff --git a/tensorflow/python/framework/device.py b/tensorflow/python/framework/device.py
index ab06a2babf..06c653097a 100644
--- a/tensorflow/python/framework/device.py
+++ b/tensorflow/python/framework/device.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+import threading
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -229,6 +230,12 @@ class DeviceSpec(object):
     """
     return DeviceSpec().parse_from_string(spec)
 
+  def __eq__(self, other):
+    return self.to_string() == other.to_string()
+
+  def __hash__(self):
+    return hash(self.to_string())
+
 
 def check_valid(spec):
   """Check that a device spec is valid.
@@ -254,6 +261,14 @@ def canonical_name(device):
     return device.to_string()
 
 
+# Cache from DeviceSpec objects to their corresponding device functions.
+# This cache is maintained for correctness, not performance: it makes it
+# possible to compare the device function stacks belonging to different
+# graphs in a meaningful way.
+_cached_device_functions = {}
+_cache_lock = threading.Lock()
+
+
 def merge_device(spec):
   """Returns a device function that merges devices specifications.
 
@@ -280,11 +295,18 @@ def merge_device(spec):
   Raises:
     ValueError: if the spec was not valid.
   """
-  if not isinstance(spec, DeviceSpec):
-    spec = DeviceSpec.from_string(spec or "")
-  def _device_function(node_def):
-    current_device = DeviceSpec.from_string(node_def.device or "")
-    copy_spec = copy.copy(spec)
-    copy_spec.merge_from(current_device)  # current_device takes precedence.
-    return copy_spec
-  return _device_function
+  with _cache_lock:
+    if not isinstance(spec, DeviceSpec):
+      spec = DeviceSpec.from_string(spec or "")
+    cached_function = _cached_device_functions.get(spec, None)
+    if cached_function is not None:
+      return cached_function
+
+    def _device_function(node_def):
+      current_device = DeviceSpec.from_string(node_def.device or "")
+      copy_spec = copy.copy(spec)
+      copy_spec.merge_from(current_device)  # current_device takes precedence.
+      return copy_spec
+
+    _cached_device_functions[spec] = _device_function
+    return _device_function
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index f47c0d8a5e..a8aef3a009 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -23,7 +23,6 @@ from __future__ import print_function
 
 import collections
 import hashlib
-import sys
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import function_pb2
@@ -34,7 +33,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import cond_v2_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.util import compat
@@ -42,9 +40,6 @@ from tensorflow.python.util import function_utils
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 
-# This is to avoid a circular dependency with cond_v2_impl.
-cond_v2_impl._function = sys.modules[__name__]  # pylint: disable=protected-access
-
 
 class Defun(object):
   """Decorator used to define TensorFlow functions.
@@ -1029,20 +1024,10 @@ def _from_definition(fdef, grad_func=None):
   result = _DefinedFunction(func, argnames, input_types, func_name, grad_func,
                             python_grad_func, out_names)
   # pylint: disable=protected-access
-  if ops._USE_C_API:
-    serialized = fdef.SerializeToString()
-    c_func = c_api.TF_FunctionImportFunctionDef(serialized)
-    result._c_func = c_api_util.ScopedTFFunction(c_func)
-    result._extra_inputs = []
-  else:
-    result._definition = fdef
-    # Captured inputs are added as regular inputs to a function when it's
-    # serialized, i.e. any extra inputs from the original function are now
-    # included in `result`._args
-    result._extra_inputs = []
-    result._hash_str = result._create_hash_str(
-        result._definition.signature.input_arg,
-        result._definition.signature.output_arg, result._definition.node_def)
+  serialized = fdef.SerializeToString()
+  c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+  result._c_func = c_api_util.ScopedTFFunction(c_func)
+  result._extra_inputs = []
   # pylint: enable=protected-access
 
   return result
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
index 1b09506662..a04fa369ae 100644
--- a/tensorflow/python/framework/function_def_to_graph.py
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -23,7 +23,7 @@ import sys
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.framework import versions_pb2
-from tensorflow.python.framework import function
+from tensorflow.python.eager import function
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import versions
@@ -34,13 +34,13 @@ cond_v2_impl._function_def_to_graph = sys.modules[__name__]  # pylint: disable=p
 
 
 def function_def_to_graph(fdef, input_shapes=None):
-  """Converts a FunctionDef to a function._FuncGraph (sub-class Graph).
+  """Converts a FunctionDef to a function.FuncGraph (sub-class Graph).
 
-  The returned _FuncGraph's `name`, `inputs` and `outputs` fields will be set.
+  The returned FuncGraph's `name`, `inputs` and `outputs` fields will be set.
   The input tensors are represented as placeholders.
 
-  Note: `_FuncGraph.inputs` and `_FuncGraph._captured` are not set and may be
-  set by the caller.
+  Note: `FuncGraph.inputs` and `FuncGraph.captures` are not set and may be set
+  by the caller.
 
   Args:
     fdef: FunctionDef.
@@ -50,9 +50,9 @@ def function_def_to_graph(fdef, input_shapes=None):
       placeholder will have unknown shape.
 
   Returns:
-    A _FuncGraph.
+    A FuncGraph.
   """
-  func_graph = function._FuncGraph(fdef.signature.name, capture_by_value=False)  # pylint: disable=protected-access
+  func_graph = function.FuncGraph(fdef.signature.name)
   graph_def, nested_to_flat_tensor_name = function_def_to_graph_def(
       fdef, input_shapes)
 
@@ -60,7 +60,7 @@ def function_def_to_graph(fdef, input_shapes=None):
     # Add all function nodes to the graph.
     importer.import_graph_def(graph_def, name="")
 
-    # Initialize fields specific to _FuncGraph.
+    # Initialize fields specific to FuncGraph.
 
     # inputs
     input_tensor_names = [
@@ -144,6 +144,8 @@ def function_def_to_graph_def(fdef, input_shapes=None):
 
   for arg_def in fdef.signature.input_arg:
     nested_to_flat_tensor_name[arg_def.name] = "{}:0".format(arg_def.name)
+    control_name = "^" + arg_def.name
+    nested_to_flat_tensor_name[control_name] = control_name
 
   for node_def in fdef.node_def:
     op_def = ops.get_default_graph()._get_op_def(node_def.op)  # pylint: disable=protected-access
@@ -172,6 +174,8 @@ def function_def_to_graph_def(fdef, input_shapes=None):
         flat_name = "{}:{}".format(node_def.name, flattened_index)
         nested_to_flat_tensor_name[nested_name] = flat_name
         flattened_index += 1
+      control_name = "^" + node_def.name
+      nested_to_flat_tensor_name[control_name] = control_name
 
   # Update inputs of all nodes in graph.
   for node_def in graph_def.node:
diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py
index cd2a16ed5a..e013fb6e4d 100644
--- a/tensorflow/python/framework/function_def_to_graph_test.py
+++ b/tensorflow/python/framework/function_def_to_graph_test.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
 from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
@@ -56,7 +56,7 @@ class FunctionDefToGraphTest(test.TestCase):
     fdef = self._build_function_def()
     g = function_def_to_graph.function_def_to_graph(fdef)
     self.assertEqual(g.name, "_whats_in_a_name")
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       inputs = sess.run(g.inputs, feed_dict={"x:0": 2, "y:0": 3})
       self.assertSequenceEqual(inputs, [2.0, 3.0])
       outputs = sess.run(g.outputs, feed_dict={"x:0": 2, "y:0": 3})
@@ -154,14 +154,20 @@ class FunctionDefToGraphDefTest(test.TestCase):
     self.assertDictEqual(
         tensor_name_map, {
             "x": "x:0",
+            "^x": "^x",
             "y": "y:0",
+            "^y": "^y",
             "z": "z:0",
+            "^z": "^z",
             "foo_1:d:0": "foo_1:0",
             "foo_1:e:0": "foo_1:1",
+            "^foo_1": "^foo_1",
             "list_output:a:0": "list_output:0",
             "list_output:a:1": "list_output:1",
+            "^list_output": "^list_output",
             "foo_2:d:0": "foo_2:0",
             "foo_2:e:0": "foo_2:1",
+            "^foo_2": "^foo_2",
         })
 
   def testShapes(self):
@@ -184,33 +190,56 @@ class FunctionDefToGraphDefTest(test.TestCase):
     x = constant_op.constant(5.0)
     y = constant_op.constant(10.0)
 
-    @function.Defun()
+    @function.defun
     def fn():
 
-      @function.Defun()
+      @function.defun
       def inner_fn():
         return x + y
 
       return inner_fn()
 
-    # Instantiate the function in this graph so that
-    # `function_def_to_graph` can find it.
-    fn()
-
+    @function.defun
     def fn2():
       return 2 * fn()
 
-    fdef = function._DefinedFunction(fn2, [], []).definition
+    fn2_defun = fn2.get_concrete_function()
+
+    # Call `fn2` to make sure `fn` is correctly instantiated so
+    # `function_def_to_graph` can find it.
+    fn2_defun()
+
+    fdef = fn2_defun._inference_function.definition
     func_graph = function_def_to_graph.function_def_to_graph(fdef)
     with func_graph.as_default():
       x_ph, y_ph = func_graph.inputs
-      with self.test_session(graph=func_graph) as sess:
+      with self.session(graph=func_graph) as sess:
         self.assertEqual(
             sess.run(func_graph.outputs[0], feed_dict={
                 x_ph: 5.0,
                 y_ph: 10.0
             }), 30.0)
 
+  def testControlDependencies(self):
+
+    @function.defun
+    def fn(inp):
+      x = constant_op.constant(2.0, name="x")
+      # TODO(b/79881896): Test external control dependency once that's
+      # supported.
+      with ops.control_dependencies([x, inp]):
+        constant_op.constant(3.0, name="y")
+      return 4.0
+
+    inp = constant_op.constant(1.0)
+    fdef = fn.get_concrete_function(inp).function_def
+    func_graph = function_def_to_graph.function_def_to_graph(fdef)
+
+    op = func_graph.get_operation_by_name("y")
+    self.assertEqual(len(op.control_inputs), 2)
+    self.assertEqual(op.control_inputs[0].name, "x")
+    self.assertEqual(op.control_inputs[1].name, "placeholder")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 1707f929b8..ee723bacaf 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -347,7 +347,7 @@ class FunctionTest(test.TestCase):
                 do_function_inlining=True,
                 do_constant_folding=True)))
 
-    with self.test_session(graph=g, config=cfg):
+    with self.session(graph=g, config=cfg):
       self.assertAllClose(y.eval(), 6.)
       self.assertAllClose(dx.eval(), 2.)
 
@@ -530,7 +530,7 @@ class FunctionTest(test.TestCase):
       v = variables.Variable(constant_op.constant(10.0))
       z = Foo(v)
 
-    with self.test_session(graph=g):
+    with self.session(graph=g):
       variables.global_variables_initializer().run()
       self.assertAllEqual(z.eval(), 101.)
 
@@ -552,7 +552,7 @@ class FunctionTest(test.TestCase):
       expected_val = v.value()
       actual_val, actual_shape = Foo()
 
-    with self.test_session(graph=g):
+    with self.session(graph=g):
       v.initializer.run()
       self.assertAllEqual(expected_val.eval(), actual_val.eval())
       self.assertAllEqual(expected_shape, actual_shape.eval())
@@ -732,7 +732,7 @@ class FunctionTest(test.TestCase):
       dx1, = gradients_impl.gradients([y1], [x])
 
     # Both should produce the same result and gradient.
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       vals = sess.run([y0, y1, dx0, dx1], {x: np.random.uniform(size=(3, 7))})
       self.assertAllClose(vals[0], vals[1])
       self.assertAllClose(vals[2], vals[3])
@@ -762,7 +762,7 @@ class FunctionTest(test.TestCase):
 
       z = Bar()
 
-    with self.test_session(graph=g):
+    with self.session(graph=g):
       variables.global_variables_initializer().run()
       self.assertAllEqual(y.eval(), [[12.0]])
       self.assertAllEqual(z.eval(), [[1.0]])
@@ -795,7 +795,7 @@ class FunctionTest(test.TestCase):
 
       y = Foo()
 
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       self.assertEqual(sess.run(y), 10)
 
   def testCaptureInCond(self):
@@ -810,7 +810,7 @@ class FunctionTest(test.TestCase):
       y = Foo(True)
       z = Foo(False)
 
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       self.assertEqual(sess.run(y), 1)
       self.assertEqual(sess.run(z), 2)
 
@@ -855,7 +855,7 @@ class FunctionTest(test.TestCase):
       y = Foo(x)
       z = Bar(x)
 
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       v0, v1 = sess.run([y, z])
       self.assertAllEqual(v0, 20.)
       self.assertAllEqual(v1, 20.)
@@ -1128,7 +1128,7 @@ class FunctionTest(test.TestCase):
       y2 = PartThree(x2)
       dx2, = gradients_impl.gradients(ys=[y2], xs=[x2])
 
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       v0, v1, v2 = sess.run([dx0, dx1, dx2])
 
     self.assertAllEqual(v0, 2.)
@@ -1353,7 +1353,7 @@ class FunctionOverloadTest(test.TestCase):
       x = Sinh(constant_op.constant(0.25, dtypes.float32))
       y = Sinh(constant_op.constant(0.25, dtypes.float64))
 
-    with self.test_session(graph=g):
+    with self.session(graph=g):
       self.assertAllClose(x.eval(), np.sinh(0.25))
       self.assertAllClose(y.eval(), np.sinh(0.25))
 
@@ -1374,7 +1374,7 @@ class FunctionOverloadTest(test.TestCase):
         y = F(x)
         dx, = gradients_impl.gradients(y, x)
 
-        with self.test_session(graph=g):
+        with self.session(graph=g):
           self.assertAllClose(dx.eval(), 0.25)
 
   def testDocString(self):
@@ -1418,7 +1418,7 @@ class FunctionCaptureByValueTest(test.TestCase):
 
     self.assertEqual(0, len(Foo.captured_inputs))
 
-    with self.test_session(graph=g):
+    with self.session(graph=g):
       self.assertAllEqual(y.eval(), [[12.0]])
 
 
@@ -1701,7 +1701,7 @@ class VariableHoistingTest(test.TestCase):
     self.assertEqual("Foo/w", w.op.name)
     self.assertEqual("Foo/b", b.op.name)
 
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       sess.run(variables.global_variables_initializer())
       w, b, x, y0, loss, dw, db = sess.run([w, b, x, y0, loss, dw, db])
 
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 7182c28666..18e7d8aa14 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -1205,7 +1205,7 @@ class ImportGraphDefTest(test.TestCase):
           gdef, return_elements=["p1:0", "p2:0", "f:0", "f:1"], name="")
       grad = gradients_impl.gradients([a], [p1, p2])
 
-      with self.test_session(graph=g2) as sess:
+      with self.session(graph=g2) as sess:
         feed_dict = {p1: 1, p2: 2}
         a_val, b_val, grad_val = sess.run([a, b, grad], feed_dict=feed_dict)
         self.assertEqual(a_val, 3.0)
@@ -1225,7 +1225,7 @@ class ImportGraphDefTest(test.TestCase):
       # functions created in g2).
       grad = gradients_impl.gradients([a], [p1, p2])
 
-      with self.test_session(graph=g3) as sess:
+      with self.session(graph=g3) as sess:
         feed_dict = {p1: 1, p2: 2}
         a_val, b_val, grad_val = sess.run([a, b, grad], feed_dict=feed_dict)
         self.assertEqual(a_val, 3.0)
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 5cf8697210..6e5f7aafac 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -70,7 +70,7 @@ class SimpleMetaGraphTest(test.TestCase):
     input_feed_value = -10  # Arbitrary input value for feed_dict.
 
     orig_graph = ops.Graph()
-    with self.test_session(graph=orig_graph) as sess:
+    with self.session(graph=orig_graph) as sess:
       # Create a minimal graph with zero variables.
       input_tensor = array_ops.placeholder(
           dtypes.float32, shape=[], name="input")
@@ -98,7 +98,7 @@ class SimpleMetaGraphTest(test.TestCase):
 
     # Create a clean graph and import the MetaGraphDef nodes.
     new_graph = ops.Graph()
-    with self.test_session(graph=new_graph) as sess:
+    with self.session(graph=new_graph) as sess:
       # Import the previously export meta graph.
       meta_graph.import_scoped_meta_graph(filename)
 
@@ -197,7 +197,7 @@ class SimpleMetaGraphTest(test.TestCase):
     # When inputs to the Complex Op are float64 instances, "T" maps to float64
     # and "Tout" maps to complex128. Since these attr values don't map to their
     # defaults, they must not be stripped.
-    with self.test_session(graph=ops.Graph()):
+    with self.session(graph=ops.Graph()):
       real_num = constant_op.constant(1.0, dtype=dtypes.float64, name="real")
       imag_num = constant_op.constant(2.0, dtype=dtypes.float64, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
@@ -855,7 +855,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
         _TestDir("metrics_export"), "meta_graph.pb")
 
     graph = ops.Graph()
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       values_queue = data_flow_ops.FIFOQueue(
           4, dtypes.float32, shapes=(1, 2))
       _enqueue_vector(sess, values_queue, [0, 1])
@@ -876,7 +876,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
     # Verifies that importing a meta_graph with LOCAL_VARIABLES collection
     # works correctly.
     graph = ops.Graph()
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       meta_graph.import_scoped_meta_graph(meta_graph_filename)
       initializer = variables.local_variables_initializer()
       sess.run(initializer)
@@ -885,7 +885,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
     # collection is of node_list type works, but cannot build initializer
     # with the collection.
     graph = ops.Graph()
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       meta_graph.import_scoped_meta_graph(
           test.test_src_dir_path(
               "python/framework/testdata/metrics_export_meta_graph.pb"))
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 98a1802490..8c85a422e7 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import collections
 import copy
-import os
 import re
 import sys
 import threading
@@ -67,7 +66,7 @@ from tensorflow.python.util.tf_export import tf_export
 # Temporary global switches determining if we should enable the work-in-progress
 # calls to the C API. These will be removed once all functionality is supported.
 _USE_C_API = True
-_USE_C_SHAPES = os.getenv("TF_C_API_GRAPH_CONSTRUCTION_SHAPES", "1") != "0"
+_USE_C_SHAPES = True
 
 
 def tensor_id(tensor):
@@ -753,6 +752,9 @@ class _EagerTensorBase(Tensor):
   def __format__(self, format_spec):
     return self.numpy().__format__(format_spec)
 
+  def __reduce__(self):
+    return (convert_to_tensor, (self.numpy(),))
+
   def _numpy(self):
     raise NotImplementedError()
 
@@ -2856,19 +2858,11 @@ class Graph(object):
 
     # TODO(skyewm): fold as much of the above as possible into the C
     # implementation
-    if self._use_c_api_hack():
-      self._scoped_c_graph = c_api_util.ScopedTFGraph()
-      # The C API requires all ops to have shape functions. Disable this
-      # requirement (many custom ops do not have shape functions, and we don't
-      # want to break these existing cases).
-      c_api.SetRequireShapeInferenceFns(self._c_graph, False)
-    else:
-      self._scoped_c_graph = None
-
-  # TODO(apassos) remove once the C API is used by default.
-  def _use_c_api_hack(self):
-    """Temporary hack; can be overridden to force C API usage."""
-    return _USE_C_API
+    self._scoped_c_graph = c_api_util.ScopedTFGraph()
+    # The C API requires all ops to have shape functions. Disable this
+    # requirement (many custom ops do not have shape functions, and we don't
+    # want to break these existing cases).
+    c_api.SetRequireShapeInferenceFns(self._c_graph, False)
 
   # Note: this method is private because the API of tf.Graph() is public and
   # frozen, and this functionality is still not ready for public visibility.
@@ -3118,7 +3112,7 @@ class Graph(object):
     Returns:
       bool indicating whether or not 'name' is registered in function library.
     """
-    return name in self._functions
+    return compat.as_str(name) in self._functions
 
   def _get_function(self, name):
     """Returns the function definition for 'name'.
@@ -3128,7 +3122,7 @@ class Graph(object):
     Returns:
       The function def proto.
     """
-    return self._functions.get(name, None)
+    return self._functions.get(compat.as_str(name), None)
 
   def _add_function(self, function):
     """Adds a function to the graph.
@@ -3164,7 +3158,7 @@ class Graph(object):
     c_api.TF_GraphCopyFunction(self._c_graph, function._c_func.func, gradient)
     # pylint: enable=protected-access
 
-    self._functions[name] = function
+    self._functions[compat.as_str(name)] = function
 
     # Need a new-enough consumer to support the functions we add to the graph.
     if self._graph_def_versions.min_consumer < 12:
@@ -4860,6 +4854,18 @@ class Graph(object):
     else:
       self._graph_control_dependencies_stack = control_dependencies
 
+  @property
+  def _distribution_strategy_stack(self):
+    """A stack to maintain distribution strategy context for each thread."""
+    if not hasattr(self._thread_local, "_distribution_strategy_stack"):
+      self._thread_local._distribution_strategy_stack = []  # pylint: disable=protected-access
+    return self._thread_local._distribution_strategy_stack  # pylint: disable=protected-access
+
+  @_distribution_strategy_stack.setter
+  def _distribution_strategy_stack(self, _distribution_strategy_stack):
+    self._thread_local._distribution_strategy_stack = (  # pylint: disable=protected-access
+        _distribution_strategy_stack)
+
   def _mutation_lock(self):
     """Returns a lock to guard code that creates & mutates ops.
 
@@ -5772,6 +5778,38 @@ class GraphKeys(object):
     return cls.GLOBAL_VARIABLES
 
 
+def dismantle_graph(graph):
+  """Cleans up reference cycles from a `Graph`.
+
+  Helpful for making sure the garbage collector doesn't need to run after a
+  temporary `Graph` is no longer needed.
+
+  Args:
+    graph: A `Graph` object to destroy. Neither it nor any of its ops are usable
+      after this function runs.
+  """
+  # pylint: disable=protected-access
+  # OrderedDict, constructed on Graph creation, makes a simple reference loop
+  # and hides it in an __attribute in some Python versions. We don't need to
+  # throw an error if we can't find it, but if we do find it we can break the
+  # loop to avoid creating work for the garbage collector.
+  graph_operations = graph.get_operations()
+  problematic_cycle = graph._functions.__dict__.get("_OrderedDict__root", None)
+  # pylint: enable=protected-access
+  if problematic_cycle:
+    try:
+      del problematic_cycle[0][:]
+    except TypeError:
+      # This is probably not one of the problematic Python versions. Continue
+      # with the rest of our cleanup.
+      pass
+  # Now clean up Operation<->Graph reference cycles by clearing all of the
+  # attributes for the Graph and its ops.
+  for op in graph_operations:
+    op.__dict__ = {}
+  graph.__dict__ = {}
+
+
 @tf_export("add_to_collection")
 def add_to_collection(name, value):
   """Wrapper for `Graph.add_to_collection()` using the default graph.
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 318387c61b..ced0581402 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -493,7 +493,7 @@ class OperationTest(test_util.TensorFlowTestCase):
       y.op._add_control_input(z.op)  # pylint: disable=protected-access
       y.op._add_control_input(x.op)  # pylint: disable=protected-access
       x.op._add_control_input(y.op)  # pylint: disable=protected-access
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "Graph is invalid, contains a cycle with 2 nodes"):
@@ -1614,6 +1614,33 @@ class CollectionTest(test_util.TensorFlowTestCase):
       # Collections are ordered.
       self.assertEqual([90, 100], ops.get_collection("key"))
 
+  def test_defun(self):
+    with context.eager_mode():
+
+      @eager_function.defun
+      def defun():
+        ops.add_to_collection("int", 1)
+        ops.add_to_collection("tensor", constant_op.constant(2))
+
+        @eager_function.defun
+        def inner_defun():
+          self.assertEqual(ops.get_collection("int"), [1])
+          three = ops.get_collection("tensor")[0] + ops.get_collection("int")[0]
+          ops.add_to_collection("int", 2)
+          self.assertEqual(ops.get_collection("int"), [1, 2])
+          ops.add_to_collection("foo", "bar")
+          self.assertEqual(ops.get_collection("foo"), ["bar"])
+          return three
+
+        self.assertEqual(ops.get_collection("int"), [1])
+        three = inner_defun()
+        self.assertEqual(ops.get_collection("int"), [1, 2])
+        self.assertEqual(ops.get_collection("foo"), ["bar"])
+        return three
+
+      three = defun()
+      self.assertEqual(three.numpy(), 3)
+
 
 ops.NotDifferentiable("FloatOutput")
 
@@ -2459,7 +2486,7 @@ class AsGraphDefTest(test_util.TensorFlowTestCase):
     """Test that the graphdef version is plumbed through to kernels."""
     with ops.Graph().as_default() as g:
       version = g.graph_def_versions.producer
-      with self.test_session(graph=g):
+      with self.session(graph=g):
         v = test_ops.graph_def_version().eval()
         self.assertEqual(version, v)
 
@@ -2757,7 +2784,7 @@ class DeprecatedTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default() as g:
       test_util.set_producer_version(g, 7)
       old = test_ops.old()
-      with self.test_session(graph=g):
+      with self.session(graph=g):
         old.run()
 
   def _error(self):
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 76d4c2017c..2022fbcbaa 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -102,15 +102,6 @@ string TensorPBString(const TensorProto& pb) {
   return strings::StrCat("\"\"\"", ProtoShortDebugString(pb), "\"\"\"");
 }
 
-const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
-  for (int i = 0; i < api_def.in_arg_size(); ++i) {
-    if (api_def.in_arg(i).name() == name) {
-      return &api_def.in_arg(i);
-    }
-  }
-  return nullptr;
-}
-
 class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
  public:
   GenEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc
index 031b4a384e..f2270342b0 100644
--- a/tensorflow/python/framework/python_op_gen_internal.cc
+++ b/tensorflow/python/framework/python_op_gen_internal.cc
@@ -483,15 +483,6 @@ const ApiDef::Attr* FindAttr(StringPiece name, const ApiDef& api_def) {
   return nullptr;
 }
 
-const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
-  for (int i = 0; i < api_def.in_arg_size(); ++i) {
-    if (api_def.in_arg(i).name() == name) {
-      return &api_def.in_arg(i);
-    }
-  }
-  return nullptr;
-}
-
 GenPythonOp::GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
                          const string& function_name)
     : op_def_(op_def),
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index 8eb943b960..e20ad5fd33 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -52,7 +52,7 @@ Status ReadOpListFromFile(const string& filename,
     if (scanner.One(strings::Scanner::LETTER_DIGIT_DOT)
             .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE)
             .GetResult(nullptr, &op_name)) {
-      op_list->emplace_back(op_name.ToString());
+      op_list->emplace_back(op_name);
     }
     s = input_buffer->ReadLine(&line_contents);
   }
diff --git a/tensorflow/python/framework/smart_cond.py b/tensorflow/python/framework/smart_cond.py
index 48a834392b..7ee2b5b347 100644
--- a/tensorflow/python/framework/smart_cond.py
+++ b/tensorflow/python/framework/smart_cond.py
@@ -77,11 +77,9 @@ def smart_constant_value(pred):
     pred_value = pred
   elif isinstance(pred, ops.Tensor):
     pred_value = tensor_util.constant_value(pred)
-    # TODO(skyewm): consider folding this into tensor_util.constant_value when
-    # _USE_C_API is removed (there may be performance and correctness bugs, so I
-    # wanted to limit the change hidden behind _USE_C_API).
+    # TODO(skyewm): consider folding this into tensor_util.constant_value.
     # pylint: disable=protected-access
-    if pred_value is None and ops._USE_C_API:
+    if pred_value is None:
       pred_value = c_api.TF_TryEvaluateConstant_wrapper(pred.graph._c_graph,
                                                         pred._as_tf_output())
     # pylint: enable=protected-access
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index a45581190f..d1bdd9b80a 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -112,8 +112,6 @@ class SparseTensor(_TensorLike):
       values: A 1-D tensor of any type and shape `[N]`.
       dense_shape: A 1-D int64 tensor of shape `[ndims]`.
 
-    Returns:
-      A `SparseTensor`.
     """
     with ops.name_scope(None, "SparseTensor",
                         [indices, values, dense_shape]):
@@ -184,10 +182,31 @@ class SparseTensor(_TensorLike):
     return self._dense_shape
 
   @property
+  def shape(self):
+    """Get the `TensorShape` representing the shape of the dense tensor.
+
+    Returns:
+      A `TensorShape` object.
+    """
+    return tensor_util.constant_value_as_shape(self._dense_shape)
+
+  @property
   def graph(self):
     """The `Graph` that contains the index, value, and dense_shape tensors."""
     return self._indices.graph
 
+  def consumers(self):
+    """Returns a list of `Operation`s that consume this `SparseTensor`.
+
+    Returns:
+      A list of `Operation`s.
+    """
+    values_consumers = set(self._values.consumers())
+    indices_consumers = set(self._indices.consumers())
+    dense_shape_consumers = set(self._dense_shape.consumers())
+    return list(values_consumers \
+                .union(indices_consumers, dense_shape_consumers))
+
   def __str__(self):
     return "SparseTensor(indices=%s, values=%s, dense_shape=%s)" % (
         self._indices, self._values, self._dense_shape)
diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py
index c001fed3b0..2bcfbc17df 100644
--- a/tensorflow/python/framework/sparse_tensor_test.py
+++ b/tensorflow/python/framework/sparse_tensor_test.py
@@ -21,8 +21,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import googletest
 
 
@@ -63,6 +65,18 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
         sparse_tensor.is_sparse(
             sparse_tensor.SparseTensorValue([[0]], [0], [1])))
 
+  def testConsumers(self):
+    sp = sparse_tensor.SparseTensor([[0, 0], [1, 2]], [1.0, 3.0], [3, 4])
+    w = ops.convert_to_tensor(np.ones([4, 1], np.float32))
+    out = sparse_ops.sparse_tensor_dense_matmul(sp, w)
+    self.assertEqual(len(sp.consumers()), 1)
+    self.assertEqual(sp.consumers()[0], out.op)
+
+    dense = sparse_ops.sparse_tensor_to_dense(sp)
+    self.assertEqual(len(sp.consumers()), 2)
+    self.assertTrue(dense.op in sp.consumers())
+    self.assertTrue(out.op in sp.consumers())
+
 
 class ConvertToTensorOrSparseTensorTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/framework/subscribe.py b/tensorflow/python/framework/subscribe.py
index cee7398974..00759eb611 100644
--- a/tensorflow/python/framework/subscribe.py
+++ b/tensorflow/python/framework/subscribe.py
@@ -137,12 +137,7 @@ def _subscribe_new(tensor, side_effects, control_cache):
     # are subscribed at the same time, we remove the control dependency from
     # the original op only once and we add the dependencies to all the
     # new identities.
-    if ops._USE_C_API:  # pylint: disable=protected-access
-      new_control_inputs = consumer_op.control_inputs
-    else:
-      # Make a copy so we don't modify the actual control inputs (this is fixed
-      # in the C API).
-      new_control_inputs = list(consumer_op.control_inputs)
+    new_control_inputs = consumer_op.control_inputs
     if tensor.op in new_control_inputs:
       new_control_inputs.remove(tensor.op)
     new_control_inputs.append(out.op)
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index bd0f691a61..11b681d544 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -498,7 +498,8 @@ class TensorShape(object):
 
   If a tensor is produced by an operation of type `"Foo"`, its shape
   may be inferred if there is a registered shape function for
-  `"Foo"`. See @{$adding_an_op#shape-functions-in-c$`Shape functions in C++`}
+  `"Foo"`. See [Shape
+  functions](https://tensorflow.org/extend/adding_an_op#shape_functions_in_c)
   for details of shape functions and how to register them. Alternatively,
   the shape may be set explicitly using `tf.Tensor.set_shape`.
   """
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 9be6391b04..155134fac4 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -63,6 +63,7 @@ from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
@@ -369,6 +370,7 @@ def enable_c_shapes(fn):
       fn(*args, **kwargs)
     finally:
       ops._USE_C_SHAPES = prev_value
+
   # pylint: enable=protected-access
 
   return wrapper
@@ -398,6 +400,53 @@ def with_c_shapes(cls):
   return cls
 
 
+def enable_cond_v2(fn):
+  """Decorator for enabling CondV2 on a test.
+
+  Note this enables using CondV2 after running the test class's setup/teardown
+  methods.
+
+  Args:
+    fn: the function to be wrapped
+
+  Returns:
+    The wrapped function
+  """
+
+  # pylint: disable=protected-access
+  def wrapper(*args, **kwargs):
+    prev_value = control_flow_ops._ENABLE_COND_V2
+    control_flow_ops._ENABLE_COND_V2 = True
+    try:
+      fn(*args, **kwargs)
+    finally:
+      control_flow_ops._ENABLE_COND_V2 = prev_value
+  # pylint: enable=protected-access
+
+  return wrapper
+
+
+def with_cond_v2(cls):
+  """Adds methods that call original methods but with CondV2 enabled.
+
+  Note this enables CondV2 in new methods after running the test class's
+  setup method.
+
+  Args:
+    cls: class to decorate
+
+  Returns:
+    cls with new test methods added
+  """
+  if control_flow_ops._ENABLE_COND_V2:
+    return cls
+
+  for name, value in cls.__dict__.copy().items():
+    if callable(value) and name.startswith("test"):
+      setattr(cls, name + "WithCondV2", enable_cond_v2(value))
+  return cls
+
+
 def assert_no_new_pyobjects_executing_eagerly(f):
   """Decorator for asserting that no new Python objects persist after a test.
 
@@ -418,7 +467,8 @@ def assert_no_new_pyobjects_executing_eagerly(f):
       previous_count = len(gc.get_objects())
       collection_sizes_before = {
           collection: len(ops.get_collection(collection))
-          for collection in ops.get_default_graph().collections}
+          for collection in ops.get_default_graph().collections
+      }
       for _ in range(3):
         f(self, **kwargs)
       # Note that gc.get_objects misses anything that isn't subject to garbage
@@ -430,8 +480,8 @@ def assert_no_new_pyobjects_executing_eagerly(f):
         if len(collection) > size_before:
           raise AssertionError(
               ("Collection %s increased in size from "
-               "%d to %d (current items %s).")
-              % (collection_key, size_before, len(collection), collection))
+               "%d to %d (current items %s).") % (collection_key, size_before,
+                                                  len(collection), collection))
         # Make sure our collection checks don't show up as leaked memory by
         # removing references to temporary variables.
         del collection
@@ -446,8 +496,8 @@ def assert_no_new_pyobjects_executing_eagerly(f):
       # Using plain assert because not all classes using this decorator
       # have assertLessEqual
       assert new_count <= previous_count, (
-          "new_count(%d) is not less than or equal to previous_count(%d)" % (
-              new_count, previous_count))
+          "new_count(%d) is not less than or equal to previous_count(%d)" %
+          (new_count, previous_count))
       gc.enable()
 
   return decorator
@@ -547,10 +597,12 @@ def assert_no_garbage_created(f):
             return "<%s %d>" % (obj.__class__.__name__, id(obj))
 
           logging.error("  Object type: %s", _safe_object_str(obj))
-          logging.error("  Referrer types: %s", ", ".join(
-              [_safe_object_str(ref) for ref in gc.get_referrers(obj)]))
-          logging.error("  Referent types: %s", ", ".join(
-              [_safe_object_str(ref) for ref in gc.get_referents(obj)]))
+          logging.error(
+              "  Referrer types: %s", ", ".join(
+                  [_safe_object_str(ref) for ref in gc.get_referrers(obj)]))
+          logging.error(
+              "  Referent types: %s", ", ".join(
+                  [_safe_object_str(ref) for ref in gc.get_referents(obj)]))
           logging.error("  Object attribute names: %s", dir(obj))
           logging.error("  Object __str__:")
           logging.error(obj)
@@ -629,9 +681,8 @@ def generate_combinations_with_testcase_name(**kwargs):
   for combination in combinations:
     assert isinstance(combination, OrderedDict)
     name = "".join([
-        "_{}_{}".format(
-            "".join(filter(str.isalnum, key)),
-            "".join(filter(str.isalnum, str(value))))
+        "_{}_{}".format("".join(filter(str.isalnum, key)), "".join(
+            filter(str.isalnum, str(value))))
         for key, value in combination.items()
     ])
     named_combinations.append(
@@ -736,15 +787,19 @@ def run_in_graph_and_eager_modes(func=None,
         run_eagerly = assert_no_new_tensors(
             assert_no_garbage_created(run_eagerly))
 
-      with context.eager_mode():
+      if reset_test:
+        # This decorator runs the wrapped test twice.
+        # Reset the test environment between runs.
+        self.tearDown()
+        self._tempdir = None
+      # Create a new graph for the eagerly executed version of this test for
+      # better isolation.
+      graph_for_eager_test = ops.Graph()
+      with graph_for_eager_test.as_default(), context.eager_mode():
         if reset_test:
-          # This decorator runs the wrapped test twice.
-          # Reset the test environment between runs.
-          self.tearDown()
-          self._tempdir = None
           self.setUp()
-
         run_eagerly(self, **kwargs)
+      ops.dismantle_graph(graph_for_eager_test)
 
     return decorated
 
@@ -967,21 +1022,64 @@ class TensorFlowTestCase(googletest.TestCase):
 
   # pylint: disable=g-doc-return-or-yield
   @contextlib.contextmanager
-  def test_session(self,
-                   graph=None,
-                   config=None,
-                   use_gpu=False,
-                   force_gpu=False):
+  def session(self, graph=None, config=None, use_gpu=False, force_gpu=False):
     """Returns a TensorFlow Session for use in executing tests.
 
-    This method should be used for all functional tests.
+    Note that this will set this session and the graph as global defaults.
+
+    Use the `use_gpu` and `force_gpu` options to control where ops are run. If
+    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if
+    `use_gpu` is True, TensorFlow tries to run as many ops on the GPU as
+    possible. If both `force_gpu and `use_gpu` are False, all ops are pinned to
+    the CPU.
+
+    Example:
+    ```python
+    class MyOperatorTest(test_util.TensorFlowTestCase):
+      def testMyOperator(self):
+        with self.session(use_gpu=True):
+          valid_input = [1.0, 2.0, 3.0, 4.0, 5.0]
+          result = MyOperator(valid_input).eval()
+          self.assertEqual(result, [1.0, 2.0, 3.0, 5.0, 8.0]
+          invalid_input = [-1.0, 2.0, 7.0]
+          with self.assertRaisesOpError("negative input not supported"):
+            MyOperator(invalid_input).eval()
+    ```
+
+    Args:
+      graph: Optional graph to use during the returned session.
+      config: An optional config_pb2.ConfigProto to use to configure the
+        session.
+      use_gpu: If True, attempt to run as many ops as possible on GPU.
+      force_gpu: If True, pin all ops to `/device:GPU:0`.
+
+    Yields:
+      A Session object that should be used as a context manager to surround
+      the graph building and execution code in a test case.
+    """
+    if context.executing_eagerly():
+      yield None
+    else:
+      sess = self._create_session(graph, config, use_gpu, force_gpu)
+      with self._constrain_devices_and_set_default(
+          sess, use_gpu, force_gpu) as constrained_sess:
+        # We need to do this to make sure the session closes, otherwise, even
+        # if the user does with self.session():, it will not close the session.
+        with constrained_sess:
+          yield constrained_sess
+
+  @contextlib.contextmanager
+  def cached_session(self,
+                     graph=None,
+                     config=None,
+                     use_gpu=False,
+                     force_gpu=False):
+    """Returns a TensorFlow Session for use in executing tests.
 
-    This method behaves different than session.Session: for performance reasons
-    `test_session` will by default (if `graph` is None) reuse the same session
-    across tests. This means you may want to either call the function
-    `reset_default_graph()` before tests, or if creating an explicit new graph,
-    pass it here (simply setting it with `as_default()` won't do it), which will
-    trigger the creation of a new session.
+    This method behaves differently than self.session(): for performance reasons
+    `cached_session` will by default reuse the same session within the same
+    test. The session returned by this function will only be closed at the end
+    of the test (in the TearDown function).
 
     Use the `use_gpu` and `force_gpu` options to control where ops are run. If
     `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if
@@ -993,7 +1091,7 @@ class TensorFlowTestCase(googletest.TestCase):
     ```python
     class MyOperatorTest(test_util.TensorFlowTestCase):
       def testMyOperator(self):
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True) as sess:
           valid_input = [1.0, 2.0, 3.0, 4.0, 5.0]
           result = MyOperator(valid_input).eval()
           self.assertEqual(result, [1.0, 2.0, 3.0, 5.0, 8.0]
@@ -1009,74 +1107,39 @@ class TensorFlowTestCase(googletest.TestCase):
       use_gpu: If True, attempt to run as many ops as possible on GPU.
       force_gpu: If True, pin all ops to `/device:GPU:0`.
 
-    Returns:
+    Yields:
       A Session object that should be used as a context manager to surround
       the graph building and execution code in a test case.
     """
+    if context.executing_eagerly():
+      yield None
+    else:
+      with self._get_cached_session(
+          graph, config, use_gpu, force_gpu,
+          crash_if_inconsistent_args=True) as sess:
+        yield sess
+
+  @contextlib.contextmanager
+  def test_session(self,
+                   graph=None,
+                   config=None,
+                   use_gpu=False,
+                   force_gpu=False):
+    """Use cached_session instead."""
     if self.id().endswith(".test_session"):
       self.skipTest("Not a test.")
 
-    def prepare_config(config):
-      """Returns a config for sessions.
-
-      Args:
-        config: An optional config_pb2.ConfigProto to use to configure the
-          session.
-      Returns:
-        A config_pb2.ConfigProto object.
-      """
-      if config is None:
-        config = config_pb2.ConfigProto()
-        config.allow_soft_placement = not force_gpu
-        config.gpu_options.per_process_gpu_memory_fraction = 0.3
-      elif force_gpu and config.allow_soft_placement:
-        config = config_pb2.ConfigProto().CopyFrom(config)
-        config.allow_soft_placement = False
-      # Don't perform optimizations for tests so we don't inadvertently run
-      # gpu ops on cpu
-      config.graph_options.optimizer_options.opt_level = -1
-      config.graph_options.rewrite_options.constant_folding = (
-          rewriter_config_pb2.RewriterConfig.OFF)
-      config.graph_options.rewrite_options.arithmetic_optimization = (
-          rewriter_config_pb2.RewriterConfig.OFF)
-      return config
-
     if context.executing_eagerly():
       yield None
-    elif graph is None:
-      if self._cached_session is None:
-        self._cached_session = session.Session(
-            graph=None, config=prepare_config(config))
-      sess = self._cached_session
-      with sess.graph.as_default(), sess.as_default():
-        if force_gpu:
-          # Use the name of an actual device if one is detected, or '/device:GPU:0'
-          # otherwise
-          gpu_name = gpu_device_name()
-          if not gpu_name:
-            gpu_name = "/device:GPU:0"
-          with sess.graph.device(gpu_name):
-            yield sess
-        elif use_gpu:
-          yield sess
-        else:
-          with sess.graph.device("/cpu:0"):
-            yield sess
     else:
-      with session.Session(graph=graph, config=prepare_config(config)) as sess:
-        if force_gpu:
-          # Use the name of an actual device if one is detected, or '/device:GPU:0'
-          # otherwise
-          gpu_name = gpu_device_name()
-          if not gpu_name:
-            gpu_name = "/device:GPU:0"
-          with sess.graph.device(gpu_name):
-            yield sess
-        elif use_gpu:
+      if graph is None:
+        with self._get_cached_session(
+            graph, config, use_gpu, force_gpu,
+            crash_if_inconsistent_args=False) as sess:
+          yield sess
+      else:
+        with self.session(graph, config, use_gpu, force_gpu) as sess:
           yield sess
-        else:
-          with sess.graph.device("/cpu:0"):
-            yield sess
 
   # pylint: enable=g-doc-return-or-yield
 
@@ -1202,9 +1265,10 @@ class TensorFlowTestCase(googletest.TestCase):
       msg: An optional string message to append to the failure message.
     """
     # f1 == f2 is needed here as we might have: f1, f2 = inf, inf
-    self.assertTrue(f1 == f2 or math.fabs(f1 - f2) <= err,
-                    "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
-                                           if msg is not None else ""))
+    self.assertTrue(
+        f1 == f2 or math.fabs(f1 - f2) <= err,
+        "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
+                               if msg is not None else ""))
 
   def assertArrayNear(self, farray1, farray2, err, msg=None):
     """Asserts that two float arrays are near each other.
@@ -1250,8 +1314,9 @@ class TensorFlowTestCase(googletest.TestCase):
   def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
-    self.assertEqual(a.shape, b.shape, "Shape mismatch: expected %s, got %s." %
-                     (a.shape, b.shape))
+    self.assertEqual(
+        a.shape, b.shape,
+        "Shape mismatch: expected %s, got %s." % (a.shape, b.shape))
     if not np.allclose(a, b, rtol=rtol, atol=atol):
       # Prints more details than np.testing.assert_allclose.
       #
@@ -1453,8 +1518,9 @@ class TensorFlowTestCase(googletest.TestCase):
     msg = msg if msg else ""
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
-    self.assertEqual(a.shape, b.shape, "Shape mismatch: expected %s, got %s."
-                     " %s" % (a.shape, b.shape, msg))
+    self.assertEqual(
+        a.shape, b.shape, "Shape mismatch: expected %s, got %s."
+        " %s" % (a.shape, b.shape, msg))
     same = (a == b)
 
     if (a.dtype in [
@@ -1682,8 +1748,8 @@ class TensorFlowTestCase(googletest.TestCase):
       self.fail(exception_type.__name__ + " not raised")
     except Exception as e:  # pylint: disable=broad-except
       if not isinstance(e, exception_type) or not predicate(e):
-        raise AssertionError("Exception of type %s: %s" % (str(type(e)),
-                                                           str(e)))
+        raise AssertionError(
+            "Exception of type %s: %s" % (str(type(e)), str(e)))
 
   # pylint: enable=g-doc-return-or-yield
 
@@ -1719,8 +1785,9 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     device1 = pydev.canonical_name(device1)
     device2 = pydev.canonical_name(device2)
-    self.assertEqual(device1, device2, "Devices %s and %s are not equal. %s" %
-                     (device1, device2, msg))
+    self.assertEqual(
+        device1, device2,
+        "Devices %s and %s are not equal. %s" % (device1, device2, msg))
 
   # Fix Python 3 compatibility issues
   if six.PY3:
@@ -1734,6 +1801,113 @@ class TensorFlowTestCase(googletest.TestCase):
 
     # pylint: enable=invalid-name
 
+  @contextlib.contextmanager
+  def _constrain_devices_and_set_default(self, sess, use_gpu, force_gpu):
+    """Set the session and its graph to global default and constrain devices."""
+    if context.executing_eagerly():
+      yield None
+    else:
+      with sess.graph.as_default(), sess.as_default():
+        if force_gpu:
+          # Use the name of an actual device if one is detected, or
+          # '/device:GPU:0' otherwise
+          gpu_name = gpu_device_name()
+          if not gpu_name:
+            gpu_name = "/device:GPU:0"
+          with sess.graph.device(gpu_name):
+            yield sess
+        elif use_gpu:
+          yield sess
+        else:
+          with sess.graph.device("/cpu:0"):
+            yield sess
+
+  def _create_session(self, graph, config, use_gpu, force_gpu):
+    """See session() for details."""
+    if context.executing_eagerly():
+      return None
+    else:
+
+      def prepare_config(config):
+        """Returns a config for sessions.
+
+        Args:
+          config: An optional config_pb2.ConfigProto to use to configure the
+            session.
+        Returns:
+          A config_pb2.ConfigProto object.
+        """
+        if config is None:
+          config = config_pb2.ConfigProto()
+          config.allow_soft_placement = not force_gpu
+          config.gpu_options.per_process_gpu_memory_fraction = 0.3
+        elif force_gpu and config.allow_soft_placement:
+          config = config_pb2.ConfigProto().CopyFrom(config)
+          config.allow_soft_placement = False
+        # Don't perform optimizations for tests so we don't inadvertently run
+        # gpu ops on cpu
+        config.graph_options.optimizer_options.opt_level = -1
+        config.graph_options.rewrite_options.constant_folding = (
+            rewriter_config_pb2.RewriterConfig.OFF)
+        config.graph_options.rewrite_options.arithmetic_optimization = (
+            rewriter_config_pb2.RewriterConfig.OFF)
+        return config
+
+      return session.Session(graph=graph, config=prepare_config(config))
+
+  @contextlib.contextmanager
+  def _get_cached_session(self,
+                          graph=None,
+                          config=None,
+                          use_gpu=False,
+                          force_gpu=False,
+                          crash_if_inconsistent_args=True):
+    """See cached_session() for documentation."""
+    if context.executing_eagerly():
+      yield None
+    else:
+      if self._cached_session is None:
+        sess = self._create_session(
+            graph=graph, config=config, use_gpu=use_gpu, force_gpu=force_gpu)
+        self._cached_session = sess
+        self._cached_graph = graph
+        self._cached_config = config
+        self._cached_use_gpu = use_gpu
+        self._cached_force_gpu = force_gpu
+        with self._constrain_devices_and_set_default(
+            sess, use_gpu, force_gpu) as constrained_sess:
+          yield constrained_sess
+      else:
+        if crash_if_inconsistent_args and self._cached_graph is not graph:
+          raise ValueError("The graph used to get the cached session is "
+                           "different than the one that was used to create the "
+                           "session. Maybe create a new session with "
+                           "self.session()")
+        if crash_if_inconsistent_args and self._cached_config is not config:
+          raise ValueError("The config used to get the cached session is "
+                           "different than the one that was used to create the "
+                           "session. Maybe create a new session with "
+                           "self.session()")
+        if crash_if_inconsistent_args and self._cached_use_gpu is not use_gpu:
+          raise ValueError(
+              "The use_gpu value used to get the cached session is "
+              "different than the one that was used to create the "
+              "session. Maybe create a new session with "
+              "self.session()")
+        if crash_if_inconsistent_args and (self._cached_force_gpu is
+                                           not force_gpu):
+          raise ValueError(
+              "The force_gpu value used to get the cached session is "
+              "different than the one that was used to create the "
+              "session. Maybe create a new session with "
+              "self.session()")
+        # If you modify this logic, make sure to modify it in _create_session
+        # as well.
+        sess = self._cached_session
+        with self._constrain_devices_and_set_default(
+            sess, use_gpu, force_gpu) as constrained_sess:
+          yield constrained_sess
+
 
 @tf_export("test.create_local_cluster")
 def create_local_cluster(num_workers,
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index f983cbef04..f68c0ddecb 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -22,6 +22,7 @@ import collections
 import copy
 import random
 import threading
+import weakref
 
 import numpy as np
 
@@ -40,6 +41,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
@@ -57,6 +59,33 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertRaises(ValueError, test_util.assert_ops_in_graph,
                       {"hello": "Variable"}, ops.get_default_graph())
 
+  def test_session_functions(self):
+    with self.test_session() as sess:
+      sess_ref = weakref.ref(sess)
+      with self.cached_session(graph=None, config=None) as sess2:
+        # We make sure that sess2 is sess.
+        assert sess2 is sess
+        # We make sure we raise an exception if we use cached_session with
+        # different values.
+        with self.assertRaises(ValueError):
+          with self.cached_session(graph=ops.Graph()) as sess2:
+            pass
+        with self.assertRaises(ValueError):
+          with self.cached_session(use_gpu=True) as sess2:
+            pass
+        with self.assertRaises(ValueError):
+          with self.cached_session(force_gpu=True) as sess2:
+            pass
+    # We make sure that test_session will cache the session even after the
+    # with scope.
+    assert not sess_ref()._closed
+    with self.session() as unique_sess:
+      unique_sess_ref = weakref.ref(unique_sess)
+      with self.session() as sess2:
+        assert sess2 is not unique_sess
+    # We make sure the session is closed when we leave the with statement.
+    assert unique_sess_ref()._closed
+
   def test_assert_equal_graph_def(self):
     with ops.Graph().as_default() as g:
       def_empty = g.as_graph_def()
@@ -666,6 +695,22 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertEqual(modes[2:], ["setup_eager", "run_eager"])
 
 
+# Its own test case to reproduce variable sharing issues which only pop up when
+# setUp() is overridden and super() is not called.
+class GraphAndEagerNoVariableSharing(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    pass  # Intentionally does not call TensorFlowTestCase's super()
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_no_variable_sharing(self):
+    variable_scope.get_variable(
+        name="step_size",
+        initializer=np.array(1e-5, np.float32),
+        use_resource=True,
+        trainable=False)
+
+
 class GarbageCollectionTest(test_util.TensorFlowTestCase):
 
   def test_no_reference_cycle_decorator(self):
diff --git a/tensorflow/python/grappler/cost_analyzer.h b/tensorflow/python/grappler/cost_analyzer.h
index b5364aa37a..d15858c1ee 100644
--- a/tensorflow/python/grappler/cost_analyzer.h
+++ b/tensorflow/python/grappler/cost_analyzer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ANALYZER_H_
-#define TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ANALYZER_H_
+#ifndef TENSORFLOW_PYTHON_GRAPPLER_COST_ANALYZER_H_
+#define TENSORFLOW_PYTHON_GRAPPLER_COST_ANALYZER_H_
 
 #include <iostream>
 #include "tensorflow/core/framework/cost_graph.pb.h"
@@ -80,4 +80,4 @@ class CostAnalyzer {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ANALYZER_H_
+#endif  // TENSORFLOW_PYTHON_GRAPPLER_COST_ANALYZER_H_
diff --git a/tensorflow/python/grappler/model_analyzer.h b/tensorflow/python/grappler/model_analyzer.h
index 97ffafabe1..9764a75b29 100644
--- a/tensorflow/python/grappler/model_analyzer.h
+++ b/tensorflow/python/grappler/model_analyzer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_MODEL_ANALYZER_H_
-#define TENSORFLOW_CORE_GRAPPLER_COSTS_MODEL_ANALYZER_H_
+#ifndef TENSORFLOW_PYTHON_GRAPPLER_MODEL_ANALYZER_H_
+#define TENSORFLOW_PYTHON_GRAPPLER_MODEL_ANALYZER_H_
 
 #include <iostream>
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -43,4 +43,4 @@ class ModelAnalyzer {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_MODEL_ANALYZER_H_
+#endif  // TENSORFLOW_PYTHON_GRAPPLER_MODEL_ANALYZER_H_
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 7eb7884d1d..f70da75610 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -140,6 +140,7 @@ py_library(
         ":backend",
         "//tensorflow/python/data",
         "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
     ],
 )
@@ -399,7 +400,7 @@ py_test(
 
 py_test(
     name = "local_test",
-    size = "medium",
+    size = "large",
     srcs = ["layers/local_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -625,6 +626,19 @@ cuda_py_test(
 )
 
 py_test(
+    name = "conv_utils_test",
+    size = "small",
+    srcs = ["utils/conv_utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
     name = "image_test",
     size = "medium",
     srcs = ["preprocessing/image_test.py"],
@@ -675,7 +689,7 @@ py_test(
 
 py_test(
     name = "training_test",
-    size = "large",
+    size = "enormous",
     srcs = ["engine/training_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py
index 5cff1f8f9c..dd0bbcff39 100644
--- a/tensorflow/python/keras/activations_test.py
+++ b/tensorflow/python/keras/activations_test.py
@@ -45,7 +45,7 @@ class KerasActivationsTest(test.TestCase):
       assert fn == ref_fn
 
   def test_softmax(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
       f = keras.backend.function([x], [keras.activations.softmax(x)])
       test_values = np.random.random((2, 5))
@@ -59,7 +59,7 @@ class KerasActivationsTest(test.TestCase):
       keras.activations.softmax(x)
 
   def test_temporal_softmax(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.placeholder(shape=(2, 2, 3))
       f = keras.backend.function([x], [keras.activations.softmax(x)])
       test_values = np.random.random((2, 2, 3)) * 10
@@ -73,7 +73,7 @@ class KerasActivationsTest(test.TestCase):
     alpha = 1.6732632423543772848170429916717
     scale = 1.0507009873554804934193349852946
 
-    with self.test_session():
+    with self.cached_session():
       positive_values = np.array([[1, 2]], dtype=keras.backend.floatx())
       result = f([positive_values])[0]
       self.assertAllClose(result, positive_values * scale, rtol=1e-05)
@@ -87,7 +87,7 @@ class KerasActivationsTest(test.TestCase):
     def softplus(x):
       return np.log(np.ones_like(x) + np.exp(x))
 
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
       f = keras.backend.function([x], [keras.activations.softplus(x)])
       test_values = np.random.random((2, 5))
@@ -99,7 +99,7 @@ class KerasActivationsTest(test.TestCase):
     def softsign(x):
       return np.divide(x, np.ones_like(x) + np.absolute(x))
 
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
       f = keras.backend.function([x], [keras.activations.softsign(x)])
       test_values = np.random.random((2, 5))
@@ -116,7 +116,7 @@ class KerasActivationsTest(test.TestCase):
         return z / (1 + z)
     sigmoid = np.vectorize(ref_sigmoid)
 
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
       f = keras.backend.function([x], [keras.activations.sigmoid(x)])
       test_values = np.random.random((2, 5))
@@ -130,7 +130,7 @@ class KerasActivationsTest(test.TestCase):
       z = 0.0 if x <= 0 else (1.0 if x >= 1 else x)
       return z
     hard_sigmoid = np.vectorize(ref_hard_sigmoid)
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
       f = keras.backend.function([x], [keras.activations.hard_sigmoid(x)])
       test_values = np.random.random((2, 5))
@@ -139,7 +139,7 @@ class KerasActivationsTest(test.TestCase):
     self.assertAllClose(result, expected, rtol=1e-05)
 
   def test_relu(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
       f = keras.backend.function([x], [keras.activations.relu(x)])
       test_values = np.random.random((2, 5))
@@ -148,7 +148,7 @@ class KerasActivationsTest(test.TestCase):
     self.assertAllClose(result, test_values, rtol=1e-05)
 
   def test_elu(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
       f = keras.backend.function([x], [keras.activations.elu(x, 0.5)])
       test_values = np.random.random((2, 5))
@@ -160,7 +160,7 @@ class KerasActivationsTest(test.TestCase):
     self.assertAllClose(result, true_result)
 
   def test_tanh(self):
-    with self.test_session():
+    with self.cached_session():
       test_values = np.random.random((2, 5))
       x = keras.backend.placeholder(ndim=2)
       exp = keras.activations.tanh(x)
diff --git a/tensorflow/python/keras/applications/__init__.py b/tensorflow/python/keras/applications/__init__.py
index cd9462d6b5..a8b6d55e41 100644
--- a/tensorflow/python/keras/applications/__init__.py
+++ b/tensorflow/python/keras/applications/__init__.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Keras Applications are canned architectures with pre-trained weights."""
 # pylint: disable=g-import-not-at-top
+# pylint: disable=g-bad-import-order
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -25,13 +26,49 @@ from tensorflow.python.keras import engine
 from tensorflow.python.keras import layers
 from tensorflow.python.keras import models
 from tensorflow.python.keras import utils
+from tensorflow.python.util import tf_inspect
+
+# `get_submodules_from_kwargs` has been introduced in 1.0.5, but we would
+# like to be able to handle prior versions. Note that prior to 1.0.5,
+# `keras_applications` did not expose a `__version__` attribute.
+if not hasattr(keras_applications, 'get_submodules_from_kwargs'):
+
+  if 'engine' in tf_inspect.getfullargspec(
+      keras_applications.set_keras_submodules)[0]:
+    keras_applications.set_keras_submodules(
+        backend=backend,
+        layers=layers,
+        models=models,
+        utils=utils,
+        engine=engine)
+  else:
+    keras_applications.set_keras_submodules(
+        backend=backend,
+        layers=layers,
+        models=models,
+        utils=utils)
+
+
+def keras_modules_injection(base_fun):
+  """Decorator injecting tf.keras replacements for Keras modules.
+
+  Arguments:
+      base_fun: Application function to decorate (e.g. `MobileNet`).
+
+  Returns:
+      Decorated function that injects keyword argument for the tf.keras
+      modules required by the Applications.
+  """
+
+  def wrapper(*args, **kwargs):
+    if hasattr(keras_applications, 'get_submodules_from_kwargs'):
+      kwargs['backend'] = backend
+      kwargs['layers'] = layers
+      kwargs['models'] = models
+      kwargs['utils'] = utils
+    return base_fun(*args, **kwargs)
+  return wrapper
 
-keras_applications.set_keras_submodules(
-    backend=backend,
-    engine=engine,
-    layers=layers,
-    models=models,
-    utils=utils)
 
 from tensorflow.python.keras.applications.densenet import DenseNet121
 from tensorflow.python.keras.applications.densenet import DenseNet169
@@ -39,7 +76,7 @@ from tensorflow.python.keras.applications.densenet import DenseNet201
 from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras.applications.mobilenet import MobileNet
-# TODO(fchollet): enable MobileNetV2 in next version.
+from tensorflow.python.keras.applications.mobilenet_v2 import MobileNetV2
 from tensorflow.python.keras.applications.nasnet import NASNetLarge
 from tensorflow.python.keras.applications.nasnet import NASNetMobile
 from tensorflow.python.keras.applications.resnet50 import ResNet50
diff --git a/tensorflow/python/keras/applications/applications_test.py b/tensorflow/python/keras/applications/applications_test.py
index ef3198a937..b15ca5990a 100644
--- a/tensorflow/python/keras/applications/applications_test.py
+++ b/tensorflow/python/keras/applications/applications_test.py
@@ -32,7 +32,8 @@ MODEL_LIST = [
     (applications.InceptionV3, 2048),
     (applications.InceptionResNetV2, 1536),
     (applications.MobileNet, 1024),
-    # TODO(fchollet): enable MobileNetV2 in next version.
+    # TODO(fchollet): enable MobileNetV2 tests when a new TensorFlow test image
+    # is released with keras_applications upgraded to 1.0.5 or above.
     (applications.DenseNet121, 1024),
     (applications.DenseNet169, 1664),
     (applications.DenseNet201, 1920),
@@ -44,11 +45,6 @@ MODEL_LIST = [
 class ApplicationsTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.parameters(*MODEL_LIST)
-  def test_classification_model(self, model_fn, _):
-    model = model_fn(classes=1000, weights=None)
-    self.assertEqual(model.output_shape[-1], 1000)
-
-  @parameterized.parameters(*MODEL_LIST)
   def test_feature_extration_model(self, model_fn, output_dim):
     model = model_fn(include_top=False, weights=None)
     self.assertEqual(model.output_shape, (None, None, None, output_dim))
diff --git a/tensorflow/python/keras/applications/densenet.py b/tensorflow/python/keras/applications/densenet.py
index fbdcc66d2d..172848bbdb 100644
--- a/tensorflow/python/keras/applications/densenet.py
+++ b/tensorflow/python/keras/applications/densenet.py
@@ -20,18 +20,39 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import densenet
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-DenseNet121 = densenet.DenseNet121
-DenseNet169 = densenet.DenseNet169
-DenseNet201 = densenet.DenseNet201
-decode_predictions = densenet.decode_predictions
-preprocess_input = densenet.preprocess_input
-
-tf_export('keras.applications.densenet.DenseNet121',
-          'keras.applications.DenseNet121')(DenseNet121)
-tf_export('keras.applications.densenet.DenseNet169',
-          'keras.applications.DenseNet169')(DenseNet169)
-tf_export('keras.applications.densenet.DenseNet201',
-          'keras.applications.DenseNet201')(DenseNet201)
-tf_export('keras.applications.densenet.preprocess_input')(preprocess_input)
+
+@tf_export('keras.applications.densenet.DenseNet121',
+           'keras.applications.DenseNet121')
+@keras_modules_injection
+def DenseNet121(*args, **kwargs):
+  return densenet.DenseNet121(*args, **kwargs)
+
+
+@tf_export('keras.applications.densenet.DenseNet169',
+           'keras.applications.DenseNet169')
+@keras_modules_injection
+def DenseNet169(*args, **kwargs):
+  return densenet.DenseNet169(*args, **kwargs)
+
+
+@tf_export('keras.applications.densenet.DenseNet201',
+           'keras.applications.DenseNet201')
+@keras_modules_injection
+def DenseNet201(*args, **kwargs):
+  return densenet.DenseNet201(*args, **kwargs)
+
+
+@tf_export('keras.applications.densenet.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return densenet.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.densenet.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return densenet.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py
index 70f8f6fb32..c25b5c2bdd 100644
--- a/tensorflow/python/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/applications/imagenet_utils.py
@@ -19,27 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import imagenet_utils
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-decode_predictions = imagenet_utils.decode_predictions
-preprocess_input = imagenet_utils.preprocess_input
 
-tf_export(
-    'keras.applications.imagenet_utils.decode_predictions',
-    'keras.applications.densenet.decode_predictions',
-    'keras.applications.inception_resnet_v2.decode_predictions',
-    'keras.applications.inception_v3.decode_predictions',
-    'keras.applications.mobilenet.decode_predictions',
-    'keras.applications.mobilenet_v2.decode_predictions',
-    'keras.applications.nasnet.decode_predictions',
-    'keras.applications.resnet50.decode_predictions',
-    'keras.applications.vgg16.decode_predictions',
-    'keras.applications.vgg19.decode_predictions',
-    'keras.applications.xception.decode_predictions',
-)(decode_predictions)
-tf_export(
-    'keras.applications.imagenet_utils.preprocess_input',
-    'keras.applications.resnet50.preprocess_input',
-    'keras.applications.vgg16.preprocess_input',
-    'keras.applications.vgg19.preprocess_input',
-)(preprocess_input)
+@tf_export('keras.applications.imagenet_utils.preprocess_input')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return imagenet_utils.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.imagenet_utils.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return imagenet_utils.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/applications/inception_resnet_v2.py
index 63debb4e0d..0b9ef371fa 100644
--- a/tensorflow/python/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/applications/inception_resnet_v2.py
@@ -20,13 +20,25 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import inception_resnet_v2
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-InceptionResNetV2 = inception_resnet_v2.InceptionResNetV2
-decode_predictions = inception_resnet_v2.decode_predictions
-preprocess_input = inception_resnet_v2.preprocess_input
 
-tf_export('keras.applications.inception_resnet_v2.InceptionResNetV2',
-          'keras.applications.InceptionResNetV2')(InceptionResNetV2)
-tf_export(
-    'keras.applications.inception_resnet_v2.preprocess_input')(preprocess_input)
+@tf_export('keras.applications.inception_resnet_v2.InceptionResNetV2',
+           'keras.applications.InceptionResNetV2')
+@keras_modules_injection
+def InceptionResNetV2(*args, **kwargs):
+  return inception_resnet_v2.InceptionResNetV2(*args, **kwargs)
+
+
+@tf_export('keras.applications.inception_resnet_v2.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return inception_resnet_v2.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.inception_resnet_v2.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return inception_resnet_v2.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/inception_v3.py b/tensorflow/python/keras/applications/inception_v3.py
index 87534086c8..ab76826e17 100644
--- a/tensorflow/python/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/applications/inception_v3.py
@@ -20,12 +20,25 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import inception_v3
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-InceptionV3 = inception_v3.InceptionV3
-decode_predictions = inception_v3.decode_predictions
-preprocess_input = inception_v3.preprocess_input
 
-tf_export('keras.applications.inception_v3.InceptionV3',
-          'keras.applications.InceptionV3')(InceptionV3)
-tf_export('keras.applications.inception_v3.preprocess_input')(preprocess_input)
+@tf_export('keras.applications.inception_v3.InceptionV3',
+           'keras.applications.InceptionV3')
+@keras_modules_injection
+def InceptionV3(*args, **kwargs):
+  return inception_v3.InceptionV3(*args, **kwargs)
+
+
+@tf_export('keras.applications.inception_v3.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return inception_v3.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.inception_v3.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return inception_v3.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py
index 3528f027b3..1f71a5ae99 100644
--- a/tensorflow/python/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/applications/mobilenet.py
@@ -20,12 +20,25 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import mobilenet
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-MobileNet = mobilenet.MobileNet
-decode_predictions = mobilenet.decode_predictions
-preprocess_input = mobilenet.preprocess_input
 
-tf_export('keras.applications.mobilenet.MobileNet',
-          'keras.applications.MobileNet')(MobileNet)
-tf_export('keras.applications.mobilenet.preprocess_input')(preprocess_input)
+@tf_export('keras.applications.mobilenet.MobileNet',
+           'keras.applications.MobileNet')
+@keras_modules_injection
+def MobileNet(*args, **kwargs):
+  return mobilenet.MobileNet(*args, **kwargs)
+
+
+@tf_export('keras.applications.mobilenet.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return mobilenet.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.mobilenet.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return mobilenet.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/mobilenet_v2.py b/tensorflow/python/keras/applications/mobilenet_v2.py
index 9194c3ee14..52ac5959ad 100644
--- a/tensorflow/python/keras/applications/mobilenet_v2.py
+++ b/tensorflow/python/keras/applications/mobilenet_v2.py
@@ -19,4 +19,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# TODO(fchollet): export MobileNetV2 as part of the public API in next version.
+from keras_applications import mobilenet_v2
+
+from tensorflow.python.keras.applications import keras_modules_injection
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('keras.applications.mobilenet_v2.MobileNetV2',
+           'keras.applications.MobileNetV2')
+@keras_modules_injection
+def MobileNetV2(*args, **kwargs):
+  return mobilenet_v2.MobileNetV2(*args, **kwargs)
+
+
+@tf_export('keras.applications.mobilenet_v2.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return mobilenet_v2.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.mobilenet_v2.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return mobilenet_v2.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/nasnet.py b/tensorflow/python/keras/applications/nasnet.py
index 26ff5db53f..44fc329d57 100644
--- a/tensorflow/python/keras/applications/nasnet.py
+++ b/tensorflow/python/keras/applications/nasnet.py
@@ -20,15 +20,32 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import nasnet
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-NASNetMobile = nasnet.NASNetMobile
-NASNetLarge = nasnet.NASNetLarge
-decode_predictions = nasnet.decode_predictions
-preprocess_input = nasnet.preprocess_input
 
-tf_export('keras.applications.nasnet.NASNetMobile',
-          'keras.applications.NASNetMobile')(NASNetMobile)
-tf_export('keras.applications.nasnet.NASNetLarge',
-          'keras.applications.NASNetLarge')(NASNetLarge)
-tf_export('keras.applications.nasnet.preprocess_input')(preprocess_input)
+@tf_export('keras.applications.nasnet.NASNetMobile',
+           'keras.applications.NASNetMobile')
+@keras_modules_injection
+def NASNetMobile(*args, **kwargs):
+  return nasnet.NASNetMobile(*args, **kwargs)
+
+
+@tf_export('keras.applications.nasnet.NASNetLarge',
+           'keras.applications.NASNetLarge')
+@keras_modules_injection
+def NASNetLarge(*args, **kwargs):
+  return nasnet.NASNetLarge(*args, **kwargs)
+
+
+@tf_export('keras.applications.nasnet.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return nasnet.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.nasnet.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return nasnet.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/resnet50.py b/tensorflow/python/keras/applications/resnet50.py
index 4d804a3c44..80d3f9044f 100644
--- a/tensorflow/python/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/applications/resnet50.py
@@ -20,11 +20,25 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import resnet50
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-ResNet50 = resnet50.ResNet50
-decode_predictions = resnet50.decode_predictions
-preprocess_input = resnet50.preprocess_input
 
-tf_export('keras.applications.resnet50.ResNet50',
-          'keras.applications.ResNet50')(ResNet50)
+@tf_export('keras.applications.resnet50.ResNet50',
+           'keras.applications.ResNet50')
+@keras_modules_injection
+def ResNet50(*args, **kwargs):
+  return resnet50.ResNet50(*args, **kwargs)
+
+
+@tf_export('keras.applications.resnet50.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return resnet50.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.resnet50.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return resnet50.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/vgg16.py b/tensorflow/python/keras/applications/vgg16.py
index c420d9b81e..8557d26931 100644
--- a/tensorflow/python/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/applications/vgg16.py
@@ -20,11 +20,25 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import vgg16
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-VGG16 = vgg16.VGG16
-decode_predictions = vgg16.decode_predictions
-preprocess_input = vgg16.preprocess_input
 
-tf_export('keras.applications.vgg16.VGG16',
-          'keras.applications.VGG16')(VGG16)
+@tf_export('keras.applications.vgg16.VGG16',
+           'keras.applications.VGG16')
+@keras_modules_injection
+def VGG16(*args, **kwargs):
+  return vgg16.VGG16(*args, **kwargs)
+
+
+@tf_export('keras.applications.vgg16.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return vgg16.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.vgg16.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return vgg16.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/vgg19.py b/tensorflow/python/keras/applications/vgg19.py
index 73d3d1d1c3..8fc04413a0 100644
--- a/tensorflow/python/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/applications/vgg19.py
@@ -20,11 +20,25 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import vgg19
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-VGG19 = vgg19.VGG19
-decode_predictions = vgg19.decode_predictions
-preprocess_input = vgg19.preprocess_input
 
-tf_export('keras.applications.vgg19.VGG19',
-          'keras.applications.VGG19')(VGG19)
+@tf_export('keras.applications.vgg19.VGG19',
+           'keras.applications.VGG19')
+@keras_modules_injection
+def VGG19(*args, **kwargs):
+  return vgg19.VGG19(*args, **kwargs)
+
+
+@tf_export('keras.applications.vgg19.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return vgg19.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.vgg19.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return vgg19.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/xception.py b/tensorflow/python/keras/applications/xception.py
index 5b221ac8e0..960e6dec69 100644
--- a/tensorflow/python/keras/applications/xception.py
+++ b/tensorflow/python/keras/applications/xception.py
@@ -20,12 +20,25 @@ from __future__ import division
 from __future__ import print_function
 
 from keras_applications import xception
+
+from tensorflow.python.keras.applications import keras_modules_injection
 from tensorflow.python.util.tf_export import tf_export
 
-Xception = xception.Xception
-decode_predictions = xception.decode_predictions
-preprocess_input = xception.preprocess_input
 
-tf_export('keras.applications.xception.Xception',
-          'keras.applications.Xception')(Xception)
-tf_export('keras.applications.xception.preprocess_input')(preprocess_input)
+@tf_export('keras.applications.xception.Xception',
+           'keras.applications.Xception')
+@keras_modules_injection
+def Xception(*args, **kwargs):
+  return xception.Xception(*args, **kwargs)
+
+
+@tf_export('keras.applications.xception.decode_predictions')
+@keras_modules_injection
+def decode_predictions(*args, **kwargs):
+  return xception.decode_predictions(*args, **kwargs)
+
+
+@tf_export('keras.applications.xception.preprocess_input')
+@keras_modules_injection
+def preprocess_input(*args, **kwargs):
+  return xception.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 418586b85f..b52ab7f05c 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -94,6 +94,14 @@ _IMAGE_DATA_FORMAT = 'channels_last'
 # We assume our devices don't change henceforth.
 _LOCAL_DEVICES = None
 
+# This dictionary holds a mapping between a graph and variables to initialize
+# in the graph.
+_GRAPH_VARIABLES = {}
+
+# This dictionary holds a mapping between a graph and TF optimizers created in
+# the graph.
+_GRAPH_TF_OPTIMIZERS = {}
+
 
 @tf_export('keras.backend.backend')
 def backend():
@@ -309,6 +317,8 @@ def clear_session():
   """
   global _SESSION
   global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  global _GRAPH_VARIABLES  # pylint: disable=global-variable-not-assigned
+  global _GRAPH_TF_OPTIMIZERS  # pylint: disable=global-variable-not-assigned
   ops.reset_default_graph()
   reset_uids()
   _SESSION = None
@@ -316,6 +326,8 @@ def clear_session():
       False, shape=(), name='keras_learning_phase')
   _GRAPH_LEARNING_PHASES = {}
   _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = phase
+  _GRAPH_VARIABLES.pop(ops.get_default_graph(), None)
+  _GRAPH_TF_OPTIMIZERS.pop(ops.get_default_graph(), None)
 
 
 @tf_export('keras.backend.manual_variable_initialization')
@@ -651,12 +663,42 @@ def variable(value, dtype=None, name=None, constraint=None):
   elif hasattr(value, 'shape'):
     v._keras_shape = int_shape(value)
   v._uses_learning_phase = False
+  track_variable(v)
   return v
 
 
+def track_tf_optimizer(tf_optimizer):
+  """Tracks the given TF optimizer for initialization of its variables."""
+  if context.executing_eagerly():
+    return
+  graph = ops.get_default_graph()
+  if graph not in _GRAPH_TF_OPTIMIZERS:
+    _GRAPH_TF_OPTIMIZERS[graph] = set()
+  _GRAPH_TF_OPTIMIZERS[graph].add(tf_optimizer)
+
+
+def track_variable(v):
+  """Tracks the given variable for initialization."""
+  if context.executing_eagerly():
+    return
+  graph = v.graph if hasattr(v, 'graph') else ops.get_default_graph()
+  if graph not in _GRAPH_VARIABLES:
+    _GRAPH_VARIABLES[graph] = set()
+  _GRAPH_VARIABLES[graph].add(v)
+
+
+def _get_variables(graph=None):
+  """Returns variables corresponding to the given graph for initialization."""
+  assert not context.executing_eagerly()
+  variables = _GRAPH_VARIABLES.get(graph, set())
+  for opt in _GRAPH_TF_OPTIMIZERS.get(graph, set()):
+    variables.update(opt.optimizer.variables())
+  return variables
+
+
 def _initialize_variables(session):
   """Utility to initialize uninitialized variables on the fly."""
-  variables = variables_module.global_variables()
+  variables = _get_variables(ops.get_default_graph())
   candidate_vars = []
   for v in variables:
     if not getattr(v, '_keras_initialized', False):
@@ -974,6 +1016,7 @@ def zeros(shape, dtype=None, name=None):
     v = array_ops.zeros(shape=shape, dtype=tf_dtype, name=name)
     if py_all(v.shape.as_list()):
       return variable(v, dtype=dtype, name=name)
+    track_variable(v)
     return v
 
 
@@ -1008,6 +1051,7 @@ def ones(shape, dtype=None, name=None):
     v = array_ops.ones(shape=shape, dtype=tf_dtype, name=name)
     if py_all(v.shape.as_list()):
       return variable(v, dtype=dtype, name=name)
+    track_variable(v)
     return v
 
 
@@ -2766,7 +2810,8 @@ class Function(object):
       outputs: Output tensors to fetch.
       updates: Additional update ops to be run at function call.
       name: A name to help users identify what this function does.
-      session_kwargs: Arguments to `tf.Session.run()`: `fetches`, `feed_dict`.
+      session_kwargs: Arguments to `tf.Session.run()`:
+                      `fetches`, `feed_dict`, `options`, `run_metadata`.
   """
 
   def __init__(self, inputs, outputs, updates=None, name=None,
@@ -2800,6 +2845,8 @@ class Function(object):
     self.fetches = session_kwargs.pop('fetches', [])
     if not isinstance(self.fetches, list):
       self.fetches = [self.fetches]
+    self.run_options = session_kwargs.pop('options', None)
+    self.run_metadata = session_kwargs.pop('run_metadata', None)
     # The main use case of `fetches` being passed to a model is the ability
     # to run custom updates
     # This requires us to wrap fetches in `identity` ops.
@@ -2857,6 +2904,9 @@ class Function(object):
       callable_opts.fetch.append(x.name)
     # Handle updates.
     callable_opts.target.append(self.updates_op.name)
+    # Handle run_options.
+    if self.run_options:
+      callable_opts.run_options.CopyFrom(self.run_options)
     # Create callable.
     callable_fn = session._make_callable_from_options(callable_opts)
     # Cache parameters corresponding to the generated callable, so that
@@ -2915,7 +2965,8 @@ class Function(object):
         session != self._session):
       self._make_callable(feed_arrays, feed_symbols, symbol_vals, session)
 
-    fetched = self._callable_fn(*array_vals)
+    fetched = self._callable_fn(*array_vals,
+                                run_metadata=self.run_metadata)
     self._call_fetch_callbacks(fetched[-len(self._fetches):])
     return fetched[:len(self.outputs)]
 
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 40e7910061..266af56611 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -21,6 +21,7 @@ from absl.testing import parameterized
 import numpy as np
 import scipy.sparse
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -118,7 +119,7 @@ class BackendUtilsTest(test.TestCase):
     self.assertEqual(keras.backend.get_uid('foo'), 1)
 
   def test_learning_phase(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       keras.backend.set_learning_phase(1)
       self.assertEqual(keras.backend.learning_phase(), 1)
       with self.assertRaises(ValueError):
@@ -132,7 +133,7 @@ class BackendUtilsTest(test.TestCase):
       sess.run(y, feed_dict={x: np.random.random((2, 3))})
 
   def test_learning_phase_scope(self):
-    with self.test_session():
+    with self.cached_session():
       initial_learning_phase = keras.backend.learning_phase()
       with keras.backend.learning_phase_scope(1) as lp:
         self.assertEqual(lp, 1)
@@ -155,7 +156,7 @@ class BackendUtilsTest(test.TestCase):
     self.assertEqual(keras.backend.int_shape(x), (None, 4))
 
   def test_in_train_phase(self):
-    with self.test_session():
+    with self.cached_session():
       y1 = keras.backend.variable(1)
       y2 = keras.backend.variable(2)
       y = keras.backend.in_train_phase(y1, y2)
@@ -193,7 +194,7 @@ class BackendUtilsTest(test.TestCase):
       self.assertEqual(y.op.name[:12], 'StopGradient')
 
   def test_function_tf_feed_symbols(self):
-    with self.test_session():
+    with self.cached_session():
       # Test feeding a resource variable to `function`.
       x1 = keras.backend.placeholder(shape=())
       x2 = keras.backend.placeholder(shape=())
@@ -231,7 +232,7 @@ class BackendUtilsTest(test.TestCase):
     # keras.backend.function() these do not have control dependency on `outputs`
     # so they can run in parallel. Also they should not contribute to output of
     # keras.backend.function().
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.variable(0.)
       y = keras.backend.variable(0.)
       x_placeholder = keras.backend.placeholder(shape=())
@@ -252,7 +253,7 @@ class BackendUtilsTest(test.TestCase):
     # constructor but we can modify the values in the dictionary. Through
     # this feed_dict we can provide additional substitutions besides Keras
     # inputs.
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.variable(0.)
       y = keras.backend.variable(0.)
       x_placeholder = keras.backend.placeholder(shape=())
@@ -277,6 +278,29 @@ class BackendUtilsTest(test.TestCase):
       self.assertEqual(
           keras.backend.get_session().run(fetches=[x, y]), [30., 40.])
 
+  def test_function_tf_run_options_with_run_metadata(self):
+    with self.test_session():
+      x_placeholder = keras.backend.placeholder(shape=())
+      y_placeholder = keras.backend.placeholder(shape=())
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
+      # enable run_options.
+      f = keras.backend.function(inputs=[x_placeholder, y_placeholder],
+                                 outputs=[x_placeholder + y_placeholder],
+                                 options=run_options,
+                                 run_metadata=run_metadata)
+      output = f([10., 20.])
+      self.assertEqual(output, [30.])
+      self.assertGreater(len(run_metadata.partition_graphs), 0)
+      # disable run_options.
+      f1 = keras.backend.function(inputs=[x_placeholder, y_placeholder],
+                                  outputs=[x_placeholder + y_placeholder],
+                                  run_metadata=run_metadata)
+      output1 = f1([10., 20.])
+      self.assertEqual(output1, [30.])
+      self.assertEqual(len(run_metadata.partition_graphs), 0)
+
   def test_function_fetch_callbacks(self):
 
     class CallbackStub(object):
@@ -289,7 +313,7 @@ class BackendUtilsTest(test.TestCase):
         self.times_called += 1
         self.callback_result = result
 
-    with self.test_session():
+    with self.cached_session():
       callback = CallbackStub()
       x_placeholder = keras.backend.placeholder(shape=())
       y_placeholder = keras.backend.placeholder(shape=())
@@ -311,39 +335,39 @@ class BackendUtilsTest(test.TestCase):
 class BackendVariableTest(test.TestCase):
 
   def test_zeros(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.zeros((3, 4))
       val = keras.backend.eval(x)
       self.assertAllClose(val, np.zeros((3, 4)))
 
   def test_ones(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.ones((3, 4))
       val = keras.backend.eval(x)
       self.assertAllClose(val, np.ones((3, 4)))
 
   def test_eye(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.eye(4)
       val = keras.backend.eval(x)
       self.assertAllClose(val, np.eye(4))
 
   def test_zeros_like(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.zeros((3, 4))
       y = keras.backend.zeros_like(x)
       val = keras.backend.eval(y)
       self.assertAllClose(val, np.zeros((3, 4)))
 
   def test_ones_like(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.zeros((3, 4))
       y = keras.backend.ones_like(x)
       val = keras.backend.eval(y)
       self.assertAllClose(val, np.ones((3, 4)))
 
   def test_random_uniform_variable(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.random_uniform_variable((30, 20), low=1, high=2, seed=0)
       val = keras.backend.eval(x)
       self.assertAllClose(val.mean(), 1.5, atol=1e-1)
@@ -351,7 +375,7 @@ class BackendVariableTest(test.TestCase):
       self.assertAllClose(val.min(), 1., atol=1e-1)
 
   def test_random_normal_variable(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.random_normal_variable((30, 20), 1., 0.5,
                                                seed=0)
       val = keras.backend.eval(x)
@@ -359,20 +383,20 @@ class BackendVariableTest(test.TestCase):
       self.assertAllClose(val.std(), 0.5, atol=1e-1)
 
   def test_count_params(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.backend.zeros((4, 5))
       val = keras.backend.count_params(x)
       self.assertAllClose(val, 20)
 
   def test_constant(self):
-    with self.test_session():
+    with self.cached_session():
       ref_val = np.random.random((3, 4)).astype('float32')
       x = keras.backend.constant(ref_val)
       val = keras.backend.eval(x)
       self.assertAllClose(val, ref_val)
 
   def test_sparse_variable(self):
-    with self.test_session():
+    with self.cached_session():
       val = scipy.sparse.eye(10)
       x = keras.backend.variable(val)
       self.assertTrue(isinstance(x, sparse_tensor.SparseTensor))
@@ -421,7 +445,7 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.argmax, np.argmax),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.test_session():
+      with self.cached_session():
         compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
                                          keras_kwargs={'axis': 1},
                                          np_kwargs={'axis': 1})
@@ -447,7 +471,7 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.exp, np.exp),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.test_session():
+      with self.cached_session():
         compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7))
 
     ops_to_test = [
@@ -455,19 +479,19 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.log, np.log),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.test_session():
+      with self.cached_session():
         compare_single_input_op_to_numpy(keras_op, np_op,
                                          input_shape=(4, 7),
                                          negative_values=False)
 
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(
           keras.backend.clip, np.clip,
           input_shape=(6, 4),
           keras_kwargs={'min_value': 0.1, 'max_value': 2.4},
           np_kwargs={'a_min': 0.1, 'a_max': 1.4})
 
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(
           keras.backend.pow, np.power,
           input_shape=(6, 4),
@@ -486,14 +510,14 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.minimum, np.minimum),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.test_session():
+      with self.cached_session():
         compare_two_inputs_op_to_numpy(keras_op, np_op,
                                        input_shape_a=(4, 7),
                                        input_shape_b=(4, 7))
 
   def test_relu(self):
     x = ops.convert_to_tensor([[-4, 0], [2, 7]], 'float32')
-    with self.test_session():
+    with self.cached_session():
       # standard relu
       relu_op = keras.backend.relu(x)
       self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
@@ -555,7 +579,7 @@ class BackendLinearAlgebraTest(test.TestCase):
 class BackendShapeOpsTest(test.TestCase):
 
   def test_reshape(self):
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(keras.backend.reshape, np.reshape,
                                        input_shape=(4, 7),
                                        keras_args=[(2, 14)],
@@ -568,7 +592,7 @@ class BackendShapeOpsTest(test.TestCase):
     self.assertEqual(y.get_shape().as_list(), [1, 2, 5])
 
   def test_permute_dimensions(self):
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(keras.backend.permute_dimensions,
                                        np.transpose,
                                        input_shape=(4, 7),
@@ -647,14 +671,14 @@ class BackendShapeOpsTest(test.TestCase):
     self.assertEqual(y.get_shape().as_list(), [1, 2, 3])
 
   def test_flatten(self):
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(keras.backend.flatten,
                                        np.reshape,
                                        input_shape=(4, 7, 6),
                                        np_args=[(4 * 7 * 6,)])
 
   def test_batch_flatten(self):
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(keras.backend.batch_flatten,
                                        np.reshape,
                                        input_shape=(4, 7, 6),
@@ -669,7 +693,7 @@ class BackendShapeOpsTest(test.TestCase):
       y[:, padding[0]:-padding[1], :] = x
       return y
 
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(keras.backend.temporal_padding,
                                        ref_op,
                                        input_shape=(4, 7, 6),
@@ -692,7 +716,7 @@ class BackendShapeOpsTest(test.TestCase):
         y[:, :, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1]] = x
       return y
 
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(
           keras.backend.spatial_2d_padding,
           ref_op,
@@ -735,7 +759,7 @@ class BackendShapeOpsTest(test.TestCase):
           padding[2][0]:-padding[2][1]] = x
       return y
 
-    with self.test_session():
+    with self.cached_session():
       compare_single_input_op_to_numpy(
           keras.backend.spatial_3d_padding,
           ref_op,
@@ -757,7 +781,7 @@ class BackendShapeOpsTest(test.TestCase):
 class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 
   def test_bias_add(self):
-    with self.test_session():
+    with self.cached_session():
       keras_op = keras.backend.bias_add
       np_op = np.add
       compare_two_inputs_op_to_numpy(keras_op, np_op,
@@ -783,7 +807,8 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         keras.backend.bias_add(x, b, data_format='unknown')
 
   def test_bias_add_channels_first(self):
-    with self.test_session():
+    with self.cached_session():
+
       def keras_op(x, b):
         return keras.backend.bias_add(x, b, data_format='channels_first')
 
@@ -959,7 +984,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                              strides,
                                              output_shape,
                                              'channels_last')
-          with self.test_session():
+          with self.cached_session():
             conv_cf = keras.backend.eval(conv_cf)
             conv_cl = keras.backend.eval(conv_cl)
 
@@ -1009,7 +1034,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                                   output_shape,
                                                   'channels_last')
 
-    with self.test_session():
+    with self.cached_session():
       local_conv = keras.backend.eval(local_conv)
       local_conv_dim = keras.backend.eval(local_conv_dim)
 
@@ -1167,7 +1192,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-    with self.test_session():
+    with self.cached_session():
       for i, kwargs in enumerate(kwargs_list):
         last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
                                                              initial_states,
@@ -1263,7 +1288,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-    with self.test_session():
+    with self.cached_session():
       for i, kwargs in enumerate(kwargs_list):
         last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
                                                              initial_states,
@@ -1359,7 +1384,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 class TestCTC(test.TestCase):
 
   def test_ctc_decode(self):
-    with self.test_session():
+    with self.cached_session():
       depth = 6
       seq_len_0 = 5
       input_prob_matrix_0 = np.asarray(
@@ -1384,8 +1409,8 @@ class TestCTC(test.TestCase):
           np.array([seq_len_0], dtype=np.int32))
       # batch_size length vector of negative log probabilities
       log_prob_truth = np.array([
-          0.584855,  # output beam 0
-          0.389139  # output beam 1
+          -3.5821197,  # output beam 0
+          -3.777835    # output beam 1
       ], np.float32)[np.newaxis, :]
 
       decode_truth = [np.array([1, 0]), np.array([0, 1, 0])]
@@ -1408,7 +1433,7 @@ class TestCTC(test.TestCase):
       self.assertAllClose(log_prob_truth, log_prob_pred)
 
   def test_ctc_batch_cost(self):
-    with self.test_session():
+    with self.cached_session():
       label_lens = np.expand_dims(np.asarray([5, 4]), 1)
       input_lens = np.expand_dims(np.asarray([5, 5]), 1)  # number of timesteps
       loss_log_probs = [3.34211, 5.42262]
@@ -1464,13 +1489,13 @@ class TestCTC(test.TestCase):
 class TestRandomOps(test.TestCase):
 
   def test_random_binomial(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(123)
       x = keras.backend.random_binomial((1000, 1000), p=0.5)
       self.assertAllClose(np.mean(keras.backend.eval(x)), 0.5, atol=0.1)
 
   def test_truncated_normal(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(123)
       x = keras.backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0)
       y = keras.backend.eval(x)
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index f2feeb85a1..befe82f4ec 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -833,7 +833,7 @@ class TensorBoard(Callback):
   Raises:
       ValueError: If histogram_freq is set and no validation data is provided.
 
-  @compatbility(eager)
+  @compatibility(eager)
   Using `Tensorboard` callback will work while eager execution is enabled,
   however outputting histogram summaries of weights and gradients is not
   supported, and thus `histogram_freq` will be ignored.
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index e84e023384..7675a6586f 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -235,11 +235,8 @@ class KerasCallbacksTest(test.TestCase):
           num_classes=NUM_CLASSES)
       y_test = keras.utils.to_categorical(y_test)
       y_train = keras.utils.to_categorical(y_train)
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
       model.compile(
           loss='categorical_crossentropy',
           optimizer='rmsprop',
@@ -298,9 +295,8 @@ class KerasCallbacksTest(test.TestCase):
           test_samples=50,
           input_shape=(1,),
           num_classes=NUM_CLASSES)
-      model = keras.models.Sequential((keras.layers.Dense(
-          1, input_dim=1, activation='relu'), keras.layers.Dense(
-              1, activation='sigmoid'),))
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=1, num_classes=1, input_dim=1)
       model.compile(
           optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
 
@@ -334,11 +330,8 @@ class KerasCallbacksTest(test.TestCase):
           num_classes=NUM_CLASSES)
       y_test = keras.utils.to_categorical(y_test)
       y_train = keras.utils.to_categorical(y_train)
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
       model.compile(
           loss='categorical_crossentropy',
           optimizer='sgd',
@@ -388,12 +381,8 @@ class KerasCallbacksTest(test.TestCase):
       def make_model():
         random_seed.set_random_seed(1234)
         np.random.seed(1337)
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.Dense(
-                NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-        model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-
+        model = testing_utils.get_small_sequential_mlp(
+            num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
         model.compile(
             loss='categorical_crossentropy',
             optimizer=keras.optimizers.SGD(lr=0.1),
@@ -498,12 +487,8 @@ class KerasCallbacksTest(test.TestCase):
 
       def make_model():
         np.random.seed(1337)
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.Dense(
-                NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-        model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-
+        model = testing_utils.get_small_sequential_mlp(
+            num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
         model.compile(
             loss='categorical_crossentropy',
             optimizer=keras.optimizers.SGD(lr=0.1),
@@ -985,9 +970,8 @@ class KerasCallbacksTest(test.TestCase):
         yield x, y
 
     with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(10, input_dim=100, activation='relu'))
-      model.add(keras.layers.Dense(10, activation='softmax'))
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=10, num_classes=10, input_dim=100)
       model.compile(
           loss='categorical_crossentropy',
           optimizer='sgd',
@@ -1083,11 +1067,8 @@ class KerasCallbacksTest(test.TestCase):
       y_test = keras.utils.to_categorical(y_test)
       y_train = keras.utils.to_categorical(y_train)
 
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
       model.compile(
           loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
 
@@ -1179,40 +1160,36 @@ class KerasCallbacksTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_Tensorboard_eager(self):
-    with self.test_session():
-      temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='binary_crossentropy',
-          optimizer=adam.AdamOptimizer(0.01),
-          metrics=['accuracy'])
-
-      cbks = [keras.callbacks.TensorBoard(log_dir=temp_dir)]
+    temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
 
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=2,
-          verbose=0)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
 
-      self.assertTrue(os.path.exists(temp_dir))
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=adam.AdamOptimizer(0.01),
+        metrics=['accuracy'])
+
+    cbks = [keras.callbacks.TensorBoard(log_dir=temp_dir)]
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=2,
+        verbose=0)
+
+    self.assertTrue(os.path.exists(temp_dir))
 
   def test_RemoteMonitorWithJsonPayload(self):
     if requests is None:
diff --git a/tensorflow/python/keras/constraints_test.py b/tensorflow/python/keras/constraints_test.py
index 84e2db1033..4f674ea7c5 100644
--- a/tensorflow/python/keras/constraints_test.py
+++ b/tensorflow/python/keras/constraints_test.py
@@ -49,7 +49,7 @@ class KerasConstraintsTest(test.TestCase):
       assert fn.__class__ == ref_fn.__class__
 
   def test_max_norm(self):
-    with self.test_session():
+    with self.cached_session():
       array = get_example_array()
       for m in get_test_values():
         norm_instance = keras.constraints.max_norm(m)
@@ -69,13 +69,13 @@ class KerasConstraintsTest(test.TestCase):
       self.assertAllClose(x_normed_actual, x_normed_target, rtol=1e-05)
 
   def test_non_neg(self):
-    with self.test_session():
+    with self.cached_session():
       non_neg_instance = keras.constraints.non_neg()
       normed = non_neg_instance(keras.backend.variable(get_example_array()))
       assert np.all(np.min(keras.backend.eval(normed), axis=1) == 0.)
 
   def test_unit_norm(self):
-    with self.test_session():
+    with self.cached_session():
       unit_norm_instance = keras.constraints.unit_norm()
       normalized = unit_norm_instance(
           keras.backend.variable(get_example_array()))
@@ -87,7 +87,7 @@ class KerasConstraintsTest(test.TestCase):
       assert np.abs(largest_difference) < 10e-5
 
   def test_min_max_norm(self):
-    with self.test_session():
+    with self.cached_session():
       array = get_example_array()
       for m in get_test_values():
         norm_instance = keras.constraints.min_max_norm(min_value=m,
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index d6d3db21fb..2c7f1036fb 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
+import collections as collections_lib
 import enum  # pylint: disable=g-bad-import-order
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
 
@@ -50,6 +50,7 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
+from tensorflow.tools.docs import doc_controls
 
 
 class CallConvention(enum.Enum):
@@ -79,6 +80,7 @@ class Layer(checkpointable.CheckpointableBase):
   Users will just instantiate a layer and then treat it as a callable.
 
   We recommend that descendants of `Layer` implement the following methods:
+
   * `__init__()`: Save configuration in member variables
   * `build()`: Called once from `__call__`, when we know the shapes of inputs
     and `dtype`. Should have the calls to `add_weight()`, and then
@@ -272,6 +274,7 @@ class Layer(checkpointable.CheckpointableBase):
       return []
     return self._updates
 
+  @doc_controls.for_subclass_implementers
   def add_update(self, updates, inputs=None):
     """Add update op(s), potentially dependent on layer inputs.
 
@@ -372,6 +375,7 @@ class Layer(checkpointable.CheckpointableBase):
     else:
       return self._losses
 
+  @doc_controls.for_subclass_implementers
   def add_loss(self, losses, inputs=None):
     """Add loss tensor(s), potentially dependent on layer inputs.
 
@@ -463,10 +467,12 @@ class Layer(checkpointable.CheckpointableBase):
     """Creates the variables of the layer."""
     self.built = True
 
+  @doc_controls.for_subclass_implementers
   def add_variable(self, *args, **kwargs):
     """Alias for `add_weight`."""
     return self.add_weight(*args, **kwargs)
 
+  @doc_controls.for_subclass_implementers
   def add_weight(self,
                  name,
                  shape,
@@ -479,7 +485,7 @@ class Layer(checkpointable.CheckpointableBase):
                  use_resource=None,
                  synchronization=vs.VariableSynchronization.AUTO,
                  aggregation=vs.VariableAggregation.NONE,
-                 getter=None):
+                 **kwargs):
     """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
@@ -507,7 +513,8 @@ class Layer(checkpointable.CheckpointableBase):
       aggregation: Indicates how a distributed variable will be aggregated.
         Accepted values are constants defined in the class
         `tf.VariableAggregation`.
-      getter: Variable getter argument to be passed to the `Checkpointable` API.
+      **kwargs: Additional keyword arguments. Accepted values are `getter` and
+        `collections`.
 
     Returns:
       The created variable.  Usually either a `Variable` or `ResourceVariable`
@@ -520,6 +527,13 @@ class Layer(checkpointable.CheckpointableBase):
       ValueError: When giving unsupported dtype and no initializer or when
         trainable has been set to True with synchronization set as `ON_READ`.
     """
+    # Validate optional keyword arguments.
+    for kwarg in kwargs:
+      if kwarg not in ['getter', 'collections']:
+        raise TypeError('Unknown keyword argument:', kwarg)
+    getter = kwargs.pop('getter', None)
+    collections = kwargs.pop('collections', None)
+
     if dtype is None:
       dtype = self.dtype or backend.floatx()
     dtype = dtypes.as_dtype(dtype)
@@ -568,8 +582,10 @@ class Layer(checkpointable.CheckpointableBase):
         trainable=trainable and self.trainable,
         partitioner=partitioner,
         use_resource=use_resource,
+        collections=collections,
         synchronization=synchronization,
         aggregation=aggregation)
+    backend.track_variable(variable)
 
     if regularizer is not None:
       # TODO(fchollet): in the future, this should be handled at the
@@ -646,6 +662,7 @@ class Layer(checkpointable.CheckpointableBase):
           activity_regularization = self._activity_regularizer(output)
         self.add_loss(activity_regularization, inputs=inputs)
 
+  @doc_controls.for_subclass_implementers
   def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
     """This is where the layer's logic lives.
 
@@ -1412,11 +1429,13 @@ class Layer(checkpointable.CheckpointableBase):
                            'instead.' % self.name)
 
   @property
+  @doc_controls.do_not_doc_inheritable
   def inbound_nodes(self):
     """Deprecated, do NOT use! Only for compatibility with external Keras."""
     return self._inbound_nodes
 
   @property
+  @doc_controls.do_not_doc_inheritable
   def outbound_nodes(self):
     """Deprecated, do NOT use! Only for compatibility with external Keras."""
     return self._outbound_nodes
@@ -1871,7 +1890,7 @@ def get_default_graph_uid_map():
   graph = ops.get_default_graph()
   name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
   if name_uid_map is None:
-    name_uid_map = collections.defaultdict(int)
+    name_uid_map = collections_lib.defaultdict(int)
     backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
   return name_uid_map
 
@@ -1886,6 +1905,7 @@ def make_variable(name,
                   validate_shape=True,
                   constraint=None,
                   use_resource=None,
+                  collections=None,
                   synchronization=vs.VariableSynchronization.AUTO,
                   aggregation=vs.VariableAggregation.NONE,
                   partitioner=None):  # pylint: disable=unused-argument
@@ -1919,6 +1939,8 @@ def make_variable(name,
     validate_shape: Passed to `vs.variable`.
     constraint: Constraint instance (callable).
     use_resource: Whether to use a `ResourceVariable`.
+    collections: List of graph collections keys. The new variable is added to
+      these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
     synchronization: Indicates when a distributed a variable will be
       aggregated. Accepted values are constants defined in the class
       `tf.VariableSynchronization`. By default the synchronization is set to
@@ -1960,6 +1982,7 @@ def make_variable(name,
       validate_shape=validate_shape,
       constraint=constraint,
       use_resource=use_resource,
+      collections=collections,
       synchronization=synchronization,
       aggregation=aggregation)
   return v
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 708fa1c807..cd74e36e68 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -394,10 +394,10 @@ class Network(base_layer.Layer):
     no_dependency = isinstance(value, data_structures.NoDependency)
     value = data_structures.sticky_attribute_assignment(
         checkpointable=self, value=value, name=name)
-    if isinstance(value, (
-        base_layer.Layer,
-        Network,
-        data_structures.CheckpointableDataStructure)):
+    if (isinstance(value, (base_layer.Layer,
+                           Network,
+                           data_structures.CheckpointableDataStructure))
+        or checkpointable_layer_utils.has_weights(value)):
       try:
         is_graph_network = self._is_graph_network
       except AttributeError:
@@ -689,14 +689,14 @@ class Network(base_layer.Layer):
   def trainable_weights(self):
     return checkpointable_layer_utils.gather_trainable_weights(
         trainable=self.trainable,
-        sub_layers=self.layers,
+        sub_layers=self._layers,
         extra_variables=self._extra_variables)
 
   @property
   def non_trainable_weights(self):
     return checkpointable_layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
-        sub_layers=self.layers,
+        sub_layers=self._layers,
         extra_variables=self._extra_variables)
 
   @property
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index b7c2e9cb53..441f3f4948 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -687,7 +687,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
   def test_keras_optimizer_warning(self):
     graph = ops.Graph()
-    with graph.as_default(), self.test_session(graph):
+    with graph.as_default(), self.session(graph):
       model = keras.models.Sequential()
       model.add(keras.layers.Dense(2, input_shape=(3,)))
       model.add(keras.layers.Dense(3))
@@ -741,7 +741,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
   def test_no_graph_pollution(self):
     with context.graph_mode():
       graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph) as session:
+      with graph.as_default(), self.session(graph) as session:
         model = SubclassedModel()
         temp_dir = self.get_temp_dir()
         prefix = os.path.join(temp_dir, 'ckpt')
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 415b15fde1..9f4019e29c 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -239,9 +239,9 @@ class Sequential(Model):
     x = inputs
     for layer in self.layers:
       kwargs = {}
-      if 'mask' in tf_inspect.getargspec(layer.call).args:
+      if 'mask' in tf_inspect.getfullargspec(layer.call).args:
         kwargs['mask'] = mask
-      if 'training' in tf_inspect.getargspec(layer.call).args:
+      if 'training' in tf_inspect.getfullargspec(layer.call).args:
         kwargs['training'] = training
 
       if isinstance(layer, Network) and layer._compute_output_and_mask_jointly:
@@ -332,6 +332,7 @@ class Sequential(Model):
     else:
       name = None
       build_input_shape = None
+      layer_configs = config
     model = cls(name=name)
     for layer_config in layer_configs:
       layer = layer_module.deserialize(layer_config,
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 3f8e120df0..28af8d61bc 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -25,22 +25,12 @@ from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import function
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
 
-def _get_small_mlp(num_hidden, num_classes, input_dim=None):
-  model = keras.models.Sequential()
-  if input_dim:
-    model.add(keras.layers.Dense(num_hidden, activation='relu',
-                                 input_dim=input_dim))
-  else:
-    model.add(keras.layers.Dense(num_hidden, activation='relu'))
-  model.add(keras.layers.Dense(num_classes, activation='softmax'))
-  return model
-
-
 class TestSequential(test.TestCase, parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
   """
@@ -63,7 +53,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     batch_size = 5
     num_classes = 2
 
-    model = _get_small_mlp(num_hidden, num_classes, input_dim)
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden, num_classes, input_dim)
     model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
     x = np.random.random((batch_size, input_dim))
     y = np.random.random((batch_size, num_classes))
@@ -94,7 +85,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     batch_size = 5
     num_classes = 2
 
-    model = _get_small_mlp(num_hidden, num_classes)
+    model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
@@ -118,7 +109,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     num_samples = 50
     steps_per_epoch = 10
 
-    model = _get_small_mlp(num_hidden, num_classes)
+    model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
@@ -145,9 +136,9 @@ class TestSequential(test.TestCase, parameterized.TestCase):
 
       def get_model():
         if deferred:
-          model = _get_small_mlp(10, 4)
+          model = testing_utils.get_small_sequential_mlp(10, 4)
         else:
-          model = _get_small_mlp(10, 4, input_dim=3)
+          model = testing_utils.get_small_sequential_mlp(10, 4, input_dim=3)
         model.compile(
             optimizer=rmsprop.RMSPropOptimizer(1e-3),
             loss='categorical_crossentropy',
@@ -262,7 +253,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     batch_size = 5
     num_classes = 2
 
-    model = _get_small_mlp(num_hidden, num_classes)
+    model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
@@ -284,21 +275,21 @@ class TestSequential(test.TestCase, parameterized.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_sequential_shape_inference_deferred(self):
-    model = _get_small_mlp(4, 5)
+    model = testing_utils.get_small_sequential_mlp(4, 5)
     output_shape = model.compute_output_shape((None, 7))
     self.assertEqual(tuple(output_shape.as_list()), (None, 5))
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_sequential_build_deferred(self):
-    model = _get_small_mlp(4, 5)
+    model = testing_utils.get_small_sequential_mlp(4, 5)
 
     model.build((None, 10))
     self.assertTrue(model.built)
     self.assertEqual(len(model.weights), 4)
 
     # Test with nested model
-    model = _get_small_mlp(4, 3)
-    inner_model = _get_small_mlp(4, 5)
+    model = testing_utils.get_small_sequential_mlp(4, 3)
+    inner_model = testing_utils.get_small_sequential_mlp(4, 5)
     model.add(inner_model)
 
     model.build((None, 10))
@@ -308,8 +299,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_sequential_nesting(self):
-    model = _get_small_mlp(4, 3)
-    inner_model = _get_small_mlp(4, 5)
+    model = testing_utils.get_small_sequential_mlp(4, 3)
+    inner_model = testing_utils.get_small_sequential_mlp(4, 5)
     model.add(inner_model)
 
     model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
@@ -353,7 +344,7 @@ class TestSequentialEagerIntegration(test.TestCase):
   @tf_test_util.run_in_graph_and_eager_modes
   def test_build_before_fit(self):
     # Fix for b/112433577
-    model = _get_small_mlp(4, 5)
+    model = testing_utils.get_small_sequential_mlp(4, 5)
     model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
 
     model.build((None, 6))
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index f71388cadb..85d25411b4 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -800,18 +800,18 @@ class Model(Network):
       RuntimeError: If the model was never compiled.
     """
     if sample_weight is not None and sample_weight.all():
-      raise NotImplementedError('sample_weight is currently not supported when '
-                                'using DistributionStrategy.')
+      raise NotImplementedError('`sample_weight` is currently not supported '
+                                'when using DistributionStrategy.')
     if class_weight:
-      raise NotImplementedError('class_weight is currently not supported when '
-                                'using DistributionStrategy.')
+      raise NotImplementedError('`class_weight` is currently not supported '
+                                'when using DistributionStrategy.')
 
     # TODO(anjalisridhar): Can we use the iterator and getnext op cache?
     # We require users to pass Datasets since we distribute the dataset across
     # multiple devices.
     if not isinstance(x, dataset_ops.Dataset):
-      raise ValueError('When using DistributionStrategy you must specify a '
-                       'Dataset object instead of a %s.' % type(x))
+      raise ValueError('When using DistributionStrategy, model inputs should be'
+                       ' Dataset instances; found instead %s.' % type(x))
     # TODO(anjalisridhar): We want distribute_dataset() to accept a Dataset or a
     # function which returns a Dataset. Currently distribute_dataset() only
     # accepts a function that returns a Dataset. Once we add support for being
@@ -834,8 +834,9 @@ class Model(Network):
     next_element = iterator.get_next()
 
     if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
-      raise ValueError('Please provide data as a list or tuple of 2 elements '
-                       ' - input and target pair. Received %s' % next_element)
+      raise ValueError('Please provide model inputs as a list or tuple of 2 '
+                       'elements: input and target pair. '
+                       'Received %s' % next_element)
     x, y = next_element
     # Validate that all the elements in x and y are of the same type and shape.
     # We can then pass the first element of x and y to `_standardize_weights`
@@ -971,8 +972,9 @@ class Model(Network):
                            'required number of samples.')
 
       if not isinstance(next_element, (list, tuple)) or len(next_element) != 2:
-        raise ValueError('Please provide data as a list or tuple of 2 elements '
-                         ' - input and target pair. Received %s' % next_element)
+        raise ValueError('Please provide model inputs as a list or tuple of 2 '
+                         'elements: input and target pair. '
+                         'Received %s' % next_element)
       x, y = next_element
     x, y, sample_weights = self._standardize_weights(x, y, sample_weight,
                                                      class_weight, batch_size)
@@ -980,6 +982,10 @@ class Model(Network):
 
   def _standardize_weights(self, x, y, sample_weight=None, class_weight=None,
                            batch_size=None,):
+    if sample_weight is not None and class_weight is not None:
+      logging.warning(
+          'Received both a `sample_weight` and `class_weight` argument. '
+          'The `class_weight` argument will be ignored.')
     # First, we build/compile the model on the fly if necessary.
     all_inputs = []
     is_build_called = False
@@ -1721,6 +1727,13 @@ class Model(Network):
     if batch_size is None and steps is None:
       batch_size = 32
 
+    # Turn off prefetching since this is currently not deterministic. Once
+    # b/112498930 is fixed we can turn it back on.
+    # `_prefetch_on_device` is currently a property of only `MirroredStrategy`.
+    if (self._distribution_strategy and
+        hasattr(self._distribution_strategy, '_prefetch_on_device')):
+      self._distribution_strategy._prefetch_on_device = False  # pylint: disable=protected-access
+
     # Validate and standardize user data.
     x, _, _ = self._standardize_user_data(
         x, check_steps=True, steps_name='steps', steps=steps)
@@ -1729,8 +1742,12 @@ class Model(Network):
       return training_eager.predict_loop(
           self, x, batch_size=batch_size, verbose=verbose, steps=steps)
     elif self._distribution_strategy:
-      return training_distributed.predict_loop(
+      results = training_distributed.predict_loop(
           self, x, verbose=verbose, steps=steps)
+      # Turn prefetching back on since we turned it off previously.
+      if hasattr(self._distribution_strategy, '_prefetch_on_device'):
+        self._distribution_strategy._prefetch_on_device = True  # pylint: disable=protected-access
+      return results
     else:
       return training_arrays.predict_loop(
           self, x, batch_size=batch_size, verbose=verbose, steps=steps)
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 15e7d725de..bf2d231861 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -35,6 +35,9 @@ from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
@@ -49,289 +52,287 @@ class TrainingTest(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_fit_on_arrays(self):
-    with self.test_session():
-      a = keras.layers.Input(shape=(3,), name='input_a')
-      b = keras.layers.Input(shape=(3,), name='input_b')
-
-      dense = keras.layers.Dense(4, name='dense')
-      c = dense(a)
-      d = dense(b)
-      e = keras.layers.Dropout(0.5, name='dropout')(c)
-
-      model = keras.models.Model([a, b], [d, e])
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      loss_weights = [1., 0.5]
-      model.compile(
-          optimizer,
-          loss,
-          metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
-          loss_weights=loss_weights)
-
-      input_a_np = np.random.random((10, 3))
-      input_b_np = np.random.random((10, 3))
-
-      output_d_np = np.random.random((10, 4))
-      output_e_np = np.random.random((10, 4))
-
-      # Test fit at different verbosity
-      model.fit(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          epochs=1,
-          batch_size=5,
-          verbose=0)
-      model.fit(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          epochs=1,
-          batch_size=5,
-          verbose=1)
-      model.fit(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          epochs=2,
-          batch_size=5,
-          verbose=2)
-      model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
-
-      # Test model with input data as a list of lists
-      model.fit(
-          [np.ndarray.tolist(input_a_np), np.ndarray.tolist(input_b_np)],
-          [output_d_np, output_e_np],
-          epochs=2,
-          batch_size=5,
-          verbose=2)
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
 
-      # Test with validation data
-      model.fit(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          validation_data=([input_a_np, input_b_np], [output_d_np,
-                                                      output_e_np]),
-          epochs=1,
-          batch_size=5,
-          verbose=0)
-      model.fit(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          validation_data=([input_a_np, input_b_np], [output_d_np,
-                                                      output_e_np]),
-          epochs=2,
-          batch_size=5,
-          verbose=1)
-      model.fit(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          validation_data=([input_a_np, input_b_np], [output_d_np,
-                                                      output_e_np]),
-          epochs=2,
-          batch_size=5,
-          verbose=2)
-      # Test with validation split
-      model.fit(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          epochs=2,
-          batch_size=5,
-          verbose=0,
-          validation_split=0.2)
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
 
-      # Test with dictionary inputs
-      model.fit(
-          {
-              'input_a': input_a_np,
-              'input_b': input_b_np
-          }, {
-              'dense': output_d_np,
-              'dropout': output_e_np
-          },
-          epochs=1,
-          batch_size=5,
-          verbose=0)
-      model.fit(
-          {
-              'input_a': input_a_np,
-              'input_b': input_b_np
-          }, {
-              'dense': output_d_np,
-              'dropout': output_e_np
-          },
-          epochs=1,
-          batch_size=5,
-          verbose=1)
-      model.fit(
-          {
-              'input_a': input_a_np,
-              'input_b': input_b_np
-          }, {
-              'dense': output_d_np,
-              'dropout': output_e_np
-          },
-          validation_data=({
-              'input_a': input_a_np,
-              'input_b': input_b_np
-          }, {
-              'dense': output_d_np,
-              'dropout': output_e_np
-          }),
-          epochs=1,
-          batch_size=5,
-          verbose=0)
-      model.train_on_batch({
-          'input_a': input_a_np,
-          'input_b': input_b_np
-      }, {
-          'dense': output_d_np,
-          'dropout': output_e_np
-      })
-
-      # Test with lists for loss, metrics
-      loss = ['mae', 'mse']
-      model.compile(
-          optimizer,
-          loss,
-          metrics=[metrics_module.CategoricalAccuracy(), 'mae'])
-      model.fit(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          epochs=1,
-          batch_size=5,
-          verbose=0)
+    model = keras.models.Model([a, b], [d, e])
 
-      # Test with dictionaries for loss, metrics, loss weights
-      loss = {'dense': 'mse', 'dropout': 'mae'}
-      loss_weights = {'dense': 1., 'dropout': 0.5}
-      metrics = {
-          'dense': 'mse',
-          'dropout': metrics_module.CategoricalAccuracy()
-      }
-      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    model.compile(
+        optimizer,
+        loss,
+        metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
+        loss_weights=loss_weights)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    # Test fit at different verbosity
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+    model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
+
+    # Test model with input data as a list of lists
+    model.fit(
+        [np.ndarray.tolist(input_a_np), np.ndarray.tolist(input_b_np)],
+        [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+
+    # Test with validation data
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=2,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+    # Test with validation split
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=0,
+        validation_split=0.2)
+
+    # Test with dictionary inputs
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {
+            'dense': output_d_np,
+            'dropout': output_e_np
+        },
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {
+            'dense': output_d_np,
+            'dropout': output_e_np
+        },
+        epochs=1,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {
+            'dense': output_d_np,
+            'dropout': output_e_np
+        },
+        validation_data=({
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {
+            'dense': output_d_np,
+            'dropout': output_e_np
+        }),
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.train_on_batch({
+        'input_a': input_a_np,
+        'input_b': input_b_np
+    }, {
+        'dense': output_d_np,
+        'dropout': output_e_np
+    })
+
+    # Test with lists for loss, metrics
+    loss = ['mae', 'mse']
+    model.compile(
+        optimizer,
+        loss,
+        metrics=[metrics_module.CategoricalAccuracy(), 'mae'])
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+
+    # Test with dictionaries for loss, metrics, loss weights
+    loss = {'dense': 'mse', 'dropout': 'mae'}
+    loss_weights = {'dense': 1., 'dropout': 0.5}
+    metrics = {
+        'dense': 'mse',
+        'dropout': metrics_module.CategoricalAccuracy()
+    }
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+
+    # Invalid use cases
+    with self.assertRaises(ValueError):
+      model.train_on_batch({'input_a': input_a_np},
+                           [output_d_np, output_e_np])
+    with self.assertRaises(AttributeError):
       model.fit(
           [input_a_np, input_b_np], [output_d_np, output_e_np],
           epochs=1,
-          batch_size=5,
+          validation_data=([input_a_np, input_b_np], 0, 0),
           verbose=0)
+    with self.assertRaises(ValueError):
+      model.train_on_batch([input_a_np], [output_d_np, output_e_np])
+    with self.assertRaises(AttributeError):
+      model.train_on_batch(1, [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      model.train_on_batch(input_a_np, [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      bad_input = np.random.random((11, 3))
+      model.train_on_batch([bad_input, input_b_np],
+                           [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      bad_target = np.random.random((11, 4))
+      model.train_on_batch([input_a_np, input_b_np],
+                           [bad_target, output_e_np])
+
+    # Build single-input model
+    x = keras.layers.Input(shape=(3,), name='input_a')
+    y = keras.layers.Dense(4)(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer, loss='mse')
+    # This will work
+    model.fit([input_a_np], output_d_np, epochs=1)
+    with self.assertRaises(ValueError):
+      model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
 
-      # Invalid use cases
-      with self.assertRaises(ValueError):
-        model.train_on_batch({'input_a': input_a_np},
-                             [output_d_np, output_e_np])
-      with self.assertRaises(AttributeError):
-        model.fit(
-            [input_a_np, input_b_np], [output_d_np, output_e_np],
-            epochs=1,
-            validation_data=([input_a_np, input_b_np], 0, 0),
-            verbose=0)
-      with self.assertRaises(ValueError):
-        model.train_on_batch([input_a_np], [output_d_np, output_e_np])
-      with self.assertRaises(AttributeError):
-        model.train_on_batch(1, [output_d_np, output_e_np])
-      with self.assertRaises(ValueError):
-        model.train_on_batch(input_a_np, [output_d_np, output_e_np])
-      with self.assertRaises(ValueError):
-        bad_input = np.random.random((11, 3))
-        model.train_on_batch([bad_input, input_b_np],
-                             [output_d_np, output_e_np])
-      with self.assertRaises(ValueError):
-        bad_target = np.random.random((11, 4))
-        model.train_on_batch([input_a_np, input_b_np],
-                             [bad_target, output_e_np])
-
-      # Build single-input model
-      x = keras.layers.Input(shape=(3,), name='input_a')
-      y = keras.layers.Dense(4)(x)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer, loss='mse')
-      # This will work
-      model.fit([input_a_np], output_d_np, epochs=1)
-      with self.assertRaises(ValueError):
-        model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
-
-      # Test model on a list of floats
-      input_a_np = np.random.random((10, 3))
-      input_b_np = np.random.random((10, 4))
+    # Test model on a list of floats
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 4))
 
-      model.fit([np.ndarray.tolist(input_a_np)],
-                [np.ndarray.tolist(input_b_np)],
-                epochs=2,
-                batch_size=5,
-                verbose=2)
+    model.fit([np.ndarray.tolist(input_a_np)],
+              [np.ndarray.tolist(input_b_np)],
+              epochs=2,
+              batch_size=5,
+              verbose=2)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_evaluate_predict_on_arrays(self):
-    with self.test_session():
-      a = keras.layers.Input(shape=(3,), name='input_a')
-      b = keras.layers.Input(shape=(3,), name='input_b')
-
-      dense = keras.layers.Dense(4, name='dense')
-      c = dense(a)
-      d = dense(b)
-      e = keras.layers.Dropout(0.5, name='dropout')(c)
-
-      model = keras.models.Model([a, b], [d, e])
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      loss_weights = [1., 0.5]
-      model.compile(
-          optimizer,
-          loss,
-          metrics=['mae', metrics_module.CategoricalAccuracy()],
-          loss_weights=loss_weights,
-          sample_weight_mode=None)
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
 
-      input_a_np = np.random.random((10, 3))
-      input_b_np = np.random.random((10, 3))
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
 
-      output_d_np = np.random.random((10, 4))
-      output_e_np = np.random.random((10, 4))
+    model = keras.models.Model([a, b], [d, e])
 
-      # Test evaluate at different verbosity
-      out = model.evaluate(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          batch_size=5,
-          verbose=0)
-      self.assertEqual(len(out), 7)
-      out = model.evaluate(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          batch_size=5,
-          verbose=1)
-      self.assertEqual(len(out), 7)
-      out = model.evaluate(
-          [input_a_np, input_b_np], [output_d_np, output_e_np],
-          batch_size=5,
-          verbose=2)
-      self.assertEqual(len(out), 7)
-      out = model.test_on_batch([input_a_np, input_b_np],
-                                [output_d_np, output_e_np])
-      self.assertEqual(len(out), 7)
-
-      # Test evaluate with dictionary inputs
-      model.evaluate(
-          {
-              'input_a': input_a_np,
-              'input_b': input_b_np
-          }, {
-              'dense': output_d_np,
-              'dropout': output_e_np
-          },
-          batch_size=5,
-          verbose=0)
-      model.evaluate(
-          {
-              'input_a': input_a_np,
-              'input_b': input_b_np
-          }, {
-              'dense': output_d_np,
-              'dropout': output_e_np
-          },
-          batch_size=5,
-          verbose=1)
-
-      # Test predict
-      out = model.predict([input_a_np, input_b_np], batch_size=5)
-      self.assertEqual(len(out), 2)
-      out = model.predict({'input_a': input_a_np, 'input_b': input_b_np})
-      self.assertEqual(len(out), 2)
-      out = model.predict_on_batch({
-          'input_a': input_a_np,
-          'input_b': input_b_np
-      })
-      self.assertEqual(len(out), 2)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    model.compile(
+        optimizer,
+        loss,
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        loss_weights=loss_weights,
+        sample_weight_mode=None)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    # Test evaluate at different verbosity
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=0)
+    self.assertEqual(len(out), 7)
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=1)
+    self.assertEqual(len(out), 7)
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=2)
+    self.assertEqual(len(out), 7)
+    out = model.test_on_batch([input_a_np, input_b_np],
+                              [output_d_np, output_e_np])
+    self.assertEqual(len(out), 7)
+
+    # Test evaluate with dictionary inputs
+    model.evaluate(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {
+            'dense': output_d_np,
+            'dropout': output_e_np
+        },
+        batch_size=5,
+        verbose=0)
+    model.evaluate(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {
+            'dense': output_d_np,
+            'dropout': output_e_np
+        },
+        batch_size=5,
+        verbose=1)
+
+    # Test predict
+    out = model.predict([input_a_np, input_b_np], batch_size=5)
+    self.assertEqual(len(out), 2)
+    out = model.predict({'input_a': input_a_np, 'input_b': input_b_np})
+    self.assertEqual(len(out), 2)
+    out = model.predict_on_batch({
+        'input_a': input_a_np,
+        'input_b': input_b_np
+    })
+    self.assertEqual(len(out), 2)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_invalid_loss(self):
@@ -340,31 +341,27 @@ class TrainingTest(test.TestCase):
     test_samples = 1000
     input_dim = 5
 
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
-      model.add(keras.layers.Activation('relu'))
-      model.add(keras.layers.Dense(num_classes))
-      model.add(keras.layers.Activation('softmax'))
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      model.compile(optimizer, loss='categorical_crossentropy')
-      np.random.seed(1337)
-      (x_train, y_train), (_, _) = testing_utils.get_test_data(
-          train_samples=train_samples,
-          test_samples=test_samples,
-          input_shape=(input_dim,),
-          num_classes=num_classes)
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=num_classes, input_dim=input_dim)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(optimizer, loss='categorical_crossentropy')
+    np.random.seed(1337)
+    (x_train, y_train), (_, _) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
 
-      with self.assertRaises(ValueError):
-        model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
+    with self.assertRaises(ValueError):
+      model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
 
-      if not context.executing_eagerly():
-        # TODO(psv): Investigate these use cases in eager mode.
-        with self.assertRaises(ValueError):
-          model.fit(x_train, y_train)
+    if not context.executing_eagerly():
+      # TODO(psv): Investigate these use cases in eager mode.
+      with self.assertRaises(ValueError):
+        model.fit(x_train, y_train)
 
-        with self.assertRaises(ValueError):
-          model.compile(optimizer, loss=None)
+      with self.assertRaises(ValueError):
+        model.compile(optimizer, loss=None)
 
   def test_training_on_sparse_data_with_dense_placeholders(self):
     if scipy_sparse is None:
@@ -392,6 +389,18 @@ class TrainingTest(test.TestCase):
                 epochs=1, batch_size=2, validation_split=0.5)
       model.evaluate(test_inputs, test_outputs, batch_size=2)
 
+  def test_compile_with_sparse_placeholders(self):
+    with self.test_session():
+      input_layer = keras.layers.Input(shape=(10,), sparse=True)
+      weights = variable_scope.get_variable(name='weights', shape=(10, 1))
+      weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights)
+      output_layer = keras.layers.Lambda(weights_mult)(input_layer)
+      model = keras.Model([input_layer], output_layer)
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer=keras.optimizers.Adam(lr=0.0001),
+          metrics=['accuracy'])
+
   def test_that_trainable_disables_updates(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -468,67 +477,63 @@ class LossWeightingTest(test.TestCase):
     input_dim = 5
     learning_rate = 0.001
 
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
-      model.add(keras.layers.Activation('relu'))
-      model.add(keras.layers.Dense(num_classes))
-      model.add(keras.layers.Activation('softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          metrics=['acc'],
-          weighted_metrics=['mae'],
-          optimizer=RMSPropOptimizer(learning_rate=learning_rate))
-
-      np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=train_samples,
-          test_samples=test_samples,
-          input_shape=(input_dim,),
-          num_classes=num_classes)
-      int_y_test = y_test.copy()
-      int_y_train = y_train.copy()
-      # convert class vectors to binary class matrices
-      y_train = keras.utils.to_categorical(y_train, num_classes)
-      y_test = keras.utils.to_categorical(y_test, num_classes)
-      test_ids = np.where(int_y_test == np.array(weighted_class))[0]
-
-      class_weight = dict([(i, 1.) for i in range(num_classes)])
-      class_weight[weighted_class] = 2.
-
-      sample_weight = np.ones((y_train.shape[0]))
-      sample_weight[int_y_train == weighted_class] = 2.
-
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=batch_size,
-          epochs=epochs // 3,
-          verbose=0,
-          class_weight=class_weight,
-          validation_data=(x_train, y_train, sample_weight))
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=batch_size,
-          epochs=epochs // 2,
-          verbose=0,
-          class_weight=class_weight)
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=batch_size,
-          epochs=epochs // 2,
-          verbose=0,
-          class_weight=class_weight,
-          validation_split=0.1)
-
-      model.train_on_batch(
-          x_train[:batch_size], y_train[:batch_size], class_weight=class_weight)
-      ref_score = model.evaluate(x_test, y_test, verbose=0)
-      score = model.evaluate(
-          x_test[test_ids, :], y_test[test_ids, :], verbose=0)
-      self.assertLess(score[0], ref_score[0])
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=num_classes, input_dim=input_dim)
+    model.compile(
+        loss='categorical_crossentropy',
+        metrics=['acc'],
+        weighted_metrics=['mae'],
+        optimizer=RMSPropOptimizer(learning_rate=learning_rate))
+
+    np.random.seed(1337)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_test = y_test.copy()
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        class_weight=class_weight,
+        validation_data=(x_train, y_train, sample_weight))
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 2,
+        verbose=0,
+        class_weight=class_weight)
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 2,
+        verbose=0,
+        class_weight=class_weight,
+        validation_split=0.1)
+
+    model.train_on_batch(
+        x_train[:batch_size], y_train[:batch_size], class_weight=class_weight)
+    ref_score = model.evaluate(x_test, y_test, verbose=0)
+    score = model.evaluate(
+        x_test[test_ids, :], y_test[test_ids, :], verbose=0)
+    self.assertLess(score[0], ref_score[0])
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_sample_weights(self):
@@ -541,63 +546,82 @@ class LossWeightingTest(test.TestCase):
     input_dim = 5
     learning_rate = 0.001
 
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
-      model.add(keras.layers.Activation('relu'))
-      model.add(keras.layers.Dense(num_classes))
-      model.add(keras.layers.Activation('softmax'))
-      model.compile(
-          RMSPropOptimizer(learning_rate=learning_rate),
-          metrics=['acc'],
-          weighted_metrics=['mae'],
-          loss='categorical_crossentropy')
-
-      np.random.seed(43)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=train_samples,
-          test_samples=test_samples,
-          input_shape=(input_dim,),
-          num_classes=num_classes)
-      int_y_test = y_test.copy()
-      int_y_train = y_train.copy()
-      # convert class vectors to binary class matrices
-      y_train = keras.utils.to_categorical(y_train, num_classes)
-      y_test = keras.utils.to_categorical(y_test, num_classes)
-      test_ids = np.where(int_y_test == np.array(weighted_class))[0]
-
-      sample_weight = np.ones((y_train.shape[0]))
-      sample_weight[int_y_train == weighted_class] = 2.
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=num_classes, input_dim=input_dim)
+    model.compile(
+        RMSPropOptimizer(learning_rate=learning_rate),
+        metrics=['acc'],
+        weighted_metrics=['mae'],
+        loss='categorical_crossentropy')
+
+    np.random.seed(43)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_test = y_test.copy()
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        sample_weight=sample_weight)
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        sample_weight=sample_weight,
+        validation_split=0.1)
+
+    model.train_on_batch(
+        x_train[:batch_size],
+        y_train[:batch_size],
+        sample_weight=sample_weight[:batch_size])
+    model.test_on_batch(
+        x_train[:batch_size],
+        y_train[:batch_size],
+        sample_weight=sample_weight[:batch_size])
+    ref_score = model.evaluate(x_test, y_test, verbose=0)
+    if not context.executing_eagerly():
+      score = model.evaluate(
+          x_test[test_ids, :], y_test[test_ids, :], verbose=0)
+      self.assertLess(score[0], ref_score[0])
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_warning_for_concurrent_sample_and_class_weights(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(3,)))
+    model.compile(
+        loss='mse',
+        optimizer=RMSPropOptimizer(learning_rate=0.01))
+    x_train = np.random.random((10, 3))
+    y_train = np.random.random((10, 10))
+    sample_weight = np.ones((y_train.shape[0]))
+    class_weight = {0: 1., 1: 1.}
+
+    with test.mock.patch.object(logging, 'warning') as mock_log:
       model.fit(
           x_train,
           y_train,
-          batch_size=batch_size,
-          epochs=epochs // 3,
-          verbose=0,
-          sample_weight=sample_weight)
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=batch_size,
-          epochs=epochs // 3,
+          epochs=1,
           verbose=0,
           sample_weight=sample_weight,
-          validation_split=0.1)
-
-      model.train_on_batch(
-          x_train[:batch_size],
-          y_train[:batch_size],
-          sample_weight=sample_weight[:batch_size])
-      model.test_on_batch(
-          x_train[:batch_size],
-          y_train[:batch_size],
-          sample_weight=sample_weight[:batch_size])
-      ref_score = model.evaluate(x_test, y_test, verbose=0)
-      if not context.executing_eagerly():
-        score = model.evaluate(
-            x_test[test_ids, :], y_test[test_ids, :], verbose=0)
-        self.assertLess(score[0], ref_score[0])
+          class_weight=class_weight)
+      msg = ('The `class_weight` argument will be ignored.')
+      self.assertRegexpMatches(str(mock_log.call_args), msg)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_temporal_sample_weights(self):
@@ -1465,9 +1489,10 @@ class TestTrainingWithDataTensors(test.TestCase):
       output_a_np = np.random.random((10, 4))
       output_b_np = np.random.random((10, 3))
 
-      a = keras.Input(
-          tensor=keras.backend.variables_module.Variable(input_a_np,
-                                                         dtype='float32'))
+      input_v = keras.backend.variables_module.Variable(
+          input_a_np, dtype='float32')
+      self.evaluate(variables_lib.variables_initializer([input_v]))
+      a = keras.Input(tensor=input_v)
       b = keras.Input(shape=(3,), name='input_b')
 
       a_2 = keras.layers.Dense(4, name='dense_1')(a)
@@ -1512,9 +1537,8 @@ class TestTrainingWithDataTensors(test.TestCase):
 
       # Now test a model with a single input
       # i.e. we don't pass any data to fit the model.
-      a = keras.Input(
-          tensor=keras.backend.variables_module.Variable(input_a_np,
-                                                         dtype='float32'))
+      self.evaluate(variables_lib.variables_initializer([input_v]))
+      a = keras.Input(tensor=input_v)
       a_2 = keras.layers.Dense(4, name='dense_1')(a)
       a_2 = keras.layers.Dropout(0.5, name='dropout')(a_2)
       model = keras.models.Model(a, a_2)
@@ -1552,9 +1576,8 @@ class TestTrainingWithDataTensors(test.TestCase):
 
       # Same, without learning phase
       # i.e. we don't pass any data to fit the model.
-      a = keras.Input(
-          tensor=keras.backend.variables_module.Variable(input_a_np,
-                                                         dtype='float32'))
+      self.evaluate(variables_lib.variables_initializer([input_v]))
+      a = keras.Input(tensor=input_v)
       a_2 = keras.layers.Dense(4, name='dense_1')(a)
       model = keras.models.Model(a, a_2)
       model.summary()
@@ -1677,9 +1700,10 @@ class TestTrainingWithDataTensors(test.TestCase):
       out = model.evaluate(input_a_np, None)
 
       # Test model with no external data at all.
-      a = keras.Input(
-          tensor=keras.backend.variables_module.Variable(input_a_np,
-                                                         dtype='float32'))
+      input_v = keras.backend.variables_module.Variable(
+          input_a_np, dtype='float32')
+      self.evaluate(variables_lib.variables_initializer([input_v]))
+      a = keras.Input(tensor=input_v)
       a_2 = keras.layers.Dense(4, name='dense_1')(a)
       a_2 = keras.layers.Dropout(0.5, name='dropout')(a_2)
       model = keras.models.Model(a, a_2)
@@ -1720,9 +1744,8 @@ class TestTrainingWithDataTensors(test.TestCase):
       self.assertEqual(out.shape, (10 * 3, 4))
 
       # Test multi-output model with no external data at all.
-      a = keras.Input(
-          tensor=keras.backend.variables_module.Variable(input_a_np,
-                                                         dtype='float32'))
+      self.evaluate(variables_lib.variables_initializer([input_v]))
+      a = keras.Input(tensor=input_v)
       a_1 = keras.layers.Dense(4, name='dense_1')(a)
       a_2 = keras.layers.Dropout(0.5, name='dropout')(a_1)
       model = keras.models.Model(a, [a_1, a_2])
@@ -1886,223 +1909,198 @@ class TestTrainingWithDatasetIterators(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_training_and_eval_methods_on_iterators_single_io(self):
-    with self.test_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      metrics = ['mae', metrics_module.CategoricalAccuracy()]
-      model.compile(optimizer, loss, metrics=metrics)
-
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-      iterator = dataset.make_one_shot_iterator()
-
-      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-      model.evaluate(iterator, steps=2, verbose=1)
-      model.predict(iterator, steps=2)
-      model.train_on_batch(iterator)
-      model.test_on_batch(iterator)
-      model.predict_on_batch(iterator)
-
-      # Test with validation data
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(iterator, steps=2, verbose=1)
+    model.predict(iterator, steps=2)
+    model.train_on_batch(iterator)
+    model.test_on_batch(iterator)
+    model.predict_on_batch(iterator)
+
+    # Test with validation data
+    model.fit(iterator,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=iterator, validation_steps=2)
+    # Test with validation split
+    with self.assertRaisesRegexp(
+        ValueError, '`validation_split` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
       model.fit(iterator,
                 epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=iterator, validation_steps=2)
-      # Test with validation split
-      with self.assertRaisesRegexp(
-          ValueError, '`validation_split` argument is not supported '
-          'when input `x` is a dataset or a dataset iterator'):
-        model.fit(iterator,
-                  epochs=1, steps_per_epoch=2, verbose=0,
-                  validation_split=0.5, validation_steps=2)
-
-      # Test with sample weight.
-      sample_weight = np.random.random((10,))
-      with self.assertRaisesRegexp(
-          ValueError, '`sample_weight` argument is not supported '
-          'when input `x` is a dataset or a dataset iterator'):
-        model.fit(
-            iterator,
-            epochs=1,
-            steps_per_epoch=2,
-            verbose=0,
-            sample_weight=sample_weight)
+                validation_split=0.5, validation_steps=2)
 
-      # Test invalid usage
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should not specify a target'):
-        model.fit(iterator, iterator,
-                  epochs=1, steps_per_epoch=2, verbose=0)
+    # Test with sample weight.
+    sample_weight = np.random.random((10,))
+    with self.assertRaisesRegexp(
+        ValueError, '`sample_weight` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(
+          iterator,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          sample_weight=sample_weight)
 
-      with self.assertRaisesRegexp(
-          ValueError, 'you should specify the `steps_per_epoch` argument'):
-        model.fit(iterator, epochs=1, verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
-        model.evaluate(iterator, verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
-        model.predict(iterator, verbose=0)
+    # Test invalid usage
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should not specify a target'):
+      model.fit(iterator, iterator,
+                epochs=1, steps_per_epoch=2, verbose=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'you should specify the `steps_per_epoch` argument'):
+      model.fit(iterator, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.evaluate(iterator, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.predict(iterator, verbose=0)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_get_next_op_created_once(self):
-    with self.test_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics)
-
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-      iterator = dataset.make_one_shot_iterator()
-
-      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-      # Finalize graph to make sure we are not appending another iterator
-      # get_next op in the graph.
-      ops.get_default_graph().finalize()
-      model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    # Finalize graph to make sure we are not appending another iterator
+    # get_next op in the graph.
+    ops.get_default_graph().finalize()
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_iterators_running_out_of_data(self):
-    with self.test_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics)
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(2)
-      dataset = dataset.batch(10)
-      iterator = dataset.make_one_shot_iterator()
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(2)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
 
-      with test.mock.patch.object(logging, 'warning') as mock_log:
-        model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
-        self.assertRegexpMatches(
-            str(mock_log.call_args),
-            'dataset iterator ran out of data')
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          'dataset iterator ran out of data')
 
 
 class TestTrainingWithDataset(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_calling_model_on_same_dataset(self):
-    with self.test_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics)
-
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-
-      # Call fit with validation data
-      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=dataset, validation_steps=2)
-      # Finalize the graph to make sure new ops aren't added when calling on the
-      # same dataset
-      ops.get_default_graph().finalize()
-      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=dataset, validation_steps=2)
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    # Call fit with validation data
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+    # Finalize the graph to make sure new ops aren't added when calling on the
+    # same dataset
+    ops.get_default_graph().finalize()
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_training_and_eval_methods_on_dataset(self):
-    with self.test_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      metrics = ['mae', metrics_module.CategoricalAccuracy()]
-      model.compile(optimizer, loss, metrics=metrics)
-
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-
-      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-      model.evaluate(dataset, steps=2, verbose=1)
-      model.predict(dataset, steps=2)
-      model.train_on_batch(dataset)
-      model.predict_on_batch(dataset)
-
-      # Test with validation data
-      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=dataset, validation_steps=2)
-
-      # Test with validation split
-      with self.assertRaisesRegexp(
-          ValueError, '`validation_split` argument is not supported '
-          'when input `x` is a dataset or a dataset iterator'):
-        model.fit(dataset,
-                  epochs=1, steps_per_epoch=2, verbose=0,
-                  validation_split=0.5, validation_steps=2)
-
-      # Test with sample weight.
-      sample_weight = np.random.random((10,))
-      with self.assertRaisesRegexp(
-          ValueError, '`sample_weight` argument is not supported '
-          'when input `x` is a dataset or a dataset iterator'):
-        model.fit(
-            dataset,
-            epochs=1,
-            steps_per_epoch=2,
-            verbose=0,
-            sample_weight=sample_weight)
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = np.zeros((10, 3))
+    targets = np.zeros((10, 4))
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+    model.train_on_batch(dataset)
+    model.predict_on_batch(dataset)
+
+    # Test with validation data
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+
+    # Test with validation split
+    with self.assertRaisesRegexp(
+        ValueError, '`validation_split` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(dataset,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_split=0.5, validation_steps=2)
 
-      # Test invalid usage
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should not specify a target'):
-        model.fit(dataset, dataset,
-                  epochs=1, steps_per_epoch=2, verbose=0)
+    # Test with sample weight.
+    sample_weight = np.random.random((10,))
+    with self.assertRaisesRegexp(
+        ValueError, '`sample_weight` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(
+          dataset,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          sample_weight=sample_weight)
 
-      with self.assertRaisesRegexp(
-          ValueError, 'you should specify the `steps_per_epoch` argument'):
-        model.fit(dataset, epochs=1, verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
-        model.evaluate(dataset, verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
-        model.predict(dataset, verbose=0)
+    # Test invalid usage
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should not specify a target'):
+      model.fit(dataset, dataset,
+                epochs=1, steps_per_epoch=2, verbose=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'you should specify the `steps_per_epoch` argument'):
+      model.fit(dataset, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.evaluate(dataset, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.predict(dataset, verbose=0)
 
   def test_dataset_input_shape_validation(self):
     with self.test_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(optimizer, loss)
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
 
       # User forgets to batch the dataset
       inputs = np.zeros((10, 3))
@@ -2111,7 +2109,7 @@ class TestTrainingWithDataset(test.TestCase):
       dataset = dataset.repeat(100)
 
       with self.assertRaisesRegexp(ValueError,
-                                   'expected input to have 2 dimensions'):
+                                   r'expected (.*?) to have 2 dimensions'):
         model.train_on_batch(dataset)
 
       # Wrong input shape
@@ -2122,7 +2120,7 @@ class TestTrainingWithDataset(test.TestCase):
       dataset = dataset.batch(10)
 
       with self.assertRaisesRegexp(ValueError,
-                                   'expected input to have shape'):
+                                   r'expected (.*?) to have shape \(3,\)'):
         model.train_on_batch(dataset)
 
 
@@ -2153,134 +2151,127 @@ class TestTrainingWithMetrics(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_correctness(self):
-    with self.test_session():
-      model = keras.Sequential()
-      model.add(
-          keras.layers.Dense(
-              3, activation='relu', input_dim=4, kernel_initializer='ones'))
-      model.add(
-          keras.layers.Dense(
-              1, activation='sigmoid', kernel_initializer='ones'))
-      model.compile(
-          loss='mae',
-          metrics=['accuracy', metrics_module.BinaryAccuracy()],
-          optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-      # verify correctness of stateful and stateless metrics.
-      x = np.ones((100, 4))
-      y = np.ones((100, 1))
-      outs = model.evaluate(x, y)
-      self.assertEqual(outs[1], 1.)
-      self.assertEqual(outs[2], 1.)
-
-      y = np.zeros((100, 1))
-      outs = model.evaluate(x, y)
-      self.assertEqual(outs[1], 0.)
-      self.assertEqual(outs[2], 0.)
+    model = keras.Sequential()
+    model.add(
+        keras.layers.Dense(
+            3, activation='relu', input_dim=4, kernel_initializer='ones'))
+    model.add(
+        keras.layers.Dense(
+            1, activation='sigmoid', kernel_initializer='ones'))
+    model.compile(
+        loss='mae',
+        metrics=['accuracy', metrics_module.BinaryAccuracy()],
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    # verify correctness of stateful and stateless metrics.
+    x = np.ones((100, 4))
+    y = np.ones((100, 1))
+    outs = model.evaluate(x, y)
+    self.assertEqual(outs[1], 1.)
+    self.assertEqual(outs[2], 1.)
+
+    y = np.zeros((100, 1))
+    outs = model.evaluate(x, y)
+    self.assertEqual(outs[1], 0.)
+    self.assertEqual(outs[2], 0.)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_correctness_with_iterator(self):
-    with self.test_session():
-      model = keras.Sequential()
-      model.add(
-          keras.layers.Dense(
-              8, activation='relu', input_dim=4, kernel_initializer='ones'))
-      model.add(
-          keras.layers.Dense(
-              1, activation='sigmoid', kernel_initializer='ones'))
-      model.compile(
-          loss='binary_crossentropy',
-          metrics=['accuracy', metrics_module.BinaryAccuracy()],
-          optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-      np.random.seed(123)
-      x = np.random.randint(10, size=(100, 4)).astype(np.float32)
-      y = np.random.randint(2, size=(100, 1)).astype(np.float32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-      dataset = dataset.batch(10)
-      iterator = dataset.make_one_shot_iterator()
-      outs = model.evaluate(iterator, steps=10)
-      self.assertEqual(np.around(outs[1], decimals=1), 0.5)
-      self.assertEqual(np.around(outs[2], decimals=1), 0.5)
-
-      y = np.zeros((100, 1), dtype=np.float32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-      iterator = dataset.make_one_shot_iterator()
-      outs = model.evaluate(iterator, steps=10)
-      self.assertEqual(outs[1], 0.)
-      self.assertEqual(outs[2], 0.)
+    model = keras.Sequential()
+    model.add(
+        keras.layers.Dense(
+            8, activation='relu', input_dim=4, kernel_initializer='ones'))
+    model.add(
+        keras.layers.Dense(
+            1, activation='sigmoid', kernel_initializer='ones'))
+    model.compile(
+        loss='binary_crossentropy',
+        metrics=['accuracy', metrics_module.BinaryAccuracy()],
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    np.random.seed(123)
+    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
+    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
+    self.assertEqual(np.around(outs[2], decimals=1), 0.5)
+
+    y = np.zeros((100, 1), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(outs[1], 0.)
+    self.assertEqual(outs[2], 0.)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_correctness_with_weighted_metrics(self):
-    with self.test_session():
-      np.random.seed(1337)
-      x = np.array([[[1.], [1.]], [[0.], [0.]]])
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.TimeDistributed(
-              keras.layers.Dense(1, kernel_initializer='ones'),
-              input_shape=(2, 1)))
-      model.compile(
-          RMSPropOptimizer(learning_rate=0.001),
-          loss='mse',
-          sample_weight_mode='temporal',
-          weighted_metrics=['accuracy',
-                            metrics_module.BinaryAccuracy()])
-      y = np.array([[[1.], [1.]], [[1.], [1.]]])
-
-      outs = model.evaluate(x, y)
-      self.assertEqual(outs, [0.5, 0.5, 0.5])
-
-      w = np.array([[0., 0.], [0., 0.]])
-      outs = model.evaluate(x, y, sample_weight=w)
-      self.assertEqual(outs, [0., 0., 0.])
-
-      w = np.array([[3., 4.], [1., 2.]])
-      outs = model.evaluate(x, y, sample_weight=w)
-      self.assertArrayNear(outs, [0.3, 0.7, 0.7], .001)
+    np.random.seed(1337)
+    x = np.array([[[1.], [1.]], [[0.], [0.]]])
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(1, kernel_initializer='ones'),
+            input_shape=(2, 1)))
+    model.compile(
+        RMSPropOptimizer(learning_rate=0.001),
+        loss='mse',
+        sample_weight_mode='temporal',
+        weighted_metrics=['accuracy',
+                          metrics_module.BinaryAccuracy()])
+    y = np.array([[[1.], [1.]], [[1.], [1.]]])
+
+    outs = model.evaluate(x, y)
+    self.assertEqual(outs, [0.5, 0.5, 0.5])
+
+    w = np.array([[0., 0.], [0., 0.]])
+    outs = model.evaluate(x, y, sample_weight=w)
+    self.assertEqual(outs, [0., 0., 0.])
+
+    w = np.array([[3., 4.], [1., 2.]])
+    outs = model.evaluate(x, y, sample_weight=w)
+    self.assertArrayNear(outs, [0.3, 0.7, 0.7], .001)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metric_state_reset_between_fit_and_evaluate(self):
-    with self.test_session():
-      model = keras.Sequential()
-      model.add(keras.layers.Dense(3, activation='relu', input_dim=4))
-      model.add(keras.layers.Dense(1, activation='sigmoid'))
-      acc_obj = metrics_module.BinaryAccuracy()
-      model.compile(
-          loss='mae',
-          metrics=[acc_obj],
-          optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-      x_train = np.random.random((100, 4))
-      y_train = np.random.random((100, 1))
-      model.fit(x_train, y_train, batch_size=5, epochs=2)
-      self.assertEqual(self.evaluate(acc_obj.count), 100)
-
-      x_test = np.random.random((10, 4))
-      y_test = np.random.random((10, 1))
-      model.evaluate(x_test, y_test, batch_size=5)
-      self.assertEqual(self.evaluate(acc_obj.count), 10)
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(3, activation='relu', input_dim=4))
+    model.add(keras.layers.Dense(1, activation='sigmoid'))
+    acc_obj = metrics_module.BinaryAccuracy()
+    model.compile(
+        loss='mae',
+        metrics=[acc_obj],
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    x_train = np.random.random((100, 4))
+    y_train = np.random.random((100, 1))
+    model.fit(x_train, y_train, batch_size=5, epochs=2)
+    self.assertEqual(self.evaluate(acc_obj.count), 100)
+
+    x_test = np.random.random((10, 4))
+    y_test = np.random.random((10, 1))
+    model.evaluate(x_test, y_test, batch_size=5)
+    self.assertEqual(self.evaluate(acc_obj.count), 10)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_invalid_metrics(self):
     num_classes = 5
     input_dim = 5
 
-    with self.test_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(10, activation='relu', input_shape=(input_dim,)))
-      model.add(keras.layers.Dense(num_classes, activation='softmax'))
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=num_classes, input_dim=input_dim)
 
-      with self.assertRaisesRegexp(
-          TypeError, 'Type of `metrics` argument not understood. '
-          'Expected a list or dictionary, found: '):
-        model.compile(
-            RMSPropOptimizer(learning_rate=0.001),
-            loss='categorical_crossentropy',
-            metrics=metrics_module.CategoricalAccuracy())
+    with self.assertRaisesRegexp(
+        TypeError, 'Type of `metrics` argument not understood. '
+        'Expected a list or dictionary, found: '):
+      model.compile(
+          RMSPropOptimizer(learning_rate=0.001),
+          loss='categorical_crossentropy',
+          metrics=metrics_module.CategoricalAccuracy())
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_metrics_masking(self):
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index 51725e03f2..8ddc9a17bf 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -40,7 +40,7 @@ class KerasInitializersTest(test.TestCase):
 
   def test_uniform(self):
     tensor_shape = (9, 6, 7)
-    with self.test_session():
+    with self.cached_session():
       self._runner(keras.initializers.RandomUniform(minval=-1,
                                                     maxval=1,
                                                     seed=124),
@@ -49,14 +49,14 @@ class KerasInitializersTest(test.TestCase):
 
   def test_normal(self):
     tensor_shape = (8, 12, 99)
-    with self.test_session():
+    with self.cached_session():
       self._runner(keras.initializers.RandomNormal(mean=0, stddev=1, seed=153),
                    tensor_shape,
                    target_mean=0., target_std=1)
 
   def test_truncated_normal(self):
     tensor_shape = (12, 99, 7)
-    with self.test_session():
+    with self.cached_session():
       self._runner(keras.initializers.TruncatedNormal(mean=0,
                                                       stddev=1,
                                                       seed=126),
@@ -65,13 +65,13 @@ class KerasInitializersTest(test.TestCase):
 
   def test_constant(self):
     tensor_shape = (5, 6, 4)
-    with self.test_session():
+    with self.cached_session():
       self._runner(keras.initializers.Constant(2), tensor_shape,
                    target_mean=2, target_max=2, target_min=2)
 
   def test_lecun_uniform(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(1. / fan_in)
       self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
@@ -79,7 +79,7 @@ class KerasInitializersTest(test.TestCase):
 
   def test_glorot_uniform(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / (fan_in + fan_out))
       self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
@@ -87,7 +87,7 @@ class KerasInitializersTest(test.TestCase):
 
   def test_he_uniform(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / fan_in)
       self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
@@ -95,7 +95,7 @@ class KerasInitializersTest(test.TestCase):
 
   def test_lecun_normal(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(1. / fan_in)
       self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape,
@@ -103,7 +103,7 @@ class KerasInitializersTest(test.TestCase):
 
   def test_glorot_normal(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / (fan_in + fan_out))
       self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
@@ -111,7 +111,7 @@ class KerasInitializersTest(test.TestCase):
 
   def test_he_normal(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / fan_in)
       self._runner(keras.initializers.he_normal(seed=123), tensor_shape,
@@ -119,12 +119,12 @@ class KerasInitializersTest(test.TestCase):
 
   def test_orthogonal(self):
     tensor_shape = (20, 20)
-    with self.test_session():
+    with self.cached_session():
       self._runner(keras.initializers.orthogonal(seed=123), tensor_shape,
                    target_mean=0.)
 
   def test_identity(self):
-    with self.test_session():
+    with self.cached_session():
       tensor_shape = (3, 4, 5)
       with self.assertRaises(ValueError):
         self._runner(keras.initializers.identity(), tensor_shape,
@@ -136,13 +136,13 @@ class KerasInitializersTest(test.TestCase):
 
   def test_zero(self):
     tensor_shape = (4, 5)
-    with self.test_session():
+    with self.cached_session():
       self._runner(keras.initializers.zeros(), tensor_shape,
                    target_mean=0., target_max=0.)
 
   def test_one(self):
     tensor_shape = (4, 5)
-    with self.test_session():
+    with self.cached_session():
       self._runner(keras.initializers.ones(), tensor_shape,
                    target_mean=1., target_max=1.)
 
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index 2a05699407..3c0f73b1c3 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -21,9 +21,11 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import dtypes
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.layers import core as tf_core_layers
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import rnn_cell
 from tensorflow.python.platform import test
 
 
@@ -33,7 +35,7 @@ class KerasIntegrationTest(test.TestCase):
     self.assertTrue(keras.__version__.endswith('-tf'))
 
   def test_vector_classification_sequential(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
@@ -58,7 +60,7 @@ class KerasIntegrationTest(test.TestCase):
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_functional(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
@@ -82,7 +84,7 @@ class KerasIntegrationTest(test.TestCase):
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_temporal_classification_sequential(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
@@ -103,8 +105,32 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  def test_temporal_classification_sequential_tf_rnn(self):
+    with self.cached_session():
+      np.random.seed(1337)
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(4, 10),
+          num_classes=2)
+      y_train = keras.utils.to_categorical(y_train)
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.RNN(rnn_cell.LSTMCell(5), return_sequences=True,
+                                 input_shape=x_train.shape[1:]))
+      model.add(keras.layers.RNN(rnn_cell.GRUCell(y_train.shape[-1],
+                                                  activation='softmax',
+                                                  dtype=dtypes.float32)))
+      model.compile(loss='categorical_crossentropy',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
+                    metrics=['accuracy'])
+      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
+                          validation_data=(x_train, y_train),
+                          verbose=2)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
+
   def test_image_classification_sequential(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
@@ -138,7 +164,7 @@ class KerasIntegrationTest(test.TestCase):
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_video_classification_functional(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
@@ -168,7 +194,7 @@ class KerasIntegrationTest(test.TestCase):
   def test_vector_classification_shared_sequential(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
@@ -202,7 +228,7 @@ class KerasIntegrationTest(test.TestCase):
   def test_vector_classification_shared_model(self):
     # Test that functional models that feature internal updates
     # and internal losses can be shared.
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
@@ -233,14 +259,14 @@ class KerasIntegrationTest(test.TestCase):
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_embedding_with_clipnorm(self):
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       model.add(keras.layers.Embedding(input_dim=1, output_dim=1))
       model.compile(optimizer=keras.optimizers.SGD(clipnorm=0.1), loss='mse')
       model.fit(np.array([[0]]), np.array([[[0.5]]]), epochs=1)
 
   def test_using_tf_layers_in_keras_sequential_model(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
@@ -263,7 +289,7 @@ class KerasIntegrationTest(test.TestCase):
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_using_tf_layers_in_keras_functional_model(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1337)
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=100,
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index 53c1baa2bb..b020b6e730 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -26,44 +26,44 @@ from tensorflow.python.platform import test
 class AdvancedActivationsTest(test.TestCase):
 
   def test_leaky_relu(self):
-    with self.test_session():
+    with self.cached_session():
       for alpha in [0., .5, -1.]:
         testing_utils.layer_test(keras.layers.LeakyReLU,
                                  kwargs={'alpha': alpha},
                                  input_shape=(2, 3, 4))
 
   def test_prelu(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(keras.layers.PReLU, kwargs={},
                                input_shape=(2, 3, 4))
 
   def test_prelu_share(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(keras.layers.PReLU,
                                kwargs={'shared_axes': 1},
                                input_shape=(2, 3, 4))
 
   def test_elu(self):
-    with self.test_session():
+    with self.cached_session():
       for alpha in [0., .5, -1.]:
         testing_utils.layer_test(keras.layers.ELU,
                                  kwargs={'alpha': alpha},
                                  input_shape=(2, 3, 4))
 
   def test_thresholded_relu(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(keras.layers.ThresholdedReLU,
                                kwargs={'theta': 0.5},
                                input_shape=(2, 3, 4))
 
   def test_softmax(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(keras.layers.Softmax,
                                kwargs={'axis': 1},
                                input_shape=(2, 3, 4))
 
   def test_relu(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(keras.layers.ReLU,
                                kwargs={'max_value': 10},
                                input_shape=(2, 3, 4))
@@ -71,14 +71,14 @@ class AdvancedActivationsTest(test.TestCase):
   def test_relu_with_invalid_arg(self):
     with self.assertRaisesRegexp(
         ValueError, 'max_value of Relu layer cannot be negative value: -10'):
-      with self.test_session():
+      with self.cached_session():
         testing_utils.layer_test(keras.layers.ReLU,
                                  kwargs={'max_value': -10},
                                  input_shape=(2, 3, 4))
     with self.assertRaisesRegexp(
         ValueError,
         'negative_slope of Relu layer cannot be negative value: -2'):
-      with self.test_session():
+      with self.cached_session():
         testing_utils.layer_test(
             keras.layers.ReLU,
             kwargs={'negative_slope': -2},
diff --git a/tensorflow/python/keras/layers/convolutional_recurrent_test.py b/tensorflow/python/keras/layers/convolutional_recurrent_test.py
index 4b8f6f2a14..4a75793884 100644
--- a/tensorflow/python/keras/layers/convolutional_recurrent_test.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent_test.py
@@ -47,7 +47,7 @@ class ConvLSTMTest(test.TestCase):
                                 input_channel)
 
       for return_sequences in [True, False]:
-        with self.test_session():
+        with self.cached_session():
           # test for return state:
           x = keras.Input(batch_shape=inputs.shape)
           kwargs = {'data_format': data_format,
@@ -92,7 +92,7 @@ class ConvLSTMTest(test.TestCase):
                             input_num_row, input_num_col,
                             input_channel)
 
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       kwargs = {'data_format': 'channels_last',
                 'return_sequences': False,
@@ -144,7 +144,7 @@ class ConvLSTMTest(test.TestCase):
                             input_num_row, input_num_col,
                             input_channel)
 
-    with self.test_session():
+    with self.cached_session():
       kwargs = {'data_format': 'channels_last',
                 'return_sequences': False,
                 'kernel_size': (num_row, num_col),
@@ -168,7 +168,7 @@ class ConvLSTMTest(test.TestCase):
 
   def test_conv_lstm_dropout(self):
     # check dropout
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(
           keras.layers.ConvLSTM2D,
           kwargs={'data_format': 'channels_last',
@@ -181,7 +181,7 @@ class ConvLSTMTest(test.TestCase):
           input_shape=(1, 2, 5, 5, 2))
 
   def test_conv_lstm_cloning(self):
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       model.add(keras.layers.ConvLSTM2D(5, 3, input_shape=(None, 5, 5, 3)))
 
@@ -190,7 +190,7 @@ class ConvLSTMTest(test.TestCase):
       weights = model.get_weights()
 
     # Use a new graph to clone the model
-    with self.test_session():
+    with self.cached_session():
       clone = keras.models.clone_model(model)
       clone.set_weights(weights)
 
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index 49ca68ee9e..1df1d575b1 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -30,16 +30,16 @@ from tensorflow.python.platform import test
 class CoreLayersTest(test.TestCase):
 
   def test_masking(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(
           keras.layers.Masking, kwargs={}, input_shape=(3, 2, 3))
 
   def test_dropout(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(
           keras.layers.Dropout, kwargs={'rate': 0.5}, input_shape=(3, 2))
 
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(
           keras.layers.Dropout,
           kwargs={'rate': 0.5,
@@ -47,7 +47,7 @@ class CoreLayersTest(test.TestCase):
           input_shape=(3, 2))
 
     # https://github.com/tensorflow/tensorflow/issues/14819
-    with self.test_session():
+    with self.cached_session():
       dropout = keras.layers.Dropout(0.5)
       self.assertEqual(True, dropout.supports_masking)
 
@@ -210,7 +210,7 @@ class CoreLayersTest(test.TestCase):
         keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
 
   def test_dense_regularization(self):
-    with self.test_session():
+    with self.cached_session():
       layer = keras.layers.Dense(
           3,
           kernel_regularizer=keras.regularizers.l1(0.01),
@@ -221,7 +221,7 @@ class CoreLayersTest(test.TestCase):
       self.assertEqual(3, len(layer.losses))
 
   def test_dense_constraints(self):
-    with self.test_session():
+    with self.cached_session():
       k_constraint = keras.constraints.max_norm(0.01)
       b_constraint = keras.constraints.max_norm(0.01)
       layer = keras.layers.Dense(
@@ -231,14 +231,14 @@ class CoreLayersTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
   def test_activity_regularization(self):
-    with self.test_session():
+    with self.cached_session():
       layer = keras.layers.ActivityRegularization(l1=0.1)
       layer(keras.backend.variable(np.ones((2, 4))))
       self.assertEqual(1, len(layer.losses))
       _ = layer.get_config()
 
   def test_lambda_output_shape(self):
-    with self.test_session():
+    with self.cached_session():
       l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
       l(keras.backend.variable(np.ones((1, 1))))
       self.assertEqual((1, 1), l.get_config()['output_shape'])
@@ -247,13 +247,13 @@ class CoreLayersTest(test.TestCase):
     def get_output_shape(input_shape):
       return 1 * input_shape
 
-    with self.test_session():
+    with self.cached_session():
       l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape)
       l(keras.backend.variable(np.ones((1, 1))))
       self.assertEqual('lambda', l.get_config()['output_shape_type'])
 
   def test_lambda_config_serialization(self):
-    with self.test_session():
+    with self.cached_session():
       # test serialization with output_shape and output_shape_type
       layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
       layer(keras.backend.variable(np.ones((1, 1))))
diff --git a/tensorflow/python/keras/layers/embeddings_test.py b/tensorflow/python/keras/layers/embeddings_test.py
index fff1c5ef98..cab176ee34 100644
--- a/tensorflow/python/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/layers/embeddings_test.py
@@ -68,7 +68,7 @@ class EmbeddingTest(test.TestCase):
         expected_output_dtype='float32')
 
   def test_embedding_correctness(self):
-    with self.test_session():
+    with self.cached_session():
       layer = keras.layers.Embedding(output_dim=2, input_dim=2)
       layer.build((None, 2))
       matrix = np.array([[1, 1], [2, 2]])
diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py
index 0ebafe07cc..33d09a1660 100644
--- a/tensorflow/python/keras/layers/local.py
+++ b/tensorflow/python/keras/layers/local.py
@@ -85,6 +85,28 @@ class LocallyConnected1D(Layer):
           the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to the kernel matrix.
       bias_constraint: Constraint function applied to the bias vector.
+      implementation: implementation mode, either `1` or `2`.
+          `1` loops over input spatial locations to perform the forward pass.
+          It is memory-efficient but performs a lot of (small) ops.
+
+          `2` stores layer weights in a dense but sparsely-populated 2D matrix
+          and implements the forward pass as a single matrix-multiply. It uses
+          a lot of RAM but performs few (large) ops.
+
+          Depending on the inputs, layer parameters, hardware, and
+          `tf.executing_eagerly()` one implementation can be dramatically faster
+          (e.g. 50X) than another.
+
+          It is recommended to benchmark both in the setting of interest to pick
+          the most efficient one (in terms of speed and memory usage).
+
+          Following scenarios could benefit from setting `implementation=2`:
+              - eager execution;
+              - inference;
+              - running on CPU;
+              - large amount of RAM available;
+              - small models (few filters, small kernel);
+              - using `padding=same` (only possible with `implementation=2`).
 
   Input shape:
       3D tensor with shape: `(batch_size, steps, input_dim)`
@@ -109,15 +131,17 @@ class LocallyConnected1D(Layer):
                activity_regularizer=None,
                kernel_constraint=None,
                bias_constraint=None,
+               implementation=1,
                **kwargs):
     super(LocallyConnected1D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 1, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
     self.padding = conv_utils.normalize_padding(padding)
-    if self.padding != 'valid':
+    if self.padding != 'valid' and implementation == 1:
       raise ValueError('Invalid border mode for LocallyConnected1D '
-                       '(only "valid" is supported): ' + padding)
+                       '(only "valid" is supported if implementation is 1): '
+                       + padding)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.activation = activations.get(activation)
     self.use_bias = use_bias
@@ -128,6 +152,7 @@ class LocallyConnected1D(Layer):
     self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
+    self.implementation = implementation
     self.input_spec = InputSpec(ndim=3)
 
   @tf_utils.shape_type_conversion
@@ -142,14 +167,45 @@ class LocallyConnected1D(Layer):
                        'Found shape:', input_shape)
     self.output_length = conv_utils.conv_output_length(
         input_length, self.kernel_size[0], self.padding, self.strides[0])
-    self.kernel_shape = (self.output_length, self.kernel_size[0] * input_dim,
-                         self.filters)
-    self.kernel = self.add_weight(
-        shape=self.kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
+
+    if self.implementation == 1:
+      self.kernel_shape = (self.output_length, self.kernel_size[0] * input_dim,
+                           self.filters)
+
+      self.kernel = self.add_weight(
+          shape=self.kernel_shape,
+          initializer=self.kernel_initializer,
+          name='kernel',
+          regularizer=self.kernel_regularizer,
+          constraint=self.kernel_constraint)
+
+    elif self.implementation == 2:
+      if self.data_format == 'channels_first':
+        self.kernel_shape = (input_dim, input_length,
+                             self.filters, self.output_length)
+      else:
+        self.kernel_shape = (input_length, input_dim,
+                             self.output_length, self.filters)
+
+      self.kernel = self.add_weight(shape=self.kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    name='kernel',
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint)
+
+      self.kernel_mask = get_locallyconnected_mask(
+          input_shape=(input_length,),
+          kernel_shape=self.kernel_size,
+          strides=self.strides,
+          padding=self.padding,
+          data_format=self.data_format,
+          dtype=self.kernel.dtype
+      )
+
+    else:
+      raise ValueError('Unrecognized implementation mode: %d.'
+                       % self.implementation)
+
     if self.use_bias:
       self.bias = self.add_weight(
           shape=(self.output_length, self.filters),
@@ -182,8 +238,17 @@ class LocallyConnected1D(Layer):
       return (input_shape[0], length, self.filters)
 
   def call(self, inputs):
-    output = K.local_conv(inputs, self.kernel, self.kernel_size, self.strides,
-                          (self.output_length,), self.data_format)
+    if self.implementation == 1:
+      output = K.local_conv(inputs, self.kernel, self.kernel_size, self.strides,
+                            (self.output_length,), self.data_format)
+
+    elif self.implementation == 2:
+      output = local_conv_matmul(inputs, self.kernel, self.kernel_mask,
+                                 self.compute_output_shape(inputs.shape))
+
+    else:
+      raise ValueError('Unrecognized implementation mode: %d.'
+                       % self.implementation)
 
     if self.use_bias:
       output = K.bias_add(output, self.bias, data_format=self.data_format)
@@ -220,7 +285,9 @@ class LocallyConnected1D(Layer):
         'kernel_constraint':
             constraints.serialize(self.kernel_constraint),
         'bias_constraint':
-            constraints.serialize(self.bias_constraint)
+            constraints.serialize(self.bias_constraint),
+        'implementation':
+            self.implementation
     }
     base_config = super(LocallyConnected1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -284,9 +351,31 @@ class LocallyConnected2D(Layer):
           the `kernel` weights matrix.
       bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
+          the output of the layer (its "activation").
       kernel_constraint: Constraint function applied to the kernel matrix.
       bias_constraint: Constraint function applied to the bias vector.
+      implementation: implementation mode, either `1` or `2`.
+          `1` loops over input spatial locations to perform the forward pass.
+          It is memory-efficient but performs a lot of (small) ops.
+
+          `2` stores layer weights in a dense but sparsely-populated 2D matrix
+          and implements the forward pass as a single matrix-multiply. It uses
+          a lot of RAM but performs few (large) ops.
+
+          Depending on the inputs, layer parameters, hardware, and
+          `tf.executing_eagerly()` one implementation can be dramatically faster
+          (e.g. 50X) than another.
+
+          It is recommended to benchmark both in the setting of interest to pick
+          the most efficient one (in terms of speed and memory usage).
+
+          Following scenarios could benefit from setting `implementation=2`:
+              - eager execution;
+              - inference;
+              - running on CPU;
+              - large amount of RAM available;
+              - small models (few filters, small kernel);
+              - using `padding=same` (only possible with `implementation=2`).
 
   Input shape:
       4D tensor with shape:
@@ -317,15 +406,17 @@ class LocallyConnected2D(Layer):
                activity_regularizer=None,
                kernel_constraint=None,
                bias_constraint=None,
+               implementation=1,
                **kwargs):
     super(LocallyConnected2D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
     self.padding = conv_utils.normalize_padding(padding)
-    if self.padding != 'valid':
+    if self.padding != 'valid' and implementation == 1:
       raise ValueError('Invalid border mode for LocallyConnected2D '
-                       '(only "valid" is supported): ' + padding)
+                       '(only "valid" is supported if implementation is 1): '
+                       + padding)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.activation = activations.get(activation)
     self.use_bias = use_bias
@@ -336,6 +427,7 @@ class LocallyConnected2D(Layer):
     self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
+    self.implementation = implementation
     self.input_spec = InputSpec(ndim=4)
 
   @tf_utils.shape_type_conversion
@@ -357,15 +449,47 @@ class LocallyConnected2D(Layer):
                                                self.padding, self.strides[1])
     self.output_row = output_row
     self.output_col = output_col
-    self.kernel_shape = (
-        output_row * output_col,
-        self.kernel_size[0] * self.kernel_size[1] * input_filter, self.filters)
-    self.kernel = self.add_weight(
-        shape=self.kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
+
+    if self.implementation == 1:
+      self.kernel_shape = (
+          output_row * output_col,
+          self.kernel_size[0] * self.kernel_size[1] * input_filter,
+          self.filters)
+
+      self.kernel = self.add_weight(
+          shape=self.kernel_shape,
+          initializer=self.kernel_initializer,
+          name='kernel',
+          regularizer=self.kernel_regularizer,
+          constraint=self.kernel_constraint)
+
+    elif self.implementation == 2:
+      if self.data_format == 'channels_first':
+        self.kernel_shape = (input_filter, input_row, input_col,
+                             self.filters, self.output_row, self.output_col)
+      else:
+        self.kernel_shape = (input_row, input_col, input_filter,
+                             self.output_row, self.output_col, self.filters)
+
+      self.kernel = self.add_weight(shape=self.kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    name='kernel',
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint)
+
+      self.kernel_mask = get_locallyconnected_mask(
+          input_shape=(input_row, input_col),
+          kernel_shape=self.kernel_size,
+          strides=self.strides,
+          padding=self.padding,
+          data_format=self.data_format,
+          dtype=self.kernel.dtype
+      )
+
+    else:
+      raise ValueError('Unrecognized implementation mode: %d.'
+                       % self.implementation)
+
     if self.use_bias:
       self.bias = self.add_weight(
           shape=(output_row, output_col, self.filters),
@@ -401,8 +525,18 @@ class LocallyConnected2D(Layer):
       return (input_shape[0], rows, cols, self.filters)
 
   def call(self, inputs):
-    output = K.local_conv(inputs, self.kernel, self.kernel_size, self.strides,
-                          (self.output_row, self.output_col), self.data_format)
+    if self.implementation == 1:
+      output = K.local_conv(inputs, self.kernel, self.kernel_size, self.strides,
+                            (self.output_row, self.output_col),
+                            self.data_format)
+
+    elif self.implementation == 2:
+      output = local_conv_matmul(inputs, self.kernel, self.kernel_mask,
+                                 self.compute_output_shape(inputs.shape))
+
+    else:
+      raise ValueError('Unrecognized implementation mode: %d.'
+                       % self.implementation)
 
     if self.use_bias:
       output = K.bias_add(output, self.bias, data_format=self.data_format)
@@ -439,7 +573,157 @@ class LocallyConnected2D(Layer):
         'kernel_constraint':
             constraints.serialize(self.kernel_constraint),
         'bias_constraint':
-            constraints.serialize(self.bias_constraint)
+            constraints.serialize(self.bias_constraint),
+        'implementation':
+            self.implementation
     }
     base_config = super(LocallyConnected2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+
+def get_locallyconnected_mask(input_shape,
+                              kernel_shape,
+                              strides,
+                              padding,
+                              data_format,
+                              dtype):
+  """Return a mask representing connectivity of a locally-connected operation.
+
+  This method returns a masking tensor of 0s and 1s (of type `dtype`) that,
+  when element-wise multiplied with a fully-connected weight tensor, masks out
+  the weights between disconnected input-output pairs and thus implements local
+  connectivity through a sparse fully-connected weight tensor.
+
+  Assume an unshared convolution with given parameters is applied to an input
+  having N spatial dimensions with `input_shape = (d_in1, ..., d_inN)`
+  to produce an output with spatial shape `(d_out1, ..., d_outN)` (determined
+  by layer parameters such as `strides`).
+
+  This method returns a mask which can be broadcast-multiplied (element-wise)
+  with a 2*(N+1)-D weight matrix (equivalent to a fully-connected layer between
+  (N+1)-D activations (N spatial + 1 channel dimensions for input and output)
+  to make it perform an unshared convolution with given `kernel_shape`,
+  `strides`, `padding` and `data_format`.
+
+  Arguments:
+    input_shape: tuple of size N: `(d_in1, ..., d_inN)`
+                 spatial shape of the input.
+    kernel_shape: tuple of size N, spatial shape of the convolutional kernel
+                  / receptive field.
+    strides: tuple of size N, strides along each spatial dimension.
+    padding: type of padding, string `"same"` or `"valid"`.
+    data_format: a string, `"channels_first"` or `"channels_last"`.
+    dtype: type of the layer operation, e.g. `tf.float64`.
+
+  Returns:
+    a `dtype`-tensor of shape
+    `(1, d_in1, ..., d_inN, 1, d_out1, ..., d_outN)`
+    if `data_format == `"channels_first"`, or
+    `(d_in1, ..., d_inN, 1, d_out1, ..., d_outN, 1)`
+    if `data_format == "channels_last"`.
+
+  Raises:
+    ValueError: if `data_format` is neither `"channels_first"` nor
+                `"channels_last"`.
+  """
+  mask = conv_utils.conv_kernel_mask(
+      input_shape=input_shape,
+      kernel_shape=kernel_shape,
+      strides=strides,
+      padding=padding
+  )
+
+  ndims = int(mask.ndim / 2)
+  mask = K.variable(mask, dtype)
+
+  if data_format == 'channels_first':
+    mask = K.expand_dims(mask, 0)
+    mask = K.expand_dims(mask, - ndims - 1)
+
+  elif data_format == 'channels_last':
+    mask = K.expand_dims(mask, ndims)
+    mask = K.expand_dims(mask, -1)
+
+  else:
+    raise ValueError('Unrecognized data_format: ' + str(data_format))
+
+  return mask
+
+
+def local_conv_matmul(inputs, kernel, kernel_mask, output_shape):
+  """Apply N-D convolution with un-shared weights using a single matmul call.
+
+  This method outputs `inputs . (kernel * kernel_mask)`
+  (with `.` standing for matrix-multiply and `*` for element-wise multiply)
+  and requires a precomputed `kernel_mask` to zero-out weights in `kernel` and
+  hence perform the same operation as a convolution with un-shared
+  (the remaining entries in `kernel`) weights. It also does the necessary
+  reshapes to make `inputs` and `kernel` 2-D and `output` (N+2)-D.
+
+  Arguments:
+      inputs: (N+2)-D tensor with shape
+          `(batch_size, channels_in, d_in1, ..., d_inN)`
+          or
+          `(batch_size, d_in1, ..., d_inN, channels_in)`.
+      kernel: the unshared weights for N-D convolution,
+          an (N+2)-D tensor of shape:
+          `(d_in1, ..., d_inN, channels_in, d_out2, ..., d_outN, channels_out)`
+          or
+          `(channels_in, d_in1, ..., d_inN, channels_out, d_out2, ..., d_outN)`,
+          with the ordering of channels and spatial dimensions matching
+          that of the input.
+          Each entry is the weight between a particular input and
+          output location, similarly to a fully-connected weight matrix.
+      kernel_mask: a float 0/1 mask tensor of shape:
+           `(d_in1, ..., d_inN, 1, d_out2, ..., d_outN, 1)`
+           or
+           `(1, d_in1, ..., d_inN, 1, d_out2, ..., d_outN)`,
+           with the ordering of singleton and spatial dimensions
+           matching that of the input.
+           Mask represents the connectivity pattern of the layer and is
+           precomputed elsewhere based on layer parameters: stride,
+           padding, and the receptive field shape.
+      output_shape: a tuple of (N+2) elements representing the output shape:
+          `(batch_size, channels_out, d_out1, ..., d_outN)`
+          or
+          `(batch_size, d_out1, ..., d_outN, channels_out)`,
+          with the ordering of channels and spatial dimensions matching that of
+          the input.
+
+  Returns:
+      Output (N+2)-D tensor with shape `output_shape`.
+  """
+  inputs_flat = K.reshape(inputs, (K.shape(inputs)[0], -1))
+
+  kernel = kernel_mask * kernel
+  kernel = make_2d(kernel, split_dim=K.ndim(kernel) // 2)
+
+  output_flat = K.math_ops.sparse_matmul(inputs_flat, kernel, b_is_sparse=True)
+  output = K.reshape(output_flat,
+                     [K.shape(output_flat)[0],] + output_shape.as_list()[1:])
+  return output
+
+
+def make_2d(tensor, split_dim):
+  """Reshapes an N-dimensional tensor into a 2D tensor.
+
+  Dimensions before (excluding) and after (including) `split_dim` are grouped
+  together.
+
+  Arguments:
+    tensor: a tensor of shape `(d0, ..., d(N-1))`.
+    split_dim: an integer from 1 to N-1, index of the dimension to group
+        dimensions before (excluding) and after (including).
+
+  Returns:
+    Tensor of shape
+    `(d0 * ... * d(split_dim-1), d(split_dim) * ... * d(N-1))`.
+  """
+  shape = K.array_ops.shape(tensor)
+  in_dims = shape[:split_dim]
+  out_dims = shape[split_dim:]
+
+  in_size = K.math_ops.reduce_prod(in_dims)
+  out_size = K.math_ops.reduce_prod(out_dims)
+
+  return K.array_ops.reshape(tensor, (in_size, out_size))
diff --git a/tensorflow/python/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
index 9639e0251f..8589b32b3c 100644
--- a/tensorflow/python/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -24,6 +24,7 @@ from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class LocallyConnectedLayersTest(test.TestCase):
@@ -36,21 +37,30 @@ class LocallyConnectedLayersTest(test.TestCase):
     filter_length = 3
     filters = 4
 
-    for padding in ['valid']:
+    for padding in ['valid', 'same']:
       for strides in [1]:
         if padding == 'same' and strides != 1:
           continue
         for data_format in ['channels_first', 'channels_last']:
-          testing_utils.layer_test(
-              keras.layers.LocallyConnected1D,
-              kwargs={
-                  'filters': filters,
-                  'kernel_size': filter_length,
-                  'padding': padding,
-                  'strides': strides,
-                  'data_format': data_format
-              },
-              input_shape=(num_samples, num_steps, input_dim))
+          for implementation in [1, 2]:
+            kwargs = {
+                'filters': filters,
+                'kernel_size': filter_length,
+                'padding': padding,
+                'strides': strides,
+                'data_format': data_format,
+                'implementation': implementation
+            }
+
+            if padding == 'same' and implementation == 1:
+              self.assertRaises(ValueError,
+                                keras.layers.LocallyConnected1D,
+                                **kwargs)
+            else:
+              testing_utils.layer_test(
+                  keras.layers.LocallyConnected1D,
+                  kwargs=kwargs,
+                  input_shape=(num_samples, num_steps, input_dim))
 
   def test_locallyconnected_1d_regularization(self):
     num_samples = 2
@@ -59,38 +69,47 @@ class LocallyConnectedLayersTest(test.TestCase):
     filter_length = 3
     filters = 4
     for data_format in ['channels_first', 'channels_last']:
-      kwargs = {
-          'filters': filters,
-          'kernel_size': filter_length,
-          'kernel_regularizer': 'l2',
-          'bias_regularizer': 'l2',
-          'activity_regularizer': 'l2',
-          'data_format': data_format
-      }
-
-      with self.test_session():
-        layer = keras.layers.LocallyConnected1D(**kwargs)
-        layer.build((num_samples, num_steps, input_dim))
-        self.assertEqual(len(layer.losses), 2)
-        layer(
-            keras.backend.variable(np.ones((num_samples,
-                                            num_steps,
-                                            input_dim))))
-        self.assertEqual(len(layer.losses), 3)
-
-      k_constraint = keras.constraints.max_norm(0.01)
-      b_constraint = keras.constraints.max_norm(0.01)
-      kwargs = {
-          'filters': filters,
-          'kernel_size': filter_length,
-          'kernel_constraint': k_constraint,
-          'bias_constraint': b_constraint,
-      }
-      with self.test_session():
-        layer = keras.layers.LocallyConnected1D(**kwargs)
-        layer.build((num_samples, num_steps, input_dim))
-        self.assertEqual(layer.kernel.constraint, k_constraint)
-        self.assertEqual(layer.bias.constraint, b_constraint)
+      for padding in ['valid', 'same']:
+        for implementation in [1, 2]:
+          kwargs = {
+              'filters': filters,
+              'kernel_size': filter_length,
+              'kernel_regularizer': 'l2',
+              'bias_regularizer': 'l2',
+              'activity_regularizer': 'l2',
+              'data_format': data_format,
+              'implementation': implementation,
+              'padding': padding
+          }
+
+          if padding == 'same' and implementation == 1:
+            self.assertRaises(ValueError,
+                              keras.layers.LocallyConnected1D,
+                              **kwargs)
+          else:
+            with self.cached_session():
+              layer = keras.layers.LocallyConnected1D(**kwargs)
+              layer.build((num_samples, num_steps, input_dim))
+              self.assertEqual(len(layer.losses), 2)
+              layer(
+                  keras.backend.variable(np.ones((num_samples,
+                                                  num_steps,
+                                                  input_dim))))
+              self.assertEqual(len(layer.losses), 3)
+
+            k_constraint = keras.constraints.max_norm(0.01)
+            b_constraint = keras.constraints.max_norm(0.01)
+            kwargs = {
+                'filters': filters,
+                'kernel_size': filter_length,
+                'kernel_constraint': k_constraint,
+                'bias_constraint': b_constraint,
+            }
+            with self.cached_session():
+              layer = keras.layers.LocallyConnected1D(**kwargs)
+              layer.build((num_samples, num_steps, input_dim))
+              self.assertEqual(layer.kernel.constraint, k_constraint)
+              self.assertEqual(layer.bias.constraint, b_constraint)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_locallyconnected_2d(self):
@@ -100,23 +119,32 @@ class LocallyConnectedLayersTest(test.TestCase):
     num_row = 6
     num_col = 10
 
-    for padding in ['valid']:
+    for padding in ['valid', 'same']:
       for strides in [(1, 1), (2, 2)]:
-        if padding == 'same' and strides != (1, 1):
-          continue
+        for implementation in [1, 2]:
+          if padding == 'same' and strides != (1, 1):
+            continue
 
-        testing_utils.layer_test(
-            keras.layers.LocallyConnected2D,
-            kwargs={
-                'filters': filters,
-                'kernel_size': 3,
-                'padding': padding,
-                'kernel_regularizer': 'l2',
-                'bias_regularizer': 'l2',
-                'strides': strides,
-                'data_format': 'channels_last'
-            },
-            input_shape=(num_samples, num_row, num_col, stack_size))
+          kwargs = {
+              'filters': filters,
+              'kernel_size': 3,
+              'padding': padding,
+              'kernel_regularizer': 'l2',
+              'bias_regularizer': 'l2',
+              'strides': strides,
+              'data_format': 'channels_last',
+              'implementation': implementation
+          }
+
+          if padding == 'same' and implementation == 1:
+            self.assertRaises(ValueError,
+                              keras.layers.LocallyConnected2D,
+                              **kwargs)
+          else:
+            testing_utils.layer_test(
+                keras.layers.LocallyConnected2D,
+                kwargs=kwargs,
+                input_shape=(num_samples, num_row, num_col, stack_size))
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_locallyconnected_2d_channels_first(self):
@@ -126,14 +154,25 @@ class LocallyConnectedLayersTest(test.TestCase):
     num_row = 6
     num_col = 10
 
-    testing_utils.layer_test(
-        keras.layers.LocallyConnected2D,
-        kwargs={
+    for implementation in [1, 2]:
+      for padding in ['valid', 'same']:
+        kwargs = {
             'filters': filters,
             'kernel_size': 3,
-            'data_format': 'channels_first'
-        },
-        input_shape=(num_samples, num_row, num_col, stack_size))
+            'data_format': 'channels_first',
+            'implementation': implementation,
+            'padding': padding
+        }
+
+        if padding == 'same' and implementation == 1:
+          self.assertRaises(ValueError,
+                            keras.layers.LocallyConnected2D,
+                            **kwargs)
+        else:
+          testing_utils.layer_test(
+              keras.layers.LocallyConnected2D,
+              kwargs=kwargs,
+              input_shape=(num_samples, num_row, num_col, stack_size))
 
   def test_locallyconnected_2d_regularization(self):
     num_samples = 8
@@ -141,35 +180,271 @@ class LocallyConnectedLayersTest(test.TestCase):
     stack_size = 4
     num_row = 6
     num_col = 10
-    kwargs = {
-        'filters': filters,
-        'kernel_size': 3,
-        'kernel_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-    }
-    with self.test_session():
-      layer = keras.layers.LocallyConnected2D(**kwargs)
-      layer.build((num_samples, num_row, num_col, stack_size))
-      self.assertEqual(len(layer.losses), 2)
-      layer(
-          keras.backend.variable(
-              np.ones((num_samples, num_row, num_col, stack_size))))
-      self.assertEqual(len(layer.losses), 3)
-
-    k_constraint = keras.constraints.max_norm(0.01)
-    b_constraint = keras.constraints.max_norm(0.01)
-    kwargs = {
-        'filters': filters,
-        'kernel_size': 3,
-        'kernel_constraint': k_constraint,
-        'bias_constraint': b_constraint,
-    }
-    with self.test_session():
-      layer = keras.layers.LocallyConnected2D(**kwargs)
-      layer.build((num_samples, num_row, num_col, stack_size))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
+    for implementation in [1, 2]:
+      for padding in ['valid', 'same']:
+        kwargs = {
+            'filters': filters,
+            'kernel_size': 3,
+            'kernel_regularizer': 'l2',
+            'bias_regularizer': 'l2',
+            'activity_regularizer': 'l2',
+            'implementation': implementation,
+            'padding': padding
+        }
+
+        if padding == 'same' and implementation == 1:
+          self.assertRaises(ValueError,
+                            keras.layers.LocallyConnected2D,
+                            **kwargs)
+        else:
+          with self.cached_session():
+            layer = keras.layers.LocallyConnected2D(**kwargs)
+            layer.build((num_samples, num_row, num_col, stack_size))
+            self.assertEqual(len(layer.losses), 2)
+            layer(
+                keras.backend.variable(
+                    np.ones((num_samples, num_row, num_col, stack_size))))
+            self.assertEqual(len(layer.losses), 3)
+
+          k_constraint = keras.constraints.max_norm(0.01)
+          b_constraint = keras.constraints.max_norm(0.01)
+          kwargs = {
+              'filters': filters,
+              'kernel_size': 3,
+              'kernel_constraint': k_constraint,
+              'bias_constraint': b_constraint,
+          }
+          with self.cached_session():
+            layer = keras.layers.LocallyConnected2D(**kwargs)
+            layer.build((num_samples, num_row, num_col, stack_size))
+            self.assertEqual(layer.kernel.constraint, k_constraint)
+            self.assertEqual(layer.bias.constraint, b_constraint)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_locallyconnected_implementation(self):
+    n_train = 4
+    n_classes = 3
+    n_epochs = 2
+
+    np.random.seed(1)
+    targets = np.random.randint(0, n_classes, (n_train,))
+
+    for width in [1, 17]:
+      for height in [16]:
+        for filters in [2]:
+          for data_format in ['channels_first', 'channels_last']:
+            inputs = get_inputs(data_format, filters, height, n_train, width)
+
+            for kernel_x in [(3,)]:
+              for kernel_y in [()] if width == 1 else [(2,)]:
+                for stride_x in [(1,)]:
+                  for stride_y in [()] if width == 1 else [(3,)]:
+                    for layers in [2]:
+                      kwargs = {
+                          'layers': layers,
+                          'filters': filters,
+                          'kernel_size': kernel_x + kernel_y,
+                          'strides': stride_x + stride_y,
+                          'data_format': data_format,
+                          'n_classes': n_classes,
+                          'input_shape': inputs.shape
+                      }
+
+                      model_1 = get_model(implementation=1, **kwargs)
+                      model_2 = get_model(implementation=2, **kwargs)
+
+                      copy_model_weights(model_2, model_1)
+
+                      # Compare outputs at initialization.
+                      out_1 = model_1.call(inputs)
+                      out_2 = model_2.call(inputs)
+                      self.assertAllCloseAccordingToType(out_1, out_2,
+                                                         rtol=1e-5, atol=1e-5)
+
+                      # Train.
+                      model_1.fit(x=inputs,
+                                  y=targets,
+                                  epochs=n_epochs,
+                                  batch_size=n_train)
+
+                      model_2.fit(x=inputs,
+                                  y=targets,
+                                  epochs=n_epochs,
+                                  batch_size=n_train)
+
+                      # Compare outputs after a few training steps.
+                      out_1 = model_1.call(inputs)
+                      out_2 = model_2.call(inputs)
+                      self.assertAllCloseAccordingToType(out_1, out_2,
+                                                         rtol=1e-5, atol=1e-5)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_make_2d(self):
+    input_shapes = [
+        (0,),
+        (0, 0),
+        (1,),
+        (2,),
+        (3,),
+        (1, 0),
+        (0, 3),
+        (1, 1),
+        (1, 2),
+        (3, 1),
+        (2, 2),
+        (3, 3),
+        (1, 0, 1),
+        (5, 2, 3),
+        (3, 5, 6, 7, 0),
+        (3, 2, 2, 4, 4),
+        (1, 2, 3, 4, 7, 2),
+    ]
+    np.random.seed(1)
+
+    for input_shape in input_shapes:
+      inputs = np.random.normal(0, 1, input_shape)
+      inputs_tf = keras.backend.variable(inputs)
+
+      split_dim = np.random.randint(0, inputs.ndim + 1)
+      shape_2d = (int(np.prod(inputs.shape[:split_dim])),
+                  int(np.prod(inputs.shape[split_dim:])))
+      inputs_2d = np.reshape(inputs, shape_2d)
+
+      inputs_2d_tf = keras.layers.local.make_2d(inputs_tf, split_dim)
+      inputs_2d_tf = keras.backend.get_value(inputs_2d_tf)
+
+      self.assertAllCloseAccordingToType(inputs_2d, inputs_2d_tf)
+
+
+def get_inputs(data_format, filters, height, n_train, width):
+  if data_format == 'channels_first':
+    if width == 1:
+      input_shape = (filters, height)
+    else:
+      input_shape = (filters, height, width)
+
+  elif data_format == 'channels_last':
+    if width == 1:
+      input_shape = (height, filters)
+    else:
+      input_shape = (height, width, filters)
+
+  else:
+    raise NotImplementedError(data_format)
+
+  inputs = np.random.normal(0, 1,
+                            (n_train,) + input_shape).astype(np.float32)
+  return inputs
+
+
+def xent(y_true, y_pred):
+  y_true = keras.backend.cast(
+      keras.backend.reshape(y_true, (-1,)),
+      keras.backend.dtypes_module.int32)
+
+  return keras.backend.nn.sparse_softmax_cross_entropy_with_logits(
+      labels=y_true,
+      logits=y_pred)
+
+
+def get_model(implementation,
+              filters,
+              kernel_size,
+              strides,
+              layers,
+              n_classes,
+              data_format,
+              input_shape):
+  model = keras.Sequential()
+
+  if len(kernel_size) == 1:
+    lc_layer = keras.layers.LocallyConnected1D
+  elif len(kernel_size) == 2:
+    lc_layer = keras.layers.LocallyConnected2D
+  else:
+    raise NotImplementedError(kernel_size)
+
+  for _ in range(layers):
+    model.add(lc_layer(
+        padding='valid',
+        kernel_initializer=keras.initializers.random_normal(),
+        bias_initializer=keras.initializers.random_normal(),
+        filters=filters,
+        strides=strides,
+        kernel_size=kernel_size,
+        activation=keras.activations.relu,
+        data_format=data_format,
+        implementation=implementation))
+
+  model.add(keras.layers.Flatten())
+  model.add(keras.layers.Dense(n_classes))
+  model.compile(
+      optimizer=RMSPropOptimizer(0.01),
+      metrics=[keras.metrics.categorical_accuracy],
+      loss=xent
+  )
+  model.build(input_shape)
+  return model
+
+
+def copy_lc_weights(lc_layer_2_from, lc_layer_1_to):
+  lc_2_kernel, lc_2_bias = lc_layer_2_from.weights
+  lc_2_kernel_masked = lc_2_kernel * lc_layer_2_from.kernel_mask
+
+  data_format = lc_layer_2_from.data_format
+
+  if data_format == 'channels_first':
+    if isinstance(lc_layer_2_from, keras.layers.LocallyConnected1D):
+      permutation = (3, 0, 1, 2)
+    elif isinstance(lc_layer_2_from, keras.layers.LocallyConnected2D):
+      permutation = (4, 5, 0, 1, 2, 3)
+    else:
+      raise NotImplementedError(lc_layer_2_from)
+
+  elif data_format == 'channels_last':
+    if isinstance(lc_layer_2_from, keras.layers.LocallyConnected1D):
+      permutation = (2, 0, 1, 3)
+    elif isinstance(lc_layer_2_from, keras.layers.LocallyConnected2D):
+      permutation = (3, 4, 0, 1, 2, 5)
+    else:
+      raise NotImplementedError(lc_layer_2_from)
+
+  else:
+    raise NotImplementedError(data_format)
+
+  lc_2_kernel_masked = keras.backend.permute_dimensions(
+      lc_2_kernel_masked, permutation)
+
+  lc_2_kernel_mask = keras.backend.math_ops.not_equal(
+      lc_2_kernel_masked, 0)
+  lc_2_kernel_flat = keras.backend.array_ops.boolean_mask(
+      lc_2_kernel_masked, lc_2_kernel_mask)
+  lc_2_kernel_reshaped = keras.backend.reshape(lc_2_kernel_flat,
+                                               lc_layer_1_to.kernel.shape)
+
+  lc_2_kernel_reshaped = keras.backend.get_value(lc_2_kernel_reshaped)
+  lc_2_bias = keras.backend.get_value(lc_2_bias)
+
+  lc_layer_1_to.set_weights([lc_2_kernel_reshaped, lc_2_bias])
+
+
+def copy_model_weights(model_2_from, model_1_to):
+  for l in range(len(model_2_from.layers)):
+    layer_2_from = model_2_from.layers[l]
+    layer_1_to = model_1_to.layers[l]
+
+    if isinstance(layer_2_from, (keras.layers.LocallyConnected2D,
+                                 keras.layers.LocallyConnected1D)):
+      copy_lc_weights(layer_2_from, layer_1_to)
+
+    elif isinstance(layer_2_from, keras.layers.Dense):
+      weights_2, bias_2 = layer_2_from.weights
+      weights_2 = keras.backend.get_value(weights_2)
+      bias_2 = keras.backend.get_value(bias_2)
+      layer_1_to.set_weights([weights_2, bias_2])
+
+    else:
+      continue
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/merge_test.py b/tensorflow/python/keras/layers/merge_test.py
index 39bc98d039..7bcfcaeddb 100644
--- a/tensorflow/python/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/layers/merge_test.py
@@ -46,7 +46,7 @@ class MergeLayersTest(test.TestCase):
     self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
 
   def test_merge_add_masking(self):
-    with self.test_session():
+    with self.cached_session():
       i1 = keras.layers.Input(shape=(4, 5))
       i2 = keras.layers.Input(shape=(4, 5))
       m1 = keras.layers.Masking()(i1)
@@ -57,7 +57,7 @@ class MergeLayersTest(test.TestCase):
       self.assertListEqual(mask.get_shape().as_list(), [None, 4])
 
   def test_merge_add_dynamic_shape(self):
-    with self.test_session():
+    with self.cached_session():
       i1 = array_ops.placeholder(shape=(4, None), dtype='float32')
       i2 = array_ops.placeholder(shape=(4, 5), dtype='float32')
       layer = keras.layers.Add()
@@ -149,7 +149,7 @@ class MergeLayersTest(test.TestCase):
     self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
 
   def test_merge_concatenate_masking(self):
-    with self.test_session():
+    with self.cached_session():
       i1 = keras.layers.Input(shape=(4, 5))
       i2 = keras.layers.Input(shape=(4, 5))
       m1 = keras.layers.Masking()(i1)
diff --git a/tensorflow/python/keras/layers/noise_test.py b/tensorflow/python/keras/layers/noise_test.py
index aa2be62390..cea304680b 100644
--- a/tensorflow/python/keras/layers/noise_test.py
+++ b/tensorflow/python/keras/layers/noise_test.py
@@ -27,14 +27,14 @@ from tensorflow.python.platform import test
 class NoiseLayersTest(test.TestCase):
 
   def test_GaussianNoise(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(
           keras.layers.GaussianNoise,
           kwargs={'stddev': 1.},
           input_shape=(3, 2, 3))
 
   def test_GaussianDropout(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(
           keras.layers.GaussianDropout,
           kwargs={'rate': 0.5},
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index a7835bc0a2..cd26e04c39 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -36,7 +36,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -345,16 +345,16 @@ class BatchNormalization(Layer):
               aggregation=variable_scope.VariableAggregation.MEAN)
           return var
 
-        with distribute_lib.get_distribution_strategy().colocate_vars_with(
-            self.moving_mean):
+        with distribution_strategy_context.get_distribution_strategy(
+        ).colocate_vars_with(self.moving_mean):
           self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
           self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
         # We initialize renorm_stddev to 0, and maintain the (0-initialized)
         # renorm_stddev_weight. This allows us to (1) mix the average
         # stddev with the minibatch stddev early in training, and (2) compute
         # the unbiased average stddev by dividing renorm_stddev by the weight.
-        with distribute_lib.get_distribution_strategy().colocate_vars_with(
-            self.moving_variance):
+        with distribution_strategy_context.get_distribution_strategy(
+        ).colocate_vars_with(self.moving_variance):
           self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
           self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
                                                        ())
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index a97b4cac46..2844b84799 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 class NormalizationLayersTest(test.TestCase):
 
   def test_basic_batchnorm(self):
-    with self.test_session():
+    with self.cached_session():
       testing_utils.layer_test(
           keras.layers.BatchNormalization,
           kwargs={
@@ -54,7 +54,7 @@ class NormalizationLayersTest(test.TestCase):
           input_shape=(3, 3))
 
   def test_batchnorm_weights(self):
-    with self.test_session():
+    with self.cached_session():
       layer = keras.layers.BatchNormalization(scale=False, center=False)
       layer.build((None, 3, 4))
       self.assertEqual(len(layer.trainable_weights), 0)
@@ -66,7 +66,7 @@ class NormalizationLayersTest(test.TestCase):
       self.assertEqual(len(layer.weights), 4)
 
   def test_batchnorm_regularization(self):
-    with self.test_session():
+    with self.cached_session():
       layer = keras.layers.BatchNormalization(
           gamma_regularizer='l1', beta_regularizer='l1')
       layer.build((None, 3, 4))
@@ -79,7 +79,7 @@ class NormalizationLayersTest(test.TestCase):
       self.assertEqual(layer.beta.constraint, max_norm)
 
   def test_batchnorm_correctness(self):
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
       model.add(norm)
@@ -96,7 +96,7 @@ class NormalizationLayersTest(test.TestCase):
       np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
   def test_batchnorm_mixed_precision(self):
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
       model.add(norm)
@@ -133,7 +133,7 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
   def test_batchnorm_convnet_channel_last(self):
-    with self.test_session():
+    with self.cached_session():
       # keras.backend.set_learning_phase(True)
 
       model = keras.models.Sequential()
@@ -155,7 +155,7 @@ class NormalizationLayersTest(test.TestCase):
   def test_shared_batchnorm(self):
     """Test that a BN layer can be shared across different data streams.
     """
-    with self.test_session():
+    with self.cached_session():
       # Test single layer reuse
       bn = keras.layers.BatchNormalization()
       x1 = keras.layers.Input(shape=(10,))
@@ -187,7 +187,7 @@ class NormalizationLayersTest(test.TestCase):
       new_model.train_on_batch(x, x)
 
   def test_that_trainable_disables_updates(self):
-    with self.test_session():
+    with self.cached_session():
       val_a = np.random.random((10, 4))
       val_out = np.random.random((10, 4))
 
@@ -230,7 +230,7 @@ class NormalizationLayersTest(test.TestCase):
     Computes mean and std for current inputs then
     applies batch normalization using them.
     """
-    with self.test_session():
+    with self.cached_session():
       bn_mean = 0.5
       bn_std = 10.
       val_a = np.expand_dims(np.arange(10.), axis=1)
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 66c68e2085..04b3aecff8 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -33,7 +33,6 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -74,19 +73,27 @@ class StackedRNNCells(Layer):
                          '`state_size` attribute. '
                          'received cells:', cells)
     self.cells = cells
+    # reverse_state_order determines whether the state size will be in a reverse
+    # order of the cells' state. User might want to set this to True to keep the
+    # existing behavior. This is only useful when use RNN(return_state=True)
+    # since the state will be returned as the same order of state_size.
+    self.reverse_state_order = kwargs.pop('reverse_state_order', False)
+    if self.reverse_state_order:
+      logging.warning('reverse_state_order=True in StackedRNNCells will soon '
+                      'be deprecated. Please update the code to work with the '
+                      'natural order of states if you reply on the RNN states, '
+                      'eg RNN(return_state=True).')
     super(StackedRNNCells, self).__init__(**kwargs)
 
   @property
   def state_size(self):
-    # States are a flat list
-    # in reverse order of the cell stack.
-    # This allows to preserve the requirement
-    # `stack.state_size[0] == output_dim`.
-    # e.g. states of a 2-layer LSTM would be
-    # `[h2, c2, h1, c1]`
+    # States are a flat list of the individual cell state size.
+    # e.g. states of a 2-layer LSTM would be `[h1, c1, h2, c2]`.
     # (assuming one LSTM has states [h, c])
+    # In the case of reverse_state_order=True, the state_size will be
+    # [h2, c2, h1, c1].
     state_size = []
-    for cell in self.cells[::-1]:
+    for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
       if _is_multiple_state(cell.state_size):
         state_size += list(cell.state_size)
       else:
@@ -95,22 +102,40 @@ class StackedRNNCells(Layer):
 
   @property
   def output_size(self):
-    if hasattr(self.cells[-1], 'output_size'):
+    if getattr(self.cells[-1], 'output_size', None) is not None:
       return self.cells[-1].output_size
+    elif _is_multiple_state(self.cells[-1].state_size):
+      return self.cells[-1].state_size[0]
     else:
-      return self.state_size[0]
+      return self.cells[-1].state_size
+
+  def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
+    # The init state is flattened into a list because state_size is a flattened
+    # list.
+    initial_states = []
+    for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
+      get_initial_state_fn = getattr(cell, 'get_initial_state', None)
+      if get_initial_state_fn:
+        initial_states.append(get_initial_state_fn(
+            inputs=inputs, batch_size=batch_size, dtype=dtype))
+      else:
+        initial_states.append(_generate_zero_filled_state_for_cell(
+            cell, inputs, batch_size, dtype))
+
+    return nest.flatten(initial_states)
 
   def call(self, inputs, states, constants=None, **kwargs):
     # Recover per-cell states.
     nested_states = []
-    for cell in self.cells[::-1]:
+    for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
       if _is_multiple_state(cell.state_size):
         nested_states.append(states[:len(cell.state_size)])
         states = states[len(cell.state_size):]
       else:
         nested_states.append([states[0]])
         states = states[1:]
-    nested_states = nested_states[::-1]
+    if self.reverse_state_order:
+      nested_states = nested_states[::-1]
 
     # Call the cells in order and store the returned states.
     new_nested_states = []
@@ -124,11 +149,12 @@ class StackedRNNCells(Layer):
       new_nested_states.append(states)
 
     # Format the new states as a flat list
-    # in reverse cell order.
-    states = []
-    for cell_states in new_nested_states[::-1]:
-      states += cell_states
-    return inputs, states
+    new_states = []
+    if self.reverse_state_order:
+      new_nested_states = new_nested_states[::-1]
+    for cell_states in new_nested_states:
+      new_states += cell_states
+    return inputs, new_states
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -141,7 +167,9 @@ class StackedRNNCells(Layer):
           cell.build([input_shape] + constants_shape)
         else:
           cell.build(input_shape)
-      if _is_multiple_state(cell.state_size):
+      if getattr(cell, 'output_size', None) is not None:
+        output_dim = cell.output_size
+      elif _is_multiple_state(cell.state_size):
         output_dim = cell.state_size[0]
       else:
         output_dim = cell.state_size
@@ -261,6 +289,22 @@ class RNN(Layer):
               compatible reason, if this attribute is not available for the
               cell, the value will be inferred by the first element of the
               `state_size`.
+          - a `get_initial_state(inputs=None, batch_size=None, dtype=None)`
+              method that creates a tensor meant to be fed to `call()` as the
+              initial state, if user didn't specify any initial state via other
+              means. The returned initial state should be in shape of
+              [batch, cell.state_size]. Cell might choose to create zero filled
+              tensor, or with other values based on the cell implementations.
+              `inputs` is the input tensor to the RNN layer, which should
+              contain the batch size as its shape[0], and also dtype. Note that
+              the shape[0] might be None during the graph construction. Either
+              the `inputs` or the pair of `batch` and `dtype `are provided.
+              `batch` is a scalar tensor that represent the batch size
+              of the input. `dtype` is `tf.dtype` that represent the dtype of
+              the input.
+              For backward compatible reason, if this method is not implemented
+              by the cell, RNN layer will create a zero filled tensors with the
+              size of [batch, cell.state_size].
           In the case that `cell` is a list of RNN cell instances, the cells
           will be stacked on after the other in the RNN, implementing an
           efficient stacked RNN.
@@ -453,7 +497,7 @@ class RNN(Layer):
     else:
       state_size = [self.cell.state_size]
 
-    if hasattr(self.cell, 'output_size'):
+    if getattr(self.cell, 'output_size', None) is not None:
       output_dim = tensor_shape.as_shape(self.cell.output_size).as_list()
     else:
       # Note that state_size[0] could be a tensor_shape or int.
@@ -553,26 +597,18 @@ class RNN(Layer):
       raise validation_error
 
   def get_initial_state(self, inputs):
-    # build an all-zero tensor of shape (batch, cell.state_size)
-    initial_state = array_ops.zeros_like(inputs)
-    # shape of initial_state = (batch, timesteps, ...)
-    initial_state = math_ops.reduce_sum(
-        initial_state, axis=list(range(1, len(inputs.shape))))
-    # shape of initial_state = (batch,)
-    if _is_multiple_state(self.cell.state_size):
-      states = []
-      for dims in self.cell.state_size:
-        state = initial_state
-        flat_dims = tensor_shape.as_shape(dims).as_list()
-        # reshape the state to (batch, 1, 1, ....) and then expand each state.
-        state = array_ops.reshape(state, [-1,] + [1] * len(flat_dims))
-        states.append(K.tile(state, [1] + flat_dims))
-      return states
+    get_initial_state_fn = getattr(self.cell, 'get_initial_state', None)
+    if get_initial_state_fn:
+      init_state = get_initial_state_fn(
+          inputs=inputs, batch_size=None, dtype=None)
     else:
-      flat_dims = tensor_shape.as_shape(self.cell.state_size).as_list()
-      initial_state = array_ops.reshape(
-          initial_state, [-1] + [1] * len(flat_dims))
-      return [K.tile(initial_state, [1] + flat_dims)]
+      init_state = _generate_zero_filled_state(
+          array_ops.shape(inputs)[0], self.cell.state_size, inputs.dtype)
+    # Keras RNN expect the states in a list, even if it's a single state tensor.
+    if not nest.is_sequence(init_state):
+      init_state = [init_state]
+    # Force the state to be a list in case it is a namedtuple eg LSTMStateTuple.
+    return list(init_state)
 
   def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
     inputs, initial_state, constants = _standardize_args(inputs,
@@ -636,6 +672,14 @@ class RNN(Layer):
     # note that the .build() method of subclasses MUST define
     # self.input_spec and self.state_spec with complete input shapes.
     if isinstance(inputs, list):
+      # get initial_state from full input spec
+      # as they could be copied to multiple GPU.
+      if self._num_constants is None:
+        initial_state = inputs[1:]
+      else:
+        initial_state = inputs[1:-self._num_constants]
+      if len(initial_state) == 0:
+        initial_state = None
       inputs = inputs[0]
     if initial_state is not None:
       pass
@@ -670,6 +714,8 @@ class RNN(Layer):
     if generic_utils.has_arg(self.cell.call, 'training'):
       kwargs['training'] = training
 
+    # TF RNN cells expect single tensor as state instead of list wrapped tensor.
+    is_tf_rnn_cell = getattr(self.cell, '_is_tf_rnn_cell', None) is not None
     if constants:
       if not generic_utils.has_arg(self.cell.call, 'constants'):
         raise ValueError('RNN cell does not support constants')
@@ -677,11 +723,21 @@ class RNN(Layer):
       def step(inputs, states):
         constants = states[-self._num_constants:]  # pylint: disable=invalid-unary-operand-type
         states = states[:-self._num_constants]  # pylint: disable=invalid-unary-operand-type
-        return self.cell.call(inputs, states, constants=constants, **kwargs)
+
+        states = states[0] if len(states) == 1 and is_tf_rnn_cell else states
+        output, new_states = self.cell.call(
+            inputs, states, constants=constants, **kwargs)
+        if not nest.is_sequence(new_states):
+          new_states = [new_states]
+        return output, new_states
     else:
 
       def step(inputs, states):
-        return self.cell.call(inputs, states, **kwargs)
+        states = states[0] if len(states) == 1 and is_tf_rnn_cell else states
+        output, new_states = self.cell.call(inputs, states, **kwargs)
+        if not nest.is_sequence(new_states):
+          new_states = [new_states]
+        return output, new_states
 
     last_output, outputs, states = K.rnn(
         step,
@@ -974,6 +1030,9 @@ class SimpleRNNCell(Layer):
         output._uses_learning_phase = True
     return output, [output]
 
+  def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
+    return _generate_zero_filled_state_for_cell(self, inputs, batch_size, dtype)
+
   def get_config(self):
     config = {
         'units':
@@ -1505,6 +1564,9 @@ class GRUCell(Layer):
     base_config = super(GRUCell, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
+    return _generate_zero_filled_state_for_cell(self, inputs, batch_size, dtype)
+
 
 @tf_export('keras.layers.GRU')
 class GRU(RNN):
@@ -2030,6 +2092,9 @@ class LSTMCell(Layer):
     base_config = super(LSTMCell, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
+    return _generate_zero_filled_state_for_cell(self, inputs, batch_size, dtype)
+
 
 @tf_export('keras.layers.LSTM')
 class LSTM(RNN):
@@ -2342,3 +2407,30 @@ def _is_multiple_state(state_size):
   """Check whether the state_size contains multiple states."""
   return (hasattr(state_size, '__len__') and
           not isinstance(state_size, tensor_shape.TensorShape))
+
+
+def _generate_zero_filled_state_for_cell(cell, inputs, batch_size, dtype):
+  if inputs is not None:
+    batch_size = array_ops.shape(inputs)[0]
+    dtype = inputs.dtype
+  return _generate_zero_filled_state(batch_size, cell.state_size, dtype)
+
+
+def _generate_zero_filled_state(batch_size_tensor, state_size, dtype):
+  """Generate a zero filled tensor with shape [batch_size, state_size]."""
+  if None in [batch_size_tensor, dtype]:
+    raise ValueError(
+        'batch_size and dtype cannot be None while constructing initial state: '
+        'batch_size={}, dtype={}'.format(batch_size_tensor, dtype))
+  if _is_multiple_state(state_size):
+    states = []
+    for dims in state_size:
+      flat_dims = tensor_shape.as_shape(dims).as_list()
+      init_state_size = [batch_size_tensor] + flat_dims
+      init_state = array_ops.zeros(init_state_size, dtype=dtype)
+      states.append(init_state)
+    return states
+  else:
+    flat_dims = tensor_shape.as_shape(state_size).as_list()
+    init_state_size = [batch_size_tensor] + flat_dims
+    return array_ops.zeros(init_state_size, dtype=dtype)
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index 13bd070528..a3861e44d5 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -50,7 +50,7 @@ class RNNTest(test.TestCase):
         output = keras.backend.dot(inputs, self.kernel) + prev_output
         return output, [output]
 
-    with self.test_session():
+    with self.cached_session():
       # Basic test case.
       cell = MinimalRNNCell(32, 5)
       x = keras.Input((None, 5))
@@ -88,7 +88,7 @@ class RNNTest(test.TestCase):
         output -= prev_output_2
         return output, [output * 2, output * 3]
 
-    with self.test_session():
+    with self.cached_session():
       # Basic test case.
       cell = MinimalRNNCell(32, 5)
       x = keras.Input((None, 5))
@@ -103,7 +103,8 @@ class RNNTest(test.TestCase):
                MinimalRNNCell(16, 8),
                MinimalRNNCell(32, 16)]
       layer = keras.layers.RNN(cells)
-      assert layer.cell.state_size == (32, 32, 16, 16, 8, 8)
+      self.assertEqual(layer.cell.state_size, (8, 8, 16, 16, 32, 32))
+      self.assertEqual(layer.cell.output_size, 32)
       y = layer(x)
       model = keras.models.Model(x, y)
       model.compile(optimizer='rmsprop', loss='mse')
@@ -139,7 +140,7 @@ class RNNTest(test.TestCase):
         base_config = super(MinimalRNNCell, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
-    with self.test_session():
+    with self.cached_session():
       # Test basic case.
       x = keras.Input((None, 5))
       cell = MinimalRNNCell(32)
@@ -228,7 +229,7 @@ class RNNTest(test.TestCase):
         base_config = super(RNNCellWithConstants, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
-    with self.test_session():
+    with self.cached_session():
       # Test basic case.
       x = keras.Input((None, 5))
       c = keras.Input((3,))
@@ -243,7 +244,7 @@ class RNNTest(test.TestCase):
           np.zeros((6, 32))
       )
 
-    with self.test_session():
+    with self.cached_session():
       # Test basic case serialization.
       x_np = np.random.random((6, 5, 5))
       c_np = np.random.random((6, 3))
@@ -259,7 +260,7 @@ class RNNTest(test.TestCase):
       y_np_2 = model.predict([x_np, c_np])
       self.assertAllClose(y_np, y_np_2, atol=1e-4)
 
-    with self.test_session():
+    with self.cached_session():
       # test flat list inputs.
       with keras.utils.CustomObjectScope(custom_objects):
         layer = keras.layers.RNN.from_config(config.copy())
@@ -269,7 +270,7 @@ class RNNTest(test.TestCase):
       y_np_3 = model.predict([x_np, c_np])
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
-    with self.test_session():
+    with self.cached_session():
       # Test stacking.
       cells = [keras.layers.recurrent.GRUCell(8),
                RNNCellWithConstants(12),
@@ -283,7 +284,7 @@ class RNNTest(test.TestCase):
           np.zeros((6, 32))
       )
 
-    with self.test_session():
+    with self.cached_session():
       # Test GRUCell reset_after property.
       x = keras.Input((None, 5))
       c = keras.Input((3,))
@@ -297,7 +298,7 @@ class RNNTest(test.TestCase):
           np.zeros((6, 32))
       )
 
-    with self.test_session():
+    with self.cached_session():
       # Test stacked RNN serialization
       x_np = np.random.random((6, 5, 5))
       c_np = np.random.random((6, 3))
@@ -355,7 +356,7 @@ class RNNTest(test.TestCase):
         base_config = super(RNNCellWithConstants, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
-    with self.test_session():
+    with self.cached_session():
       # Test basic case.
       x = keras.Input((None, 5))
       c = keras.Input((3,))
@@ -370,7 +371,7 @@ class RNNTest(test.TestCase):
           np.zeros((6, 32))
       )
 
-    with self.test_session():
+    with self.cached_session():
       # Test basic case serialization.
       x_np = np.random.random((6, 5, 5))
       s_np = np.random.random((6, 32))
@@ -392,7 +393,7 @@ class RNNTest(test.TestCase):
       with self.assertRaises(AssertionError):
         self.assertAllClose(y_np, y_np_2_different_s, atol=1e-4)
 
-    with self.test_session():
+    with self.cached_session():
       # test flat list inputs
       with keras.utils.CustomObjectScope(custom_objects):
         layer = keras.layers.RNN.from_config(config.copy())
@@ -467,7 +468,7 @@ class RNNTest(test.TestCase):
     timesteps = 2
     num_samples = 2
 
-    with self.test_session():
+    with self.cached_session():
       input1 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
       layer = layer_class(units,
                           return_state=True,
@@ -487,7 +488,7 @@ class RNNTest(test.TestCase):
     for cell_class in [keras.layers.SimpleRNNCell,
                        keras.layers.GRUCell,
                        keras.layers.LSTMCell]:
-      with self.test_session():
+      with self.cached_session():
         # Test basic case.
         x = keras.Input((None, 5))
         cell = cell_class(32)
@@ -534,7 +535,7 @@ class RNNTest(test.TestCase):
              keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1)]
     layer = keras.layers.RNN(cells)
 
-    with self.test_session():
+    with self.cached_session():
       x = keras.Input((None, 5))
       y = layer(x)
       model = keras.models.Model(x, y)
@@ -551,6 +552,21 @@ class RNNTest(test.TestCase):
     layer = keras.layers.RNN(cells, return_state=True, return_sequences=True)
     output_shape = layer.compute_output_shape((None, timesteps, embedding_dim))
     expected_output_shape = [(None, timesteps, 6),
+                             (None, 3),
+                             (None, 3),
+                             (None, 6),
+                             (None, 6)]
+    self.assertEqual(
+        [tuple(o.as_list()) for o in output_shape],
+        expected_output_shape)
+
+    # Test reverse_state_order = True for stacked cell.
+    stacked_cell = keras.layers.StackedRNNCells(
+        cells, reverse_state_order=True)
+    layer = keras.layers.RNN(
+        stacked_cell, return_state=True, return_sequences=True)
+    output_shape = layer.compute_output_shape((None, timesteps, embedding_dim))
+    expected_output_shape = [(None, timesteps, 6),
                              (None, 6),
                              (None, 6),
                              (None, 3),
@@ -561,7 +577,7 @@ class RNNTest(test.TestCase):
 
   def test_checkpointable_dependencies(self):
     rnn = keras.layers.SimpleRNN
-    with self.test_session():
+    with self.cached_session():
       x = np.random.random((2, 2, 2))
       y = np.random.random((2, 2))
       model = keras.models.Sequential()
@@ -576,7 +592,7 @@ class RNNTest(test.TestCase):
         self.assertIn(v, checkpointed_objects)
 
   def test_high_dimension_RNN(self):
-    with self.test_session():
+    with self.cached_session():
       # Basic test case.
       unit_a = 10
       unit_b = 20
@@ -626,7 +642,7 @@ class RNNTest(test.TestCase):
     batch = 32
     time_step = 4
 
-    with self.test_session():
+    with self.cached_session():
       # Basic test case.
       cell = Minimal2DRNNCell(unit_a, unit_b)
       x = keras.Input((None, input_a, input_b))
@@ -642,7 +658,7 @@ class RNNTest(test.TestCase):
       ], np.zeros((batch, unit_a, unit_b)))
       self.assertEqual(model.output_shape, (None, unit_a, unit_b))
 
-    with self.test_session():
+    with self.cached_session():
       # Bad init state shape.
       bad_shape_a = unit_a * 2
       bad_shape_b = unit_b * 2
@@ -655,7 +671,7 @@ class RNNTest(test.TestCase):
         layer(x, initial_state=s)
 
   def test_inconsistent_output_state_size(self):
-    with self.test_session():
+    with self.cached_session():
       batch = 32
       time_step = 4
       state_size = 5
@@ -678,6 +694,23 @@ class RNNTest(test.TestCase):
           np.zeros((batch, input_size)))
       self.assertEqual(model.output_shape, (None, input_size))
 
+  def test_get_initial_state(self):
+    cell = keras.layers.SimpleRNNCell(5)
+    with self.assertRaisesRegexp(ValueError,
+                                 'batch_size and dtype cannot be None'):
+      cell.get_initial_state(None, None, None)
+
+    inputs = keras.Input((None, 2, 10))
+    initial_state = cell.get_initial_state(inputs, None, None)
+    self.assertEqual(initial_state.shape.as_list(), [None, 5])
+    self.assertEqual(initial_state.dtype, inputs.dtype)
+
+    batch = array_ops.shape(inputs)[0]
+    dtype = inputs.dtype
+    initial_state = cell.get_initial_state(None, batch, dtype)
+    self.assertEqual(initial_state.shape.as_list(), [None, 5])
+    self.assertEqual(initial_state.dtype, inputs.dtype)
+
 
 class Minimal2DRNNCell(keras.layers.Layer):
   """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell.
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index 9b8d5fc5cc..a1933c11b0 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -545,11 +545,27 @@ class Bidirectional(Wrapper):
 
     if initial_state is not None and generic_utils.has_arg(
         self.layer.call, 'initial_state'):
-      forward_state = initial_state[:len(initial_state) // 2]
-      backward_state = initial_state[len(initial_state) // 2:]
-      y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs)
-      y_rev = self.backward_layer.call(
-          inputs, initial_state=backward_state, **kwargs)
+      forward_inputs = [inputs[0]]
+      backward_inputs = [inputs[0]]
+      pivot = len(initial_state) // 2 + 1
+      # add forward initial state
+      forward_state = inputs[1:pivot]
+      forward_inputs += forward_state
+      if self._num_constants is None:
+        # add backward initial state
+        backward_state = inputs[pivot:]
+        backward_inputs += backward_state
+      else:
+        # add backward initial state
+        backward_state = inputs[pivot:-self._num_constants]
+        backward_inputs += backward_state
+        # add constants for forward and backward layers
+        forward_inputs += inputs[-self._num_constants:]
+        backward_inputs += inputs[-self._num_constants:]
+      y = self.forward_layer.call(forward_inputs,
+                                  initial_state=forward_state, **kwargs)
+      y_rev = self.backward_layer.call(backward_inputs,
+                                       initial_state=backward_state, **kwargs)
     else:
       y = self.forward_layer.call(inputs, **kwargs)
       y_rev = self.backward_layer.call(inputs, **kwargs)
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 0cd774ef0f..965960917c 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -113,7 +113,7 @@ class TimeDistributedTest(test.TestCase):
       keras.layers.TimeDistributed(x)
 
   def test_timedistributed_conv2d(self):
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       model.add(
           keras.layers.TimeDistributed(
@@ -128,7 +128,7 @@ class TimeDistributedTest(test.TestCase):
       model.summary()
 
   def test_timedistributed_stacked(self):
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       model.add(
           keras.layers.TimeDistributed(
@@ -144,7 +144,7 @@ class TimeDistributedTest(test.TestCase):
           batch_size=10)
 
   def test_regularizers(self):
-    with self.test_session():
+    with self.cached_session():
       model = keras.models.Sequential()
       model.add(
           keras.layers.TimeDistributed(
@@ -155,7 +155,7 @@ class TimeDistributedTest(test.TestCase):
       self.assertEqual(len(model.losses), 1)
 
   def test_TimeDistributed_learning_phase(self):
-    with self.test_session():
+    with self.cached_session():
       # test layers that need learning_phase to be set
       np.random.seed(1234)
       x = keras.layers.Input(shape=(3, 2))
@@ -166,7 +166,7 @@ class TimeDistributedTest(test.TestCase):
       self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
 
   def test_TimeDistributed_batchnorm(self):
-    with self.test_session():
+    with self.cached_session():
       # test that wrapped BN updates still work.
       model = keras.models.Sequential()
       model.add(keras.layers.TimeDistributed(
@@ -202,7 +202,7 @@ class TimeDistributedTest(test.TestCase):
     assert len(layer.trainable_weights) == 2
 
   def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(self):
-    with self.test_session():
+    with self.cached_session():
       # test with unspecified shape and Embeddings with mask_zero
       model = keras.models.Sequential()
       model.add(keras.layers.TimeDistributed(
@@ -234,7 +234,7 @@ class TimeDistributedTest(test.TestCase):
       self.assertIs(mask_outputs[-1], None)  # final layer
 
   def test_TimeDistributed_with_masking_layer(self):
-    with self.test_session():
+    with self.cached_session():
       # test with Masking layer
       model = keras.models.Sequential()
       model.add(keras.layers.TimeDistributed(keras.layers.Masking(
@@ -266,7 +266,7 @@ class BidirectionalTest(test.TestCase):
     dim = 2
     timesteps = 2
     output_dim = 2
-    with self.test_session():
+    with self.cached_session():
       for mode in ['sum', 'concat', 'ave', 'mul']:
         x = np.random.random((samples, timesteps, dim))
         target_dim = 2 * output_dim if mode == 'concat' else output_dim
@@ -310,7 +310,7 @@ class BidirectionalTest(test.TestCase):
     dim = 2
     timesteps = 2
     output_dim = 2
-    with self.test_session():
+    with self.cached_session():
       x = np.random.random((samples, timesteps, dim))
       model = keras.models.Sequential()
       model.add(
@@ -331,7 +331,7 @@ class BidirectionalTest(test.TestCase):
     output_dim = 2
     mode = 'sum'
 
-    with self.test_session():
+    with self.cached_session():
       x = np.random.random((samples, timesteps, dim))
       target_dim = 2 * output_dim if mode == 'concat' else output_dim
       y = np.random.random((samples, target_dim))
@@ -363,7 +363,7 @@ class BidirectionalTest(test.TestCase):
     output_dim = 2
     mode = 'sum'
 
-    with self.test_session():
+    with self.cached_session():
       x = np.random.random((samples, timesteps, dim))
       target_dim = 2 * output_dim if mode == 'concat' else output_dim
       y = np.random.random((samples, target_dim))
@@ -383,7 +383,7 @@ class BidirectionalTest(test.TestCase):
     units = 3
     x = [np.random.rand(samples, timesteps, dim)]
 
-    with self.test_session():
+    with self.cached_session():
       for merge_mode in ['sum', 'mul', 'ave', 'concat', None]:
         if merge_mode == 'sum':
           merge_func = lambda y, y_rev: y + y_rev
@@ -447,7 +447,7 @@ class BidirectionalTest(test.TestCase):
     merge_mode = 'sum'
     x = [np.random.rand(samples, timesteps, dim)]
 
-    with self.test_session():
+    with self.cached_session():
       inputs = keras.Input((timesteps, dim))
       wrapped = keras.layers.Bidirectional(
           rnn(units, dropout=0.2, recurrent_dropout=0.2), merge_mode=merge_mode)
@@ -474,7 +474,7 @@ class BidirectionalTest(test.TestCase):
     timesteps = 3
     units = 3
 
-    with self.test_session():
+    with self.cached_session():
       input1 = keras.layers.Input((timesteps, dim))
       layer = keras.layers.Bidirectional(
           rnn(units, return_state=True, return_sequences=True))
@@ -498,7 +498,7 @@ class BidirectionalTest(test.TestCase):
 
   def test_Bidirectional_trainable(self):
     # test layers that need learning_phase to be set
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
       layer = keras.layers.Bidirectional(keras.layers.SimpleRNN(3))
       _ = layer(x)
@@ -509,7 +509,7 @@ class BidirectionalTest(test.TestCase):
       assert len(layer.trainable_weights) == 6
 
   def test_Bidirectional_updates(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
       x_reachable_update = x * x
       layer = keras.layers.Bidirectional(keras.layers.SimpleRNN(3))
@@ -526,7 +526,7 @@ class BidirectionalTest(test.TestCase):
       assert len(layer.get_updates_for(x)) == 2
 
   def test_Bidirectional_losses(self):
-    with self.test_session():
+    with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
       x_reachable_loss = x * x
       layer = keras.layers.Bidirectional(
@@ -545,7 +545,7 @@ class BidirectionalTest(test.TestCase):
       assert len(layer.get_losses_for(x)) == 2
 
   def test_Bidirectional_with_constants(self):
-    with self.test_session():
+    with self.cached_session():
       # Test basic case.
       x = keras.Input((5, 5))
       c = keras.Input((3,))
@@ -586,7 +586,7 @@ class BidirectionalTest(test.TestCase):
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
   def test_Bidirectional_with_constants_layer_passing_initial_state(self):
-    with self.test_session():
+    with self.cached_session():
       # Test basic case.
       x = keras.Input((5, 5))
       c = keras.Input((3,))
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index 3098a6d071..c7015270ac 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -63,7 +63,7 @@ class _MSEMAELoss(object):
 class KerasLossesTest(test.TestCase):
 
   def test_objective_shapes_3d(self):
-    with self.test_session():
+    with self.cached_session():
       y_a = keras.backend.variable(np.random.random((5, 6, 7)))
       y_b = keras.backend.variable(np.random.random((5, 6, 7)))
       for obj in ALL_LOSSES:
@@ -71,7 +71,7 @@ class KerasLossesTest(test.TestCase):
         self.assertListEqual(objective_output.get_shape().as_list(), [5, 6])
 
   def test_objective_shapes_2d(self):
-    with self.test_session():
+    with self.cached_session():
       y_a = keras.backend.variable(np.random.random((6, 7)))
       y_b = keras.backend.variable(np.random.random((6, 7)))
       for obj in ALL_LOSSES:
@@ -79,7 +79,7 @@ class KerasLossesTest(test.TestCase):
         self.assertListEqual(objective_output.get_shape().as_list(), [6,])
 
   def test_cce_one_hot(self):
-    with self.test_session():
+    with self.cached_session():
       y_a = keras.backend.variable(np.random.randint(0, 7, (5, 6)))
       y_b = keras.backend.variable(np.random.random((5, 6, 7)))
       objective_output = keras.losses.sparse_categorical_crossentropy(y_a, y_b)
@@ -119,7 +119,7 @@ class KerasLossesTest(test.TestCase):
     self.addCleanup(shutil.rmtree, tmpdir)
     model_filename = os.path.join(tmpdir, 'custom_loss.h5')
 
-    with self.test_session():
+    with self.cached_session():
       with keras.utils.custom_object_scope({'_MSEMAELoss': _MSEMAELoss}):
         loss = _MSEMAELoss(0.3)
         inputs = keras.layers.Input((2,))
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 2dde9ee41f..44ae6c5b1f 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -55,9 +55,10 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
+from tensorflow.tools.docs import doc_controls
 
 
 def check_is_tensor_or_operation(x, name):
@@ -111,7 +112,7 @@ def result_wrapper(result_fn):
 
   def decorated(metric_obj, *args):
     """Decorated function with merge_call."""
-    tower_context = distribute_lib.get_tower_context()
+    tower_context = distribution_strategy_context.get_tower_context()
     if tower_context is None:  # if in cross tower context already
       result_t = result_fn(*args)
     else:
@@ -241,7 +242,7 @@ class Metric(Layer):
 
   ```python
   m = SomeMetric(...)
-  init_op = tf.global_variables_initializer()  # Initialize variables
+  init_op = tf.variables_initializer(m.variables)  # Initialize variables
   with tf.Session() as sess:
     sess.run(init_op)
     for input in ...:
@@ -388,6 +389,7 @@ class Metric(Layer):
     return cls(**config)
 
   ### For use by subclasses ###
+  @doc_controls.for_subclass_implementers
   def add_weight(self,
                  name,
                  shape=(),
@@ -401,6 +403,7 @@ class Metric(Layer):
         dtype=self._dtype,
         trainable=False,
         initializer=initializer,
+        collections=[],
         synchronization=synchronization,
         aggregation=aggregation)
 
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 2ac74219d4..0bc95a3952 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -40,7 +40,7 @@ from tensorflow.python.training.checkpointable import util as checkpointable_uti
 class KerasMetricsTest(test.TestCase):
 
   def test_metrics(self):
-    with self.test_session():
+    with self.cached_session():
       y_a = K.variable(np.random.random((6, 7)))
       y_b = K.variable(np.random.random((6, 7)))
       for metric in [metrics.binary_accuracy, metrics.categorical_accuracy]:
@@ -48,14 +48,14 @@ class KerasMetricsTest(test.TestCase):
         self.assertEqual(K.eval(output).shape, (6,))
 
   def test_sparse_categorical_accuracy(self):
-    with self.test_session():
+    with self.cached_session():
       metric = metrics.sparse_categorical_accuracy
       y_a = K.variable(np.random.randint(0, 7, (6,)))
       y_b = K.variable(np.random.random((6, 7)))
       self.assertEqual(K.eval(metric(y_a, y_b)).shape, (6,))
 
   def test_sparse_top_k_categorical_accuracy(self):
-    with self.test_session():
+    with self.cached_session():
       y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
       y_true = K.variable(np.array([[1], [0]]))
       result = K.eval(
@@ -69,7 +69,7 @@ class KerasMetricsTest(test.TestCase):
       self.assertEqual(result, 0.)
 
   def test_top_k_categorical_accuracy(self):
-    with self.test_session():
+    with self.cached_session():
       y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
       y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]]))
       result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=3))
@@ -80,7 +80,7 @@ class KerasMetricsTest(test.TestCase):
       self.assertEqual(result, 0.)
 
   def test_stateful_metrics(self):
-    with self.test_session():
+    with self.cached_session():
       np.random.seed(1334)
 
       class BinaryTruePositives(layers.Layer):
@@ -198,7 +198,7 @@ class KerasMetricsTest(test.TestCase):
     self.assertTrue(m.stateful)
     self.assertEqual(m.dtype, dtypes.float32)
     self.assertEqual(len(m.variables), 2)
-    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(variables.variables_initializer(m.variables))
 
     # check initial state
     self.assertEqual(self.evaluate(m.total), 0)
@@ -225,7 +225,7 @@ class KerasMetricsTest(test.TestCase):
   def test_mean_with_sample_weight(self):
     m = metrics.Mean(dtype=dtypes.float64)
     self.assertEqual(m.dtype, dtypes.float64)
-    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(variables.variables_initializer(m.variables))
 
     # check scalar weight
     result_t = m(100, sample_weight=0.5)
@@ -266,11 +266,11 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(np.round(self.evaluate(m.count), decimals=2), 5.6)
 
   def test_mean_graph_with_placeholder(self):
-    with context.graph_mode(), self.test_session() as sess:
+    with context.graph_mode(), self.cached_session() as sess:
       m = metrics.Mean()
       v = array_ops.placeholder(dtypes.float32)
       w = array_ops.placeholder(dtypes.float32)
-      sess.run(variables.global_variables_initializer())
+      sess.run(variables.variables_initializer(m.variables))
 
       # check __call__()
       result_t = m(v, sample_weight=w)
@@ -291,7 +291,7 @@ class KerasMetricsTest(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
     m = metrics.Mean()
     checkpoint = checkpointable_utils.Checkpoint(mean=m)
-    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(variables.variables_initializer(m.variables))
 
     # update state
     self.evaluate(m(100.))
@@ -325,7 +325,7 @@ class KerasMetricsTest(test.TestCase):
     self.assertTrue(acc_obj.stateful)
     self.assertEqual(len(acc_obj.variables), 2)
     self.assertEqual(acc_obj.dtype, dtypes.float32)
-    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
 
     # verify that correct value is returned
     update_op = acc_obj.update_state([[1], [0]], [[1], [0]])
@@ -357,7 +357,7 @@ class KerasMetricsTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy_threshold(self):
     acc_obj = metrics.BinaryAccuracy(threshold=0.7)
-    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
     result_t = acc_obj([[1], [1], [0], [0]], [[0.9], [0.6], [0.4], [0.8]])
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.5, 2)
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 0bd6620220..39b6042597 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -20,13 +20,20 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.utils import generic_utils
-
+from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.util.tf_export import tf_export
 
 # API entries importable from `keras.models`:
 Model = training.Model  # pylint: disable=invalid-name
@@ -220,6 +227,7 @@ def _clone_sequential_model(model, input_tensors=None):
     return Sequential(layers=[input_layer] + layers, name=model.name)
 
 
+@tf_export('keras.models.clone_model')
 def clone_model(model, input_tensors=None):
   """Clone any `Model` instance.
 
@@ -246,3 +254,216 @@ def clone_model(model, input_tensors=None):
     return _clone_sequential_model(model, input_tensors=input_tensors)
   else:
     return _clone_functional_model(model, input_tensors=input_tensors)
+
+
+# "Clone" a subclassed model by reseting all of the attributes.
+
+
+def _in_place_subclassed_model_reset(model):
+  """Substitute for model cloning that works for subclassed models.
+
+  Subclassed models cannot be cloned because their topology is not serializable.
+  To "instantiate" an identical model in a new TF graph, we reuse the original
+  model object, but we clear its state.
+
+  After calling this function on a model instance, you can use the model
+  instance as if it were a model clone (in particular you can use it in a new
+  graph).
+
+  This method clears the state of the input model. It is thus destructive.
+  However the original state can be restored fully by calling
+  `_in_place_subclassed_model_state_restoration`.
+
+  Args:
+    model: Instance of a Keras model created via subclassing.
+
+  Raises:
+    ValueError: In case the model uses a subclassed model as inner layer.
+  """
+  assert not model._is_graph_network  # Only makes sense for subclassed networks
+  # Retrieve all layers tracked by the model as well as their attribute names
+  attributes_cache = {}
+  for name in dir(model):
+    try:
+      value = getattr(model, name)
+    except (AttributeError, ValueError, TypeError):
+      continue
+    if isinstance(value, Layer):
+      attributes_cache[name] = value
+      assert value in model._layers
+    elif isinstance(value, (list, tuple)) and name not in ('layers', '_layers'):
+      # Handle case: list/tuple of layers (also tracked by the Network API).
+      if value and all(isinstance(val, Layer) for val in value):
+        raise ValueError('We do not support the use of list-of-layers '
+                         'attributes in subclassed models used with '
+                         '`model_to_estimator` at this time. Found list '
+                         'model: %s' % name)
+
+  # Replace layers on the model with fresh layers
+  layers_to_names = {value: key for key, value in attributes_cache.items()}
+  original_layers = model._layers[:]
+  model._layers = data_structures.NoDependency([])
+  for layer in original_layers:  # We preserve layer order.
+    config = layer.get_config()
+    # This will not work for nested subclassed models used as layers.
+    # This would be theoretically possible to support, but would add complexity.
+    # Only do it if users complain.
+    if isinstance(layer, Network) and not layer._is_graph_network:
+      raise ValueError('We do not support the use of nested subclassed models '
+                       'in `model_to_estimator` at this time. Found nested '
+                       'model: %s' % layer)
+    fresh_layer = layer.__class__.from_config(config)
+    name = layers_to_names[layer]
+    setattr(model, name, fresh_layer)
+
+  # Cache original model build attributes (in addition to layers)
+  if (not hasattr(model, '_original_attributes_cache') or
+      model._original_attributes_cache is None):
+    if model.built:
+      attributes_to_cache = [
+          'inputs',
+          'outputs',
+          '_feed_outputs',
+          '_feed_output_names',
+          '_feed_output_shapes',
+          '_feed_loss_fns',
+          'loss_weights_list',
+          'targets',
+          '_feed_targets',
+          'sample_weight_modes',
+          'weighted_metrics',
+          'metrics_names',
+          'metrics_tensors',
+          'metrics_updates',
+          'stateful_metric_names',
+          'total_loss',
+          'sample_weights',
+          '_feed_sample_weights',
+          'train_function',
+          'test_function',
+          'predict_function',
+          '_collected_trainable_weights',
+          '_feed_inputs',
+          '_feed_input_names',
+          '_feed_input_shapes',
+          'optimizer',
+      ]
+      for name in attributes_to_cache:
+        attributes_cache[name] = getattr(model, name)
+  model._original_attributes_cache = data_structures.NoDependency(
+      attributes_cache)
+  # Reset built state
+  model.built = False
+  model.inputs = None
+  model.outputs = None
+
+
+def in_place_subclassed_model_state_restoration(model):
+  """Restores the original state of a model after it was "reset".
+
+  This undoes this action of `_in_place_subclassed_model_reset`, which is called
+  in `clone_and_build_model` if `in_place_reset` is set to True.
+
+  Args:
+    model: Instance of a Keras model created via subclassing, on which
+      `_in_place_subclassed_model_reset` was previously called.
+  """
+  assert not model._is_graph_network
+  # Restore layers and build attributes
+  if (hasattr(model, '_original_attributes_cache') and
+      model._original_attributes_cache is not None):
+    # Models have sticky attribute assignment, so we want to be careful to add
+    # back the previous attributes and track Layers by their original names
+    # without adding dependencies on "utility" attributes which Models exempt
+    # when they're constructed.
+    model._layers = data_structures.NoDependency([])
+    for name, value in model._original_attributes_cache.items():
+      if not isinstance(value, checkpointable.CheckpointableBase):
+        # If this value is not already checkpointable, it's probably that way
+        # for a reason; we don't want to start tracking data structures that the
+        # original Model didn't.
+        value = data_structures.NoDependency(value)
+      setattr(model, name, value)
+    model._original_attributes_cache = None
+  else:
+    # Restore to the state of a never-called model.
+    model.built = False
+    model.inputs = None
+    model.outputs = None
+
+
+def clone_and_build_model(
+    model, input_tensors=None, target_tensors=None, custom_objects=None,
+    compile_clone=True, in_place_reset=False):
+  """Clone a `Model` and build/compile it with the same settings used before.
+
+  This function should be run in the same graph as the model.
+
+  Args:
+    model: `tf.keras.Model` object. Can be Functional, Sequential, or
+      sub-classed.
+    input_tensors: Optional list of input tensors to build the model upon. If
+      not provided, placeholders will be created.
+    target_tensors: Optional list of target tensors for compiling the model. If
+      not provided, placeholders will be created.
+    custom_objects: Optional dictionary mapping string names to custom classes
+      or functions.
+    compile_clone: Boolean, whether to compile model clone (default `True`).
+    in_place_reset: Boolean, whether to reset the model in place. Only used if
+      the model is not a graph network. If the model is a subclassed model, then
+      this argument must be set to `True` (default `False`). To restore the
+      original model, use the function
+      `in_place_subclassed_model_state_restoration(model)`.
+
+  Returns:
+    Clone of the model.
+
+  Raises:
+    ValueError: if trying to clone a subclassed model, and `in_place_reset` is
+      set to False.
+  """
+  if model._is_graph_network:
+    if custom_objects:
+      with CustomObjectScope(custom_objects):
+        clone = clone_model(model, input_tensors=input_tensors)
+    else:
+      clone = clone_model(model, input_tensors=input_tensors)
+  else:
+    if not in_place_reset:
+      raise ValueError(
+          'Model is not a graph network (usually means that it is a subclassed '
+          'model). The model cannot be cloned, but there is a workaround where '
+          'the model is reset in-place. To use this, please set the argument '
+          '`in_place_reset` to `True`. This will reset the attributes in the '
+          'original model. To restore the attributes, call '
+          '`in_place_subclassed_model_state_restoration(model)`.')
+    clone = model
+    _in_place_subclassed_model_reset(clone)
+    if input_tensors is not None:
+      clone._set_inputs(input_tensors)
+
+  # Compile/Build model
+  if not compile_clone:
+    if isinstance(clone, Sequential):
+      clone.build()
+  elif model.optimizer:
+    if isinstance(model.optimizer, optimizers.TFOptimizer):
+      optimizer = model.optimizer
+      K.track_tf_optimizer(optimizer)
+    else:
+      optimizer_config = model.optimizer.get_config()
+      optimizer = model.optimizer.__class__.from_config(optimizer_config)
+    global_step = training_util.get_or_create_global_step()
+    K.track_variable(global_step)
+    optimizer.iterations = global_step
+
+    clone.compile(
+        optimizer,
+        model.loss,
+        metrics=model.metrics,
+        loss_weights=model.loss_weights,
+        sample_weight_mode=model.sample_weight_mode,
+        weighted_metrics=model.weighted_metrics,
+        target_tensors=target_tensors)
+
+  return clone
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index 1385ad5390..1d0f56f3c8 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -18,16 +18,36 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
 import os
 
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import metrics
+from tensorflow.python.keras import models
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
 
 
+class TestModel(keras.Model):
+  """A model subclass."""
+
+  def __init__(self, n_outputs=4, trainable=True):
+    """A test class with one dense layer and number of outputs as a variable."""
+    super(TestModel, self).__init__()
+    self.layer1 = keras.layers.Dense(n_outputs)
+    self.n_outputs = resource_variable_ops.ResourceVariable(
+        n_outputs, trainable=trainable)
+
+  def call(self, x):
+    return self.layer1(x)
+
+
 class TestModelCloning(test.TestCase):
 
   def test_clone_sequential_model(self):
@@ -169,6 +189,7 @@ class CheckpointingTests(test.TestCase):
     model.load_weights(save_prefix)
     self.assertEqual(12., self.evaluate(beta1_power))
 
+
 class TestModelBackend(test.TestCase):
 
   def test_model_backend_float64_use_cases(self):
@@ -183,5 +204,166 @@ class TestModelBackend(test.TestCase):
 
     keras.backend.set_floatx(floatx)
 
+
+class TestModelDeepCopy(test.TestCase):
+
+  def test_deep_copy_eager_mode_trainable(self):
+    with context.eager_mode():
+      x = random_ops.random_normal((32, 4))
+      model = TestModel(trainable=True)
+      model(x)  # Initialize Variables.
+      model_copy = copy.deepcopy(model)
+      self.assertEqual(len(model_copy.trainable_variables), 3)
+      model_copy.n_outputs.assign(1200)
+      self.assertFalse(
+          np.allclose(model_copy.n_outputs.numpy(),
+                      model.n_outputs.numpy()))
+
+  def test_deep_copy_eager_mode_not_trainable(self):
+    with context.eager_mode():
+      x = random_ops.random_normal((32, 4))
+      model = TestModel(trainable=False)
+      model(x)
+      model_copy = copy.deepcopy(model)
+      self.assertEqual(len(model_copy.trainable_variables), 2)
+
+      weights = model_copy.get_weights()
+      weights = [w * 4 for w in weights]
+      model_copy.set_weights(weights)
+      self.assertFalse(
+          np.allclose(model.get_weights()[0],
+                      model_copy.get_weights()[0]))
+
+
+class TestCloneAndBuildModel(test.TestCase):
+
+  def test_clone_and_build_non_compiled_model(self):
+    with self.test_session():
+      inp = np.random.random((10, 4))
+      out = np.random.random((10, 4))
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(4, input_shape=(4,)))
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Dropout(0.5))
+      model.add(keras.layers.Dense(4))
+
+    # Everything should work in a new session.
+    keras.backend.clear_session()
+
+    with self.test_session():
+      # With placeholder creation
+      new_model = models.clone_and_build_model(model, compile_clone=True)
+      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+        new_model.evaluate(inp, out)
+      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+        new_model.train_on_batch(inp, out)
+      new_model.compile('rmsprop', 'mse')
+      new_model.train_on_batch(inp, out)
+
+      # Create new tensors for inputs and targets
+      input_a = keras.Input(shape=(4,))
+      target_a = keras.Input(shape=(4,))
+      new_model = models.clone_and_build_model(model, input_tensors=input_a,
+                                               target_tensors=[target_a],
+                                               compile_clone=True)
+      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+        new_model.evaluate(inp, out)
+      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+        new_model.train_on_batch(inp, out)
+      new_model.compile('rmsprop', 'mse')
+      new_model.train_on_batch(inp, out)
+
+  def _assert_same_compile_params(self, model):
+    """Assert that two models have the same compile parameters."""
+
+    self.assertEqual('mse', model.loss)
+    self.assertTrue(
+        isinstance(model.optimizer, keras.optimizers.RMSprop))
+    self.assertEqual(['acc', metrics.categorical_accuracy], model.metrics)
+
+  def _clone_and_build_test_helper(self, model, is_subclassed=False):
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
+
+    # Everything should work in a new session.
+    keras.backend.clear_session()
+
+    with self.test_session():
+      # With placeholder creation
+      new_model = models.clone_and_build_model(
+          model, compile_clone=True, in_place_reset=is_subclassed)
+
+      self._assert_same_compile_params(new_model)
+      new_model.train_on_batch(inp, out)
+      new_model.evaluate(inp, out)
+
+      # Create new tensors for inputs and targets
+      input_a = keras.Input(shape=(4,), name='a')
+      new_model = models.clone_and_build_model(
+          model, input_tensors=input_a, compile_clone=True,
+          in_place_reset=is_subclassed)
+      self._assert_same_compile_params(new_model)
+      new_model.train_on_batch(inp, out)
+      new_model.evaluate(inp, out)
+
+      target_a = keras.Input(shape=(4,), name='b')
+      new_model = models.clone_and_build_model(
+          model, input_tensors=input_a, target_tensors=[target_a],
+          compile_clone=True, in_place_reset=is_subclassed)
+      self._assert_same_compile_params(new_model)
+      new_model.train_on_batch(inp, out)
+      new_model.evaluate(inp, out)
+
+  def test_clone_and_build_compiled_sequential_model(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(4, input_shape=(4,)))
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Dropout(0.5))
+      model.add(keras.layers.Dense(4))
+      model.compile('rmsprop', 'mse',
+                    metrics=['acc', metrics.categorical_accuracy])
+
+    self._clone_and_build_test_helper(model)
+
+  def test_clone_and_build_functional_model(self):
+    with self.test_session():
+      input_a = keras.Input(shape=(4,))
+      dense_1 = keras.layers.Dense(4,)
+      dense_2 = keras.layers.Dense(4,)
+
+      x_a = dense_1(input_a)
+      x_a = keras.layers.Dropout(0.5)(x_a)
+      x_a = keras.layers.BatchNormalization()(x_a)
+      x_a = dense_2(x_a)
+      model = keras.models.Model(input_a, x_a)
+      model.compile('rmsprop', 'mse',
+                    metrics=['acc', metrics.categorical_accuracy])
+
+    self._clone_and_build_test_helper(model)
+
+  def test_clone_and_build_subclassed_model(self):
+    class SubclassedModel(keras.Model):
+
+      def __init__(self):
+        super(SubclassedModel, self).__init__()
+        self.layer1 = keras.layers.Dense(4)
+        self.layer2 = keras.layers.Dense(4)
+
+      def call(self, inp):
+        out = self.layer1(inp)
+        out = keras.layers.BatchNormalization()(out)
+        out = keras.layers.Dropout(0.5)(out)
+        out = self.layer2(out)
+        return out
+
+    with self.test_session():
+      model = SubclassedModel()
+      model.compile('rmsprop', 'mse',
+                    metrics=['acc', metrics.categorical_accuracy])
+    self._clone_and_build_test_helper(model, True)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index 4f97442e82..2ce79285db 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -699,13 +699,13 @@ class TFOptimizer(Optimizer, checkpointable.CheckpointableBase):
       self.iterations = K.variable(0, dtype='int64', name='iterations')
 
   def apply_gradients(self, grads):
-    self.optimizer.apply_gradients(grads)
+    self.optimizer.apply_gradients(grads, global_step=self.iterations)
 
   def get_grads(self, loss, params):
     return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
-    if distribute_lib.has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       self.updates = []
 
       if not params:
@@ -813,7 +813,9 @@ def get(identifier):
   """
   # Wrap TF optimizer instances
   if isinstance(identifier, tf_optimizer_module.Optimizer):
-    return TFOptimizer(identifier)
+    opt = TFOptimizer(identifier)
+    K.track_tf_optimizer(opt)
+    return opt
   if isinstance(identifier, dict):
     return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 4d295351f5..9a68fc0e35 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training.adam import AdamOptimizer
@@ -140,6 +142,7 @@ class KerasOptimizersTest(test.TestCase):
         2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
     # This is possible
     model.compile(loss='mean_squared_error', optimizer=optimizer)
+    keras.backend.track_tf_optimizer(optimizer)
     model.fit(np.random.random((5, 3)),
               np.random.random((5, 2)),
               epochs=1,
@@ -153,6 +156,7 @@ class KerasOptimizersTest(test.TestCase):
     with self.assertRaises(NotImplementedError):
       optimizer.from_config(None)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_tfoptimizer_iterations(self):
     with self.test_session():
       optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
@@ -160,6 +164,7 @@ class KerasOptimizersTest(test.TestCase):
       model.add(keras.layers.Dense(
           2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
       model.compile(loss='mean_squared_error', optimizer=optimizer)
+      keras.backend.track_tf_optimizer(optimizer)
       self.assertEqual(keras.backend.get_value(model.optimizer.iterations), 0)
 
       model.fit(np.random.random((55, 3)),
@@ -169,11 +174,15 @@ class KerasOptimizersTest(test.TestCase):
                 verbose=0)
       self.assertEqual(keras.backend.get_value(model.optimizer.iterations), 11)
 
-      model.fit(np.random.random((20, 3)),
-                np.random.random((20, 2)),
-                steps_per_epoch=8,
-                verbose=0)
-      self.assertEqual(keras.backend.get_value(model.optimizer.iterations), 19)
+      if not context.executing_eagerly():
+        # TODO(kathywu): investigate why training with an array input and
+        # setting the argument steps_per_epoch does not work in eager mode.
+        model.fit(np.random.random((20, 3)),
+                  np.random.random((20, 2)),
+                  steps_per_epoch=8,
+                  verbose=0)
+        self.assertEqual(
+            keras.backend.get_value(model.optimizer.iterations), 19)
 
   def test_negative_clipvalue_or_clipnorm(self):
     with self.assertRaises(ValueError):
diff --git a/tensorflow/python/keras/preprocessing/__init__.py b/tensorflow/python/keras/preprocessing/__init__.py
index 2f08f88600..0860eed3cf 100644
--- a/tensorflow/python/keras/preprocessing/__init__.py
+++ b/tensorflow/python/keras/preprocessing/__init__.py
@@ -23,6 +23,8 @@ import keras_preprocessing
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import utils
 
+# This exists for compatibility with prior version of keras_preprocessing.
+# TODO(fchollet): remove in the future.
 keras_preprocessing.set_keras_submodules(backend=backend, utils=utils)
 
 from tensorflow.python.keras.preprocessing import image
diff --git a/tensorflow/python/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py
index ba227385ef..e33993950d 100644
--- a/tensorflow/python/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/preprocessing/image.py
@@ -27,6 +27,9 @@ try:
 except ImportError:
   pass
 
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import utils
+from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 random_rotation = image.random_rotation
@@ -38,14 +41,482 @@ random_channel_shift = image.random_channel_shift
 apply_brightness_shift = image.apply_brightness_shift
 random_brightness = image.random_brightness
 apply_affine_transform = image.apply_affine_transform
-array_to_img = image.array_to_img
-img_to_array = image.img_to_array
-save_img = image.save_img
 load_img = image.load_img
-ImageDataGenerator = image.ImageDataGenerator
-Iterator = image.Iterator
-NumpyArrayIterator = image.NumpyArrayIterator
-DirectoryIterator = image.DirectoryIterator
+
+
+@tf_export('keras.preprocessing.image.array_to_img')
+def array_to_img(x, data_format=None, scale=True, dtype=None):
+  """Converts a 3D Numpy array to a PIL Image instance.
+
+  Arguments:
+      x: Input Numpy array.
+      data_format: Image data format.
+          either "channels_first" or "channels_last".
+      scale: Whether to rescale image values
+          to be within `[0, 255]`.
+      dtype: Dtype to use.
+
+  Returns:
+      A PIL Image instance.
+
+  Raises:
+      ImportError: if PIL is not available.
+      ValueError: if invalid `x` or `data_format` is passed.
+  """
+
+  if data_format is None:
+    data_format = backend.image_data_format()
+  kwargs = {}
+  if 'dtype' in tf_inspect.getfullargspec(image.array_to_img)[0]:
+    if dtype is None:
+      dtype = backend.floatx()
+    kwargs['dtype'] = dtype
+  return image.array_to_img(x, data_format=data_format, scale=scale, **kwargs)
+
+
+@tf_export('keras.preprocessing.image.img_to_array')
+def img_to_array(img, data_format=None, dtype=None):
+  """Converts a PIL Image instance to a Numpy array.
+
+  Arguments:
+      img: PIL Image instance.
+      data_format: Image data format,
+          either "channels_first" or "channels_last".
+      dtype: Dtype to use for the returned array.
+
+  Returns:
+      A 3D Numpy array.
+
+  Raises:
+      ValueError: if invalid `img` or `data_format` is passed.
+  """
+
+  if data_format is None:
+    data_format = backend.image_data_format()
+  kwargs = {}
+  if 'dtype' in tf_inspect.getfullargspec(image.img_to_array)[0]:
+    if dtype is None:
+      dtype = backend.floatx()
+    kwargs['dtype'] = dtype
+  return image.img_to_array(img, data_format=data_format, **kwargs)
+
+
+@tf_export('keras.preprocessing.image.save_img')
+def save_img(path,
+             x,
+             data_format=None,
+             file_format=None,
+             scale=True,
+             **kwargs):
+  """Saves an image stored as a Numpy array to a path or file object.
+
+  Arguments:
+      path: Path or file object.
+      x: Numpy array.
+      data_format: Image data format,
+          either "channels_first" or "channels_last".
+      file_format: Optional file format override. If omitted, the
+          format to use is determined from the filename extension.
+          If a file object was used instead of a filename, this
+          parameter should always be used.
+      scale: Whether to rescale image values to be within `[0, 255]`.
+      **kwargs: Additional keyword arguments passed to `PIL.Image.save()`.
+  """
+  if data_format is None:
+    data_format = backend.image_data_format()
+  image.save_img(path,
+                 x,
+                 data_format=data_format,
+                 file_format=file_format,
+                 scale=scale, **kwargs)
+
+
+@tf_export('keras.preprocessing.image.Iterator')
+class Iterator(image.Iterator, utils.Sequence):
+  pass
+
+
+@tf_export('keras.preprocessing.image.DirectoryIterator')
+class DirectoryIterator(image.DirectoryIterator, Iterator):
+  """Iterator capable of reading images from a directory on disk.
+
+  Arguments:
+      directory: Path to the directory to read images from.
+          Each subdirectory in this directory will be
+          considered to contain images from one class,
+          or alternatively you could specify class subdirectories
+          via the `classes` argument.
+      image_data_generator: Instance of `ImageDataGenerator`
+          to use for random transformations and normalization.
+      target_size: tuple of integers, dimensions to resize input images to.
+      color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`.
+          Color mode to read images.
+      classes: Optional list of strings, names of subdirectories
+          containing images from each class (e.g. `["dogs", "cats"]`).
+          It will be computed automatically if not set.
+      class_mode: Mode for yielding the targets:
+          `"binary"`: binary targets (if there are only two classes),
+          `"categorical"`: categorical targets,
+          `"sparse"`: integer targets,
+          `"input"`: targets are images identical to input images (mainly
+              used to work with autoencoders),
+          `None`: no targets get yielded (only input images are yielded).
+      batch_size: Integer, size of a batch.
+      shuffle: Boolean, whether to shuffle the data between epochs.
+      seed: Random seed for data shuffling.
+      data_format: String, one of `channels_first`, `channels_last`.
+      save_to_dir: Optional directory where to save the pictures
+          being yielded, in a viewable format. This is useful
+          for visualizing the random transformations being
+          applied, for debugging purposes.
+      save_prefix: String prefix to use for saving sample
+          images (if `save_to_dir` is set).
+      save_format: Format to use for saving sample images
+          (if `save_to_dir` is set).
+      subset: Subset of data (`"training"` or `"validation"`) if
+          validation_split is set in ImageDataGenerator.
+      interpolation: Interpolation method used to resample the image if the
+          target size is different from that of the loaded image.
+          Supported methods are "nearest", "bilinear", and "bicubic".
+          If PIL version 1.1.3 or newer is installed, "lanczos" is also
+          supported. If PIL version 3.4.0 or newer is installed, "box" and
+          "hamming" are also supported. By default, "nearest" is used.
+      dtype: Dtype to use for generated arrays.
+  """
+
+  def __init__(self, directory, image_data_generator,
+               target_size=(256, 256),
+               color_mode='rgb',
+               classes=None,
+               class_mode='categorical',
+               batch_size=32,
+               shuffle=True,
+               seed=None,
+               data_format=None,
+               save_to_dir=None,
+               save_prefix='',
+               save_format='png',
+               follow_links=False,
+               subset=None,
+               interpolation='nearest',
+               dtype=None):
+    if data_format is None:
+      data_format = backend.image_data_format()
+    kwargs = {}
+    if 'dtype' in tf_inspect.getfullargspec(
+        image.ImageDataGenerator.__init__)[0]:
+      if dtype is None:
+        dtype = backend.floatx()
+      kwargs['dtype'] = dtype
+    super(DirectoryIterator, self).__init__(
+        directory, image_data_generator,
+        target_size=target_size,
+        color_mode=color_mode,
+        classes=classes,
+        class_mode=class_mode,
+        batch_size=batch_size,
+        shuffle=shuffle,
+        seed=seed,
+        data_format=data_format,
+        save_to_dir=save_to_dir,
+        save_prefix=save_prefix,
+        save_format=save_format,
+        follow_links=follow_links,
+        subset=subset,
+        interpolation=interpolation,
+        **kwargs)
+
+
+@tf_export('keras.preprocessing.image.NumpyArrayIterator')
+class NumpyArrayIterator(image.NumpyArrayIterator, Iterator):
+  """Iterator yielding data from a Numpy array.
+
+  Arguments:
+      x: Numpy array of input data or tuple.
+          If tuple, the second elements is either
+          another numpy array or a list of numpy arrays,
+          each of which gets passed
+          through as an output without any modifications.
+      y: Numpy array of targets data.
+      image_data_generator: Instance of `ImageDataGenerator`
+          to use for random transformations and normalization.
+      batch_size: Integer, size of a batch.
+      shuffle: Boolean, whether to shuffle the data between epochs.
+      sample_weight: Numpy array of sample weights.
+      seed: Random seed for data shuffling.
+      data_format: String, one of `channels_first`, `channels_last`.
+      save_to_dir: Optional directory where to save the pictures
+          being yielded, in a viewable format. This is useful
+          for visualizing the random transformations being
+          applied, for debugging purposes.
+      save_prefix: String prefix to use for saving sample
+          images (if `save_to_dir` is set).
+      save_format: Format to use for saving sample images
+          (if `save_to_dir` is set).
+      subset: Subset of data (`"training"` or `"validation"`) if
+          validation_split is set in ImageDataGenerator.
+      dtype: Dtype to use for the generated arrays.
+  """
+
+  def __init__(self, x, y, image_data_generator,
+               batch_size=32,
+               shuffle=False,
+               sample_weight=None,
+               seed=None,
+               data_format=None,
+               save_to_dir=None,
+               save_prefix='',
+               save_format='png',
+               subset=None,
+               dtype=None):
+    if data_format is None:
+      data_format = backend.image_data_format()
+    kwargs = {}
+    if 'dtype' in tf_inspect.getfullargspec(
+        image.NumpyArrayIterator.__init__)[0]:
+      if dtype is None:
+        dtype = backend.floatx()
+      kwargs['dtype'] = dtype
+    super(NumpyArrayIterator, self).__init__(
+        x, y, image_data_generator,
+        batch_size=batch_size,
+        shuffle=shuffle,
+        sample_weight=sample_weight,
+        seed=seed,
+        data_format=data_format,
+        save_to_dir=save_to_dir,
+        save_prefix=save_prefix,
+        save_format=save_format,
+        subset=subset,
+        **kwargs)
+
+
+@tf_export('keras.preprocessing.image.ImageDataGenerator')
+class ImageDataGenerator(image.ImageDataGenerator):
+  """Generate batches of tensor image data with real-time data augmentation.
+
+   The data will be looped over (in batches).
+
+  Arguments:
+      featurewise_center: Boolean.
+          Set input mean to 0 over the dataset, feature-wise.
+      samplewise_center: Boolean. Set each sample mean to 0.
+      featurewise_std_normalization: Boolean.
+          Divide inputs by std of the dataset, feature-wise.
+      samplewise_std_normalization: Boolean. Divide each input by its std.
+      zca_epsilon: epsilon for ZCA whitening. Default is 1e-6.
+      zca_whitening: Boolean. Apply ZCA whitening.
+      rotation_range: Int. Degree range for random rotations.
+      width_shift_range: Float, 1-D array-like or int
+          - float: fraction of total width, if < 1, or pixels if >= 1.
+          - 1-D array-like: random elements from the array.
+          - int: integer number of pixels from interval
+              `(-width_shift_range, +width_shift_range)`
+          - With `width_shift_range=2` possible values
+              are integers `[-1, 0, +1]`,
+              same as with `width_shift_range=[-1, 0, +1]`,
+              while with `width_shift_range=1.0` possible values are floats
+              in the interval [-1.0, +1.0).
+      height_shift_range: Float, 1-D array-like or int
+          - float: fraction of total height, if < 1, or pixels if >= 1.
+          - 1-D array-like: random elements from the array.
+          - int: integer number of pixels from interval
+              `(-height_shift_range, +height_shift_range)`
+          - With `height_shift_range=2` possible values
+              are integers `[-1, 0, +1]`,
+              same as with `height_shift_range=[-1, 0, +1]`,
+              while with `height_shift_range=1.0` possible values are floats
+              in the interval [-1.0, +1.0).
+      brightness_range: Tuple or list of two floats. Range for picking
+          a brightness shift value from.
+      shear_range: Float. Shear Intensity
+          (Shear angle in counter-clockwise direction in degrees)
+      zoom_range: Float or [lower, upper]. Range for random zoom.
+          If a float, `[lower, upper] = [1-zoom_range, 1+zoom_range]`.
+      channel_shift_range: Float. Range for random channel shifts.
+      fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}.
+          Default is 'nearest'.
+          Points outside the boundaries of the input are filled
+          according to the given mode:
+          - 'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k)
+          - 'nearest':  aaaaaaaa|abcd|dddddddd
+          - 'reflect':  abcddcba|abcd|dcbaabcd
+          - 'wrap':  abcdabcd|abcd|abcdabcd
+      cval: Float or Int.
+          Value used for points outside the boundaries
+          when `fill_mode = "constant"`.
+      horizontal_flip: Boolean. Randomly flip inputs horizontally.
+      vertical_flip: Boolean. Randomly flip inputs vertically.
+      rescale: rescaling factor. Defaults to None.
+          If None or 0, no rescaling is applied,
+          otherwise we multiply the data by the value provided
+          (after applying all other transformations).
+      preprocessing_function: function that will be implied on each input.
+          The function will run after the image is resized and augmented.
+          The function should take one argument:
+          one image (Numpy tensor with rank 3),
+          and should output a Numpy tensor with the same shape.
+      data_format: Image data format,
+          either "channels_first" or "channels_last".
+          "channels_last" mode means that the images should have shape
+          `(samples, height, width, channels)`,
+          "channels_first" mode means that the images should have shape
+          `(samples, channels, height, width)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
+      validation_split: Float. Fraction of images reserved for validation
+          (strictly between 0 and 1).
+      dtype: Dtype to use for the generated arrays.
+
+  Examples:
+
+  Example of using `.flow(x, y)`:
+
+  ```python
+  (x_train, y_train), (x_test, y_test) = cifar10.load_data()
+  y_train = np_utils.to_categorical(y_train, num_classes)
+  y_test = np_utils.to_categorical(y_test, num_classes)
+  datagen = ImageDataGenerator(
+      featurewise_center=True,
+      featurewise_std_normalization=True,
+      rotation_range=20,
+      width_shift_range=0.2,
+      height_shift_range=0.2,
+      horizontal_flip=True)
+  # compute quantities required for featurewise normalization
+  # (std, mean, and principal components if ZCA whitening is applied)
+  datagen.fit(x_train)
+  # fits the model on batches with real-time data augmentation:
+  model.fit_generator(datagen.flow(x_train, y_train, batch_size=32),
+                      steps_per_epoch=len(x_train) / 32, epochs=epochs)
+  # here's a more "manual" example
+  for e in range(epochs):
+      print('Epoch', e)
+      batches = 0
+      for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32):
+          model.fit(x_batch, y_batch)
+          batches += 1
+          if batches >= len(x_train) / 32:
+              # we need to break the loop by hand because
+              # the generator loops indefinitely
+              break
+  ```
+
+  Example of using `.flow_from_directory(directory)`:
+
+  ```python
+  train_datagen = ImageDataGenerator(
+          rescale=1./255,
+          shear_range=0.2,
+          zoom_range=0.2,
+          horizontal_flip=True)
+  test_datagen = ImageDataGenerator(rescale=1./255)
+  train_generator = train_datagen.flow_from_directory(
+          'data/train',
+          target_size=(150, 150),
+          batch_size=32,
+          class_mode='binary')
+  validation_generator = test_datagen.flow_from_directory(
+          'data/validation',
+          target_size=(150, 150),
+          batch_size=32,
+          class_mode='binary')
+  model.fit_generator(
+          train_generator,
+          steps_per_epoch=2000,
+          epochs=50,
+          validation_data=validation_generator,
+          validation_steps=800)
+  ```
+
+  Example of transforming images and masks together.
+
+  ```python
+  # we create two instances with the same arguments
+  data_gen_args = dict(featurewise_center=True,
+                       featurewise_std_normalization=True,
+                       rotation_range=90,
+                       width_shift_range=0.1,
+                       height_shift_range=0.1,
+                       zoom_range=0.2)
+  image_datagen = ImageDataGenerator(**data_gen_args)
+  mask_datagen = ImageDataGenerator(**data_gen_args)
+  # Provide the same seed and keyword arguments to the fit and flow methods
+  seed = 1
+  image_datagen.fit(images, augment=True, seed=seed)
+  mask_datagen.fit(masks, augment=True, seed=seed)
+  image_generator = image_datagen.flow_from_directory(
+      'data/images',
+      class_mode=None,
+      seed=seed)
+  mask_generator = mask_datagen.flow_from_directory(
+      'data/masks',
+      class_mode=None,
+      seed=seed)
+  # combine generators into one which yields image and masks
+  train_generator = zip(image_generator, mask_generator)
+  model.fit_generator(
+      train_generator,
+      steps_per_epoch=2000,
+      epochs=50)
+  ```
+  """
+
+  def __init__(self,
+               featurewise_center=False,
+               samplewise_center=False,
+               featurewise_std_normalization=False,
+               samplewise_std_normalization=False,
+               zca_whitening=False,
+               zca_epsilon=1e-6,
+               rotation_range=0,
+               width_shift_range=0.,
+               height_shift_range=0.,
+               brightness_range=None,
+               shear_range=0.,
+               zoom_range=0.,
+               channel_shift_range=0.,
+               fill_mode='nearest',
+               cval=0.,
+               horizontal_flip=False,
+               vertical_flip=False,
+               rescale=None,
+               preprocessing_function=None,
+               data_format=None,
+               validation_split=0.0,
+               dtype=None):
+    if data_format is None:
+      data_format = backend.image_data_format()
+    kwargs = {}
+    if 'dtype' in tf_inspect.getfullargspec(
+        image.ImageDataGenerator.__init__)[0]:
+      if dtype is None:
+        dtype = backend.floatx()
+      kwargs['dtype'] = dtype
+    super(ImageDataGenerator, self).__init__(
+        featurewise_center=featurewise_center,
+        samplewise_center=samplewise_center,
+        featurewise_std_normalization=featurewise_std_normalization,
+        samplewise_std_normalization=samplewise_std_normalization,
+        zca_whitening=zca_whitening,
+        zca_epsilon=zca_epsilon,
+        rotation_range=rotation_range,
+        width_shift_range=width_shift_range,
+        height_shift_range=height_shift_range,
+        brightness_range=brightness_range,
+        shear_range=shear_range,
+        zoom_range=zoom_range,
+        channel_shift_range=channel_shift_range,
+        fill_mode=fill_mode,
+        cval=cval,
+        horizontal_flip=horizontal_flip,
+        vertical_flip=vertical_flip,
+        rescale=rescale,
+        preprocessing_function=preprocessing_function,
+        data_format=data_format,
+        validation_split=validation_split,
+        **kwargs)
 
 tf_export('keras.preprocessing.image.random_rotation')(random_rotation)
 tf_export('keras.preprocessing.image.random_shift')(random_shift)
@@ -59,11 +530,4 @@ tf_export(
 tf_export('keras.preprocessing.image.random_brightness')(random_brightness)
 tf_export(
     'keras.preprocessing.image.apply_affine_transform')(apply_affine_transform)
-tf_export('keras.preprocessing.image.array_to_img')(array_to_img)
-tf_export('keras.preprocessing.image.img_to_array')(img_to_array)
-tf_export('keras.preprocessing.image.save_img')(save_img)
 tf_export('keras.preprocessing.image.load_img')(load_img)
-tf_export('keras.preprocessing.image.ImageDataGenerator')(ImageDataGenerator)
-tf_export('keras.preprocessing.image.Iterator')(Iterator)
-tf_export('keras.preprocessing.image.NumpyArrayIterator')(NumpyArrayIterator)
-tf_export('keras.preprocessing.image.DirectoryIterator')(DirectoryIterator)
diff --git a/tensorflow/python/keras/preprocessing/sequence.py b/tensorflow/python/keras/preprocessing/sequence.py
index 116d3108d9..f014668909 100644
--- a/tensorflow/python/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/preprocessing/sequence.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from keras_preprocessing import sequence
 
+from tensorflow.python.keras import utils
 from tensorflow.python.util.tf_export import tf_export
 
 pad_sequences = sequence.pad_sequences
@@ -28,11 +29,67 @@ make_sampling_table = sequence.make_sampling_table
 skipgrams = sequence.skipgrams
 # TODO(fchollet): consider making `_remove_long_seq` public.
 _remove_long_seq = sequence._remove_long_seq  # pylint: disable=protected-access
-TimeseriesGenerator = sequence.TimeseriesGenerator
+
+
+@tf_export('keras.preprocessing.sequence.TimeseriesGenerator')
+class TimeseriesGenerator(sequence.TimeseriesGenerator, utils.Sequence):
+  """Utility class for generating batches of temporal data.
+  This class takes in a sequence of data-points gathered at
+  equal intervals, along with time series parameters such as
+  stride, length of history, etc., to produce batches for
+  training/validation.
+  # Arguments
+      data: Indexable generator (such as list or Numpy array)
+          containing consecutive data points (timesteps).
+          The data should be at 2D, and axis 0 is expected
+          to be the time dimension.
+      targets: Targets corresponding to timesteps in `data`.
+          It should have same length as `data`.
+      length: Length of the output sequences (in number of timesteps).
+      sampling_rate: Period between successive individual timesteps
+          within sequences. For rate `r`, timesteps
+          `data[i]`, `data[i-r]`, ... `data[i - length]`
+          are used for create a sample sequence.
+      stride: Period between successive output sequences.
+          For stride `s`, consecutive output samples would
+          be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
+      start_index: Data points earlier than `start_index` will not be used
+          in the output sequences. This is useful to reserve part of the
+          data for test or validation.
+      end_index: Data points later than `end_index` will not be used
+          in the output sequences. This is useful to reserve part of the
+          data for test or validation.
+      shuffle: Whether to shuffle output samples,
+          or instead draw them in chronological order.
+      reverse: Boolean: if `true`, timesteps in each output sample will be
+          in reverse chronological order.
+      batch_size: Number of timeseries samples in each batch
+          (except maybe the last one).
+  # Returns
+      A [Sequence](/utils/#sequence) instance.
+  # Examples
+  ```python
+  from keras.preprocessing.sequence import TimeseriesGenerator
+  import numpy as np
+  data = np.array([[i] for i in range(50)])
+  targets = np.array([[i] for i in range(50)])
+  data_gen = TimeseriesGenerator(data, targets,
+                                 length=10, sampling_rate=2,
+                                 batch_size=2)
+  assert len(data_gen) == 20
+  batch_0 = data_gen[0]
+  x, y = batch_0
+  assert np.array_equal(x,
+                        np.array([[[0], [2], [4], [6], [8]],
+                                  [[1], [3], [5], [7], [9]]]))
+  assert np.array_equal(y,
+                        np.array([[10], [11]]))
+  ```
+  """
+  pass
+
 
 tf_export('keras.preprocessing.sequence.pad_sequences')(pad_sequences)
 tf_export(
     'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
 tf_export('keras.preprocessing.sequence.skipgrams')(skipgrams)
-tf_export(
-    'keras.preprocessing.sequence.TimeseriesGenerator')(TimeseriesGenerator)
diff --git a/tensorflow/python/keras/regularizers_test.py b/tensorflow/python/keras/regularizers_test.py
index e2075785d8..bba4ebb287 100644
--- a/tensorflow/python/keras/regularizers_test.py
+++ b/tensorflow/python/keras/regularizers_test.py
@@ -50,7 +50,7 @@ def create_model(kernel_regularizer=None, activity_regularizer=None):
 class KerasRegularizersTest(test.TestCase):
 
   def test_kernel_regularization(self):
-    with self.test_session():
+    with self.cached_session():
       (x_train, y_train), _ = get_data()
       for reg in [keras.regularizers.l1(),
                   keras.regularizers.l2(),
@@ -62,7 +62,7 @@ class KerasRegularizersTest(test.TestCase):
                   epochs=1, verbose=0)
 
   def test_activity_regularization(self):
-    with self.test_session():
+    with self.cached_session():
       (x_train, y_train), _ = get_data()
       for reg in [keras.regularizers.l1(), keras.regularizers.l2()]:
         model = create_model(activity_regularizer=reg)
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 6e8ee06ff5..58405c550b 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -184,3 +184,22 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   # for further checks in the caller function
   return actual_output
 
+
+def get_small_sequential_mlp(num_hidden, num_classes, input_dim=None):
+  model = keras.models.Sequential()
+  if input_dim:
+    model.add(keras.layers.Dense(num_hidden, activation='relu',
+                                 input_dim=input_dim))
+  else:
+    model.add(keras.layers.Dense(num_hidden, activation='relu'))
+  activation = 'sigmoid' if num_classes == 1 else 'softmax'
+  model.add(keras.layers.Dense(num_classes, activation=activation))
+  return model
+
+
+def get_small_functional_mlp(num_hidden, num_classes, input_dim):
+  inputs = keras.Input(shape=(input_dim,))
+  outputs = keras.layers.Dense(num_hidden, activation='relu')(inputs)
+  activation = 'sigmoid' if num_classes == 1 else 'softmax'
+  outputs = keras.layers.Dense(num_classes, activation=activation)(outputs)
+  return keras.Model(inputs, outputs)
diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py
index 5419e7ae05..3a176c3316 100644
--- a/tensorflow/python/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/utils/conv_utils.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
@@ -199,3 +200,168 @@ def convert_kernel(kernel):
   no_flip = (slice(None, None), slice(None, None))
   slices[-2:] = no_flip
   return np.copy(kernel[slices])
+
+
+def conv_kernel_mask(input_shape, kernel_shape, strides, padding):
+  """Compute a mask representing the connectivity of a convolution operation.
+
+  Assume a convolution with given parameters is applied to an input having N
+  spatial dimensions with `input_shape = (d_in1, ..., d_inN)` to produce an
+  output with shape `(d_out1, ..., d_outN)`. This method returns a boolean array
+  of shape `(d_in1, ..., d_inN, d_out1, ..., d_outN)` with `True` entries
+  indicating pairs of input and output locations that are connected by a weight.
+
+  Example:
+    ```python
+        >>> input_shape = (4,)
+        >>> kernel_shape = (2,)
+        >>> strides = (1,)
+        >>> padding = "valid"
+        >>> conv_kernel_mask(input_shape, kernel_shape, strides, padding)
+        array([[ True, False, False],
+               [ True,  True, False],
+               [False,  True,  True],
+               [False, False,  True]], dtype=bool)
+    ```
+    where rows and columns correspond to inputs and outputs respectively.
+
+
+  Args:
+    input_shape: tuple of size N: `(d_in1, ..., d_inN)`,
+                 spatial shape of the input.
+    kernel_shape: tuple of size N, spatial shape of the convolutional kernel
+                  / receptive field.
+    strides: tuple of size N, strides along each spatial dimension.
+    padding: type of padding, string `"same"` or `"valid"`.
+
+  Returns:
+    A boolean 2N-D `np.ndarray` of shape
+    `(d_in1, ..., d_inN, d_out1, ..., d_outN)`, where `(d_out1, ..., d_outN)`
+    is the spatial shape of the output. `True` entries in the mask represent
+    pairs of input-output locations that are connected by a weight.
+
+  Raises:
+    ValueError: if `input_shape`, `kernel_shape` and `strides` don't have the
+        same number of dimensions.
+    NotImplementedError: if `padding` is not in {`"same"`, `"valid"`}.
+  """
+  if padding not in {'same', 'valid'}:
+    raise NotImplementedError('Padding type %s not supported. '
+                              'Only "valid" and "same" '
+                              'are implemented.' % padding)
+
+  in_dims = len(input_shape)
+  if isinstance(kernel_shape, int):
+    kernel_shape = (kernel_shape,) * in_dims
+  if isinstance(strides, int):
+    strides = (strides,) * in_dims
+
+  kernel_dims = len(kernel_shape)
+  stride_dims = len(strides)
+  if kernel_dims != in_dims or stride_dims != in_dims:
+    raise ValueError('Number of strides, input and kernel dimensions must all '
+                     'match. Received: %d, %d, %d.' %
+                     (stride_dims, in_dims, kernel_dims))
+
+  output_shape = conv_output_shape(input_shape, kernel_shape, strides, padding)
+
+  mask_shape = input_shape + output_shape
+  mask = np.zeros(mask_shape, np.bool)
+
+  output_axes_ticks = [range(dim) for dim in output_shape]
+  for output_position in itertools.product(*output_axes_ticks):
+    input_axes_ticks = conv_connected_inputs(input_shape,
+                                             kernel_shape,
+                                             output_position,
+                                             strides,
+                                             padding)
+    for input_position in itertools.product(*input_axes_ticks):
+      mask[input_position + output_position] = True
+
+  return mask
+
+
+def conv_connected_inputs(input_shape,
+                          kernel_shape,
+                          output_position,
+                          strides,
+                          padding):
+  """Return locations of the input connected to an output position.
+
+  Assume a convolution with given parameters is applied to an input having N
+  spatial dimensions with `input_shape = (d_in1, ..., d_inN)`. This method
+  returns N ranges specifying the input region that was convolved with the
+  kernel to produce the output at position
+  `output_position = (p_out1, ..., p_outN)`.
+
+  Example:
+    ```python
+        >>> input_shape = (4, 4)
+        >>> kernel_shape = (2, 1)
+        >>> output_position = (1, 1)
+        >>> strides = (1, 1)
+        >>> padding = "valid"
+        >>> conv_connected_inputs(input_shape, kernel_shape, output_position,
+        >>>                       strides, padding)
+        [xrange(1, 3), xrange(1, 2)]
+    ```
+  Args:
+    input_shape: tuple of size N: `(d_in1, ..., d_inN)`,
+                 spatial shape of the input.
+    kernel_shape: tuple of size N, spatial shape of the convolutional kernel
+                  / receptive field.
+    output_position: tuple of size N: `(p_out1, ..., p_outN)`,
+                     a single position in the output of the convolution.
+    strides: tuple of size N, strides along each spatial dimension.
+    padding: type of padding, string `"same"` or `"valid"`.
+
+  Returns:
+    N ranges `[[p_in_left1, ..., p_in_right1], ...,
+              [p_in_leftN, ..., p_in_rightN]]` specifying the region in the
+    input connected to output_position.
+  """
+  ranges = []
+
+  ndims = len(input_shape)
+  for d in range(ndims):
+    left_shift = int(kernel_shape[d] / 2)
+    right_shift = kernel_shape[d] - left_shift
+
+    center = output_position[d] * strides[d]
+
+    if padding == 'valid':
+      center += left_shift
+
+    start = max(0, center - left_shift)
+    end = min(input_shape[d], center + right_shift)
+
+    ranges.append(range(start, end))
+
+  return ranges
+
+
+def conv_output_shape(input_shape, kernel_shape, strides, padding):
+  """Return the output shape of an N-D convolution.
+
+  Forces dimensions where input is empty (size 0) to remain empty.
+
+  Args:
+    input_shape: tuple of size N: `(d_in1, ..., d_inN)`,
+                 spatial shape of the input.
+    kernel_shape: tuple of size N, spatial shape of the convolutional kernel
+                  / receptive field.
+    strides: tuple of size N, strides along each spatial dimension.
+    padding: type of padding, string `"same"` or `"valid"`.
+
+  Returns:
+    tuple of size N: `(d_out1, ..., d_outN)`, spatial shape of the output.
+  """
+  dims = range(len(kernel_shape))
+  output_shape = [conv_output_length(input_shape[d],
+                                     kernel_shape[d],
+                                     padding,
+                                     strides[d])
+                  for d in dims]
+  output_shape = tuple([0 if input_shape[d] == 0 else output_shape[d]
+                        for d in dims])
+  return output_shape
diff --git a/tensorflow/python/keras/utils/conv_utils_test.py b/tensorflow/python/keras/utils/conv_utils_test.py
new file mode 100644
index 0000000000..eb2a360bfd
--- /dev/null
+++ b/tensorflow/python/keras/utils/conv_utils_test.py
@@ -0,0 +1,232 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for conv_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.platform import test
+
+
+def _get_const_output_shape(input_shape, dim):
+  return tuple([min(d, dim) for d in input_shape])
+
+
+input_shapes = [
+    (0,),
+    (0, 0),
+    (1,),
+    (2,),
+    (3,),
+    (1, 0),
+    (0, 3),
+    (1, 1),
+    (1, 2),
+    (3, 1),
+    (2, 2),
+    (3, 3),
+    (1, 0, 1),
+    (5, 2, 3),
+    (3, 5, 6, 7, 0),
+    (3, 2, 2, 4, 4),
+    (1, 2, 3, 4, 7, 2),
+]
+
+
+@parameterized.parameters(input_shapes)
+class TestConvUtils(test.TestCase, parameterized.TestCase):
+
+  def test_conv_kernel_mask_fc(self, *input_shape):
+    padding = 'valid'
+    kernel_shape = input_shape
+    ndims = len(input_shape)
+    strides = (1,) * ndims
+    output_shape = _get_const_output_shape(input_shape, dim=1)
+    mask = np.ones(input_shape + output_shape, np.bool)
+    self.assertAllEqual(
+        mask,
+        conv_utils.conv_kernel_mask(
+            input_shape,
+            kernel_shape,
+            strides,
+            padding
+        )
+    )
+
+  def test_conv_kernel_mask_diag(self, *input_shape):
+    ndims = len(input_shape)
+    kernel_shape = (1,) * ndims
+    strides = (1,) * ndims
+
+    for padding in ['valid', 'same']:
+      mask = np.identity(int(np.prod(input_shape)), np.bool)
+      mask = np.reshape(mask, input_shape * 2)
+      self.assertAllEqual(
+          mask,
+          conv_utils.conv_kernel_mask(
+              input_shape,
+              kernel_shape,
+              strides,
+              padding
+          )
+      )
+
+  def test_conv_kernel_mask_full_stride(self, *input_shape):
+    padding = 'valid'
+    ndims = len(input_shape)
+    kernel_shape = (1,) * ndims
+    strides = tuple([max(d, 1) for d in input_shape])
+    output_shape = _get_const_output_shape(input_shape, dim=1)
+
+    mask = np.zeros(input_shape + output_shape, np.bool)
+    if all(d > 0 for d in mask.shape):
+      mask[(0,) * len(output_shape)] = True
+
+    self.assertAllEqual(
+        mask,
+        conv_utils.conv_kernel_mask(
+            input_shape,
+            kernel_shape,
+            strides,
+            padding
+        )
+    )
+
+  def test_conv_kernel_mask_almost_full_stride(self, *input_shape):
+    padding = 'valid'
+    ndims = len(input_shape)
+    kernel_shape = (1,) * ndims
+    strides = tuple([max(d - 1, 1) for d in input_shape])
+    output_shape = _get_const_output_shape(input_shape, dim=2)
+
+    mask = np.zeros(input_shape + output_shape, np.bool)
+    if all(d > 0 for d in mask.shape):
+      for in_position in itertools.product(*[[0, d - 1] for d in input_shape]):
+        out_position = tuple([min(p, 1) for p in in_position])
+        mask[in_position + out_position] = True
+
+    self.assertAllEqual(
+        mask,
+        conv_utils.conv_kernel_mask(
+            input_shape,
+            kernel_shape,
+            strides,
+            padding
+        )
+    )
+
+  def test_conv_kernel_mask_rect_kernel(self, *input_shape):
+    padding = 'valid'
+    ndims = len(input_shape)
+    strides = (1,) * ndims
+
+    for d in range(ndims):
+      kernel_shape = [1] * ndims
+      kernel_shape[d] = input_shape[d]
+
+      output_shape = list(input_shape)
+      output_shape[d] = min(1, input_shape[d])
+
+      mask = np.identity(int(np.prod(input_shape)), np.bool)
+      mask = np.reshape(mask, input_shape * 2)
+
+      for p in itertools.product(*[range(input_shape[dim])
+                                   for dim in range(ndims)]):
+        p = list(p)
+        p[d] = slice(None)
+        mask[p * 2] = True
+
+      mask = np.take(mask, range(0, min(1, input_shape[d])), ndims + d)
+
+      self.assertAllEqual(
+          mask,
+          conv_utils.conv_kernel_mask(
+              input_shape,
+              kernel_shape,
+              strides,
+              padding
+          )
+      )
+
+  def test_conv_kernel_mask_wrong_padding(self, *input_shape):
+    ndims = len(input_shape)
+    kernel_shape = (1,) * ndims
+    strides = (1,) * ndims
+
+    conv_utils.conv_kernel_mask(
+        input_shape,
+        kernel_shape,
+        strides,
+        'valid'
+    )
+
+    conv_utils.conv_kernel_mask(
+        input_shape,
+        kernel_shape,
+        strides,
+        'same'
+    )
+
+    self.assertRaises(NotImplementedError,
+                      conv_utils.conv_kernel_mask,
+                      input_shape, kernel_shape, strides, 'full')
+
+  def test_conv_kernel_mask_wrong_dims(self, *input_shape):
+    kernel_shape = 1
+    strides = 1
+
+    conv_utils.conv_kernel_mask(
+        input_shape,
+        kernel_shape,
+        strides,
+        'valid'
+    )
+
+    ndims = len(input_shape)
+
+    kernel_shape = (2,) * (ndims + 1)
+    self.assertRaises(ValueError,
+                      conv_utils.conv_kernel_mask,
+                      input_shape, kernel_shape, strides, 'same')
+
+    strides = (1,) * ndims
+    self.assertRaises(ValueError,
+                      conv_utils.conv_kernel_mask,
+                      input_shape, kernel_shape, strides, 'valid')
+
+    kernel_shape = (1,) * ndims
+    strides = (2,) * (ndims - 1)
+    self.assertRaises(ValueError,
+                      conv_utils.conv_kernel_mask,
+                      input_shape, kernel_shape, strides, 'valid')
+
+    strides = (2,) * ndims
+    conv_utils.conv_kernel_mask(
+        input_shape,
+        kernel_shape,
+        strides,
+        'valid'
+    )
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 77792d14f5..c7e94998b4 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -180,6 +180,23 @@ class TestMultiGPUModel(test.TestCase):
           target_tensors=[targets])
       parallel_model.fit(epochs=1, steps_per_epoch=3)
 
+  def test_multi_gpu_with_multi_input_layers(self):
+    gpus = 2
+
+    if not check_if_compatible_devices(gpus=gpus):
+      return
+
+    with self.test_session():
+      inputs = keras.Input((4, 3))
+      init_state = keras.Input((3,))
+      outputs = keras.layers.SimpleRNN(
+          3, return_sequences=True)(inputs, initial_state=init_state)
+      x = [np.random.randn(2, 4, 3), np.random.randn(2, 3)]
+      y = np.random.randn(2, 4, 3)
+      model = keras.Model([inputs, init_state], outputs)
+      parallel_model = keras.utils.multi_gpu_model(model, gpus=gpus)
+      parallel_model.compile(loss='mean_squared_error', optimizer='adam')
+      parallel_model.train_on_batch(x, y)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
index 162e5b2cd6..cfdb3de2aa 100644
--- a/tensorflow/python/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/utils/tf_utils.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond as smart_module
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.util import nest
@@ -109,10 +110,10 @@ def get_reachable_from_inputs(inputs, targets=None):
     if isinstance(x, ops.Operation):
       outputs = x.outputs[:] or []
       outputs += x._control_outputs  # pylint: disable=protected-access
-    elif isinstance(x, ops.Tensor):
-      outputs = x.consumers()
     elif isinstance(x, variables.Variable):
       outputs = [x.op]
+    elif tensor_util.is_tensor(x):
+      outputs = x.consumers()
     else:
       raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x))
 
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 2451dc7257..a9982a7ae0 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -73,6 +73,36 @@ tf_py_test(
 )
 
 tf_py_test(
+    name = "batch_gather_op_test",
+    srcs = ["batch_gather_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
+tf_py_test(
+    name = "batch_scatter_ops_test",
+    srcs = ["batch_scatter_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
+tf_py_test(
     name = "bcast_ops_test",
     size = "small",
     srcs = ["bcast_ops_test.py"],
@@ -571,7 +601,7 @@ tf_py_test(
 
 tf_py_test(
     name = "matrix_logarithm_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["matrix_logarithm_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -634,7 +664,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "parameterized_truncated_normal_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["parameterized_truncated_normal_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -736,6 +766,7 @@ tf_py_test(
     size = "small",
     srcs = ["regex_replace_op_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -949,6 +980,17 @@ tf_py_test(
 )
 
 tf_py_test(
+    name = "string_length_op_test",
+    size = "small",
+    srcs = ["string_length_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+tf_py_test(
     name = "string_strip_op_test",
     size = "small",
     srcs = ["string_strip_op_test.py"],
@@ -2181,7 +2223,6 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:parsing_ops",
     ],
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 40567571e6..81442d12e9 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -245,6 +245,7 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
         array_ops.boolean_mask(tensor, mask).eval()
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class OperatorShapeTest(test_util.TensorFlowTestCase):
 
   def testExpandScalar(self):
@@ -262,7 +263,8 @@ class OperatorShapeTest(test_util.TensorFlowTestCase):
     matrix_squeezed = array_ops.squeeze(matrix, [0])
     self.assertEqual(matrix_squeezed.get_shape(), (3))
 
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(
+        Exception, "Can not squeeze dim.1., expected a dimension of 1, got 3"):
       matrix_squeezed = array_ops.squeeze(matrix, [1])
 
   def testSqueezeScalarDim(self):
@@ -270,6 +272,11 @@ class OperatorShapeTest(test_util.TensorFlowTestCase):
     matrix_squeezed = array_ops.squeeze(matrix, 0)
     self.assertEqual(matrix_squeezed.get_shape(), (3))
 
+  def testExpandDimsWithNonScalarDim(self):
+    with self.assertRaisesRegexp(Exception,
+                                 "must be a tensor with a single value"):
+      array_ops.expand_dims(1, axis=[0, 1])
+
 
 class ReverseV2Test(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/kernel_tests/batch_gather_op_test.py b/tensorflow/python/kernel_tests/batch_gather_op_test.py
new file mode 100644
index 0000000000..8e7ae89f9d
--- /dev/null
+++ b/tensorflow/python/kernel_tests/batch_gather_op_test.py
@@ -0,0 +1,116 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.gather."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+_TEST_TYPES = (dtypes.int64, dtypes.float32,
+               dtypes.complex64, dtypes.complex128)
+
+
+class GatherTest(test.TestCase):
+
+  def _buildParams(self, data, dtype):
+    data = data.astype(dtype.as_numpy_dtype)
+    # For complex types, add an index-dependent imaginary component so we can
+    # tell we got the right value.
+    if dtype.is_complex:
+      return data + 10j * data
+    return data
+
+  def testSimpleGather(self):
+    data = np.array([0, 1, 2, 3, 7, 5, 8, 9, 10, 11, 15, 13])
+    indices = [3, 4]
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices_tf = constant_op.constant(indices)
+        gather_t = array_ops.batch_gather(params, indices_tf)
+        expected_result = np.array([3, 7])
+        np_val = self._buildParams(expected_result, dtype)
+        gather_val = gather_t.eval()
+        self.assertAllEqual(np_val, gather_val)
+        self.assertEqual(np_val.shape, gather_t.get_shape())
+
+  def test2DArray(self):
+    data = np.array([[0, 1, 2, 3, 7, 5], [8, 9, 10, 11, 15, 13]])
+    indices = [[3], [4]]
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices_tf = constant_op.constant(indices)
+        gather_t = array_ops.batch_gather(params, indices_tf)
+        expected_result = np.array([[3], [15]])
+        np_val = self._buildParams(expected_result, dtype)
+        gather_val = gather_t.eval()
+        self.assertAllEqual(np_val, gather_val)
+        self.assertEqual(np_val.shape, gather_t.get_shape())
+
+  def testHigherRank(self):
+    data = np.array([[[0, 1, 2], [3, 7, 5]], [[8, 9, 10], [11, 15, 13]]])
+    indices = [[[2, 0], [1, 2]], [[2, 0], [0, 1]]]
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices_tf = constant_op.constant(indices)
+        gather_t = array_ops.batch_gather(params, indices_tf)
+        gather_val = gather_t.eval()
+        expected_result = np.array([[[2, 0], [7, 5]], [[10, 8], [11, 15]]])
+        np_val = self._buildParams(expected_result, dtype)
+        self.assertAllEqual(np_val, gather_val)
+        self.assertEqual(np_val.shape, gather_t.get_shape())
+
+  def testString(self):
+    params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
+    with self.test_session():
+      indices_tf = constant_op.constant([1])
+      self.assertAllEqual([[b"qwer", b"uiop"]],
+                          array_ops.batch_gather(params, indices_tf).eval())
+
+  def testUnknownIndices(self):
+    params = constant_op.constant([[0, 1, 2]])
+    indices = array_ops.placeholder(dtypes.int32, shape=[None, None])
+    gather_t = array_ops.batch_gather(params, indices)
+    self.assertEqual([1, None], gather_t.get_shape().as_list())
+
+  def testBadIndicesCPU(self):
+    with self.test_session(use_gpu=False):
+      params = [[0, 1, 2], [3, 4, 5]]
+      with self.assertRaisesOpError(r"indices\[0\] = 7 is not in \[0, 2\)"):
+        array_ops.batch_gather(params, [7]).eval()
+
+  def testEmptySlices(self):
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
+        for itype in np.int32, np.int64:
+          params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype)
+          indices = np.array([3, 4], dtype=itype)
+          gather = array_ops.batch_gather(params, indices)
+          self.assertAllEqual(gather.eval(), np.zeros((2, 0, 0)))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
new file mode 100644
index 0000000000..0d41a7e3b3
--- /dev/null
+++ b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
@@ -0,0 +1,129 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.scatter."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def _AsType(v, vtype):
+  return v.astype(vtype) if isinstance(v, np.ndarray) else vtype(v)
+
+
+def _NumpyUpdate(ref, indices, updates):
+  for i, indx in np.ndenumerate(indices):
+    indx = i[:-1] + (indx,)
+    ref[indx] = updates[i]
+
+
+_TF_OPS_TO_NUMPY = {
+    state_ops.batch_scatter_update: _NumpyUpdate,
+}
+
+
+class ScatterTest(test.TestCase):
+
+  def _VariableRankTest(self,
+                        tf_scatter,
+                        vtype,
+                        itype,
+                        repeat_indices=False,
+                        updates_are_scalar=False):
+    np.random.seed(8)
+    with self.test_session(use_gpu=False):
+      for indices_shape in (2,), (3, 7), (3, 4, 7):
+        for extra_shape in (), (5,), (5, 9):
+          # Generate random indices with no duplicates for easy numpy comparison
+          sparse_dim = len(indices_shape) - 1
+          indices = np.random.randint(
+              indices_shape[sparse_dim], size=indices_shape, dtype=itype)
+          updates = _AsType(
+              np.random.randn(*(indices_shape + extra_shape)), vtype)
+
+          old = _AsType(np.random.randn(*(indices_shape + extra_shape)), vtype)
+
+          # Scatter via numpy
+          new = old.copy()
+          np_scatter = _TF_OPS_TO_NUMPY[tf_scatter]
+          np_scatter(new, indices, updates)
+          # Scatter via tensorflow
+          ref = variables.Variable(old)
+          ref.initializer.run()
+          tf_scatter(ref, indices, updates).eval()
+          self.assertAllClose(ref.eval(), new)
+
+  def _VariableRankTests(self,
+                         tf_scatter):
+    vtypes = [np.float32, np.float64]
+    if tf_scatter != state_ops.scatter_div:
+      vtypes.append(np.int32)
+
+    for vtype in vtypes:
+      for itype in (np.int32, np.int64):
+        self._VariableRankTest(tf_scatter, vtype, itype)
+
+  def testVariableRankUpdate(self):
+    vtypes = [np.float32, np.float64]
+    for vtype in vtypes:
+      for itype in (np.int32, np.int64):
+        self._VariableRankTest(
+            state_ops.batch_scatter_update, vtype, itype)
+
+  def testBooleanScatterUpdate(self):
+    with self.test_session(use_gpu=False) as session:
+      var = variables.Variable([True, False])
+      update0 = state_ops.batch_scatter_update(var, [1], [True])
+      update1 = state_ops.batch_scatter_update(
+          var, constant_op.constant(
+              [0], dtype=dtypes.int64), [False])
+      var.initializer.run()
+
+      session.run([update0, update1])
+
+      self.assertAllEqual([False, True], var.eval())
+
+  def testScatterOutOfRange(self):
+    params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
+    updates = np.array([-3, -4, -5]).astype(np.float32)
+    with self.test_session(use_gpu=False):
+      ref = variables.Variable(params)
+      ref.initializer.run()
+
+      # Indices all in range, no problem.
+      indices = np.array([2, 0, 5])
+      state_ops.batch_scatter_update(ref, indices, updates).eval()
+
+      # Test some out of range errors.
+      indices = np.array([-1, 0, 5])
+      with self.assertRaisesOpError(
+          r'indices\[0\] = \[-1\] does not index into shape \[6\]'):
+        state_ops.batch_scatter_update(ref, indices, updates).eval()
+
+      indices = np.array([2, 0, 6])
+      with self.assertRaisesOpError(r'indices\[2\] = \[6\] does not index into '
+                                    r'shape \[6\]'):
+        state_ops.batch_scatter_update(ref, indices, updates).eval()
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index b9910133d8..0dc3c53bc0 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -20,9 +20,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import cond_v2
@@ -158,7 +158,7 @@ class CondV2Test(test.TestCase):
 
     def true_fn():
 
-      @function.Defun()
+      @function.defun
       def fn():
         return x * y * 2.0
 
@@ -172,6 +172,8 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [y])
 
   def testNestedDefunInCond(self):
+    self.skipTest("b/110550782")
+
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -180,10 +182,10 @@ class CondV2Test(test.TestCase):
 
     def false_fn():
 
-      @function.Defun()
+      @function.defun
       def fn():
 
-        @function.Defun()
+        @function.defun
         def nested_fn():
           return x * y * 2.0
 
@@ -196,18 +198,20 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [y])
 
   def testDoubleNestedDefunInCond(self):
+    self.skipTest("b/110550782")
+
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
     def true_fn():
 
-      @function.Defun()
+      @function.defun
       def fn():
 
-        @function.Defun()
+        @function.defun
         def nested_fn():
 
-          @function.Defun()
+          @function.defun
           def nested_nested_fn():
             return x * y * 2.0
 
@@ -368,7 +372,7 @@ class CondV2Test(test.TestCase):
           pred_outer, true_fn, false_fn, name="outer_cond")
 
       # Compute grads inside a Defun.
-      @function.Defun()
+      @function.defun
       def nesting_fn():
         return gradients_impl.gradients(cond_outer, [x, y])
 
@@ -426,10 +430,10 @@ class CondV2Test(test.TestCase):
           pred_outer, true_fn, false_fn, name="outer_cond")
 
       # Compute grads inside a Defun.
-      @function.Defun()
+      @function.defun
       def nesting_fn():
 
-        @function.Defun()
+        @function.defun
         def inner_nesting_fn():
           return gradients_impl.gradients(cond_outer, [x, y])
 
@@ -464,6 +468,7 @@ class CondV2Test(test.TestCase):
             }), [5., 0.])
 
   def testBuildCondAndGradientInsideDefun(self):
+    self.skipTest("b/110550782")
 
     def build_graph():
       pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
@@ -472,7 +477,7 @@ class CondV2Test(test.TestCase):
       y = constant_op.constant(2.0, name="y")
 
       # Build cond and its gradient inside a Defun.
-      @function.Defun()
+      @function.defun
       def fn():
 
         def true_fn():
@@ -718,6 +723,7 @@ class CondV2ContainerTest(test.TestCase):
     Make sure the containers are set correctly for both variable creation
     (tested by variables.Variable) and for stateful ops (tested by FIFOQueue)
     """
+    self.skipTest("b/113048653")
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g):
 
@@ -795,6 +801,7 @@ class CondV2ContainerTest(test.TestCase):
 class CondV2ColocationGroupAndDeviceTest(test.TestCase):
 
   def testColocateWithBeforeCond(self):
+    self.skipTest("b/112414483")
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g):
 
@@ -819,6 +826,7 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
             self.assertEquals(cond_v2.cond_v2(True, fn2, fn2)[0].eval(), 3)
 
   def testColocateWithInAndOutOfCond(self):
+    self.skipTest("b/112414483")
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g):
 
@@ -866,6 +874,7 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
         self.assertTrue(len(run_metadata.partition_graphs) >= 2)
 
   def testDeviceBeforeCond(self):
+    self.skipTest("b/112166045")
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g):
         def fn():
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index ae6875340e..93f5323c41 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -448,7 +448,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError,
-          "Tried to explicitly squeeze dimension 2"):
+          "Can not squeeze dim\[2\]"):
         dynamic_labels.eval(feed_dict=feed_dict)
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
@@ -475,7 +475,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
           label_values, dynamic_labels.eval(feed_dict=feed_dict))
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError,
-          "Tried to explicitly squeeze dimension 2"):
+          "Can not squeeze dim\[2\]"):
         dynamic_predictions.eval(feed_dict=feed_dict)
 
 
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 1a29d0816d..4a3e767f4d 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 import collections
 import math
 import time
+import unittest
 
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -31,6 +32,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as _  # pylint: disable=unused-import
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -38,6 +40,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
@@ -122,6 +125,7 @@ def isum(s, maximum_iterations=None):
   return r_s
 
 
+@test_util.with_cond_v2
 class ControlFlowTest(test.TestCase):
 
   def testRefIdentity(self):
@@ -329,6 +333,9 @@ class ControlFlowTest(test.TestCase):
         res.eval(feed_dict={data: 1.0})
 
   def testCondBool(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     values = constant_op.constant(10)
     fn1 = lambda: math_ops.add(values, 1)
     fn2 = lambda: math_ops.subtract(values, 1)
@@ -377,6 +384,9 @@ class ControlFlowTest(test.TestCase):
               sess.run(r, feed_dict={t: 3})
 
   def testCondIndexedSlices(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       values = constant_op.constant(10)
       indices = constant_op.constant(0)
@@ -392,6 +402,9 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(0, ind)
 
   def testCondSparseTensor(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       values = constant_op.constant([2.0, 4.0], name="values")
       indices = constant_op.constant(
@@ -409,6 +422,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(r.values.get_shape(), (2,))
 
   def testCondResource(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       rv = resource_variable_ops.ResourceVariable(True)
       variables.global_variables_initializer().run()
@@ -422,6 +438,9 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(1.0, control_flow_ops.cond(rv, case, lambda: t).eval())
 
   def testCondIndexedSlicesDifferentTypes(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       values = constant_op.constant(10)
       i_32 = ops.convert_to_tensor(0, name="one", dtype=dtypes.int32)
@@ -465,10 +484,16 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(11, result)
 
   def testCond_1(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     self._testCond_1(use_gpu=False)
     self._testCond_1(use_gpu=True)
 
   def testCond_2(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       x = constant_op.constant(10)
       r = control_flow_ops.cond(
@@ -478,6 +503,9 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(9, result)
 
   def testCond_3(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       x = constant_op.constant(10)
       pred = math_ops.less(1, 2)
@@ -490,6 +518,9 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(12, result)
 
   def testCond_4(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       v1 = variables.Variable(7)
       v2 = variables.Variable(7)
@@ -511,6 +542,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(7, v3.eval())
 
   def testCond_5(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       alive = constant_op.constant(True, name="alive")
       count = constant_op.constant(0, name="count")
@@ -525,6 +559,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(4, count.eval())
 
   def testCond_6(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       v1 = variables.Variable([7])
 
@@ -549,6 +586,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual([11, 12], sess.run(r))
 
   def testCondRef(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       x = gen_state_ops.variable(
           shape=[1],
@@ -562,6 +602,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual([2.0], r.eval())
 
   def testCondWithControl(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session() as sess:
       control_holder = array_ops.placeholder(dtypes.float32, shape=())
       a = constant_op.constant(3)
@@ -601,6 +644,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual([1.0], sess.run(merged_op.output))
 
   def testCondSwitchIdentity(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     # Make sure the recv identity is not removed by optimization.
     with session.Session(config=opt_cfg()) as sess:
       pred = constant_op.constant(True)
@@ -615,6 +661,9 @@ class ControlFlowTest(test.TestCase):
       sess.run(r)
 
   def testCondRecvIdentity(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     # Make sure the switch identity is not removed by optimization.
     with session.Session(config=opt_cfg()) as sess:
       with ops.device(test.gpu_device_name()):
@@ -631,6 +680,9 @@ class ControlFlowTest(test.TestCase):
       sess.run(r)
 
   def testCondGrad_1(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     graph = ops.Graph()
     with graph.as_default():
       x = constant_op.constant(10.0, name="x")
@@ -651,6 +703,9 @@ class ControlFlowTest(test.TestCase):
     self.assertEqual(zero.control_inputs[0].inputs[0].op.type, "Switch")
 
   def testCondGrad_2(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
       x = constant_op.constant(10.0)
@@ -664,6 +719,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(3.0, grad.eval(feed_dict={c: 3}))
 
   def testCondGrad_3(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
       ox = constant_op.constant(10.0)
@@ -681,6 +739,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(30.0, r.eval(feed_dict={c: 3}))
 
   def testNestedCond_Simple(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       x = constant_op.constant(0., name="X")
       y = control_flow_ops.cond(
@@ -696,6 +757,9 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(1.0, result.eval())
 
   def testCondGrad_Gather(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session() as sess:
       v1 = variables.Variable([1.0, 42.0])
       c = array_ops.placeholder(dtypes.int32, shape=[])
@@ -868,6 +932,9 @@ class ControlFlowTest(test.TestCase):
       _ = gradients_impl.gradients(loop_with_maxiter, v)
 
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     v = constant_op.constant(1.0)
 
     def create_while_loop():
@@ -1324,6 +1391,9 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10, sess.run(r, {b: True}))
 
   def testWhileCondWithControl(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     # Ensure that no control edges by an outer control dependency context are
     # added to nodes inside cond/while contexts.
     with self.test_session() as sess:
@@ -1338,6 +1408,9 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(0, sess.run(loop))
 
   def testWhileCondWithControl_1(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       v = variable_scope.get_variable(
           "v", [], initializer=init_ops.constant_initializer(2))
@@ -1360,6 +1433,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(65536.0, v.eval())
 
   def testWhileCondExitControl(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       v = variables.Variable(1)
 
@@ -1383,6 +1459,9 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(99, v.eval())
 
   def testCondWhile_1(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       n = ops.convert_to_tensor(0, name="n")
       c = lambda x: math_ops.less(x, 10)
@@ -1393,6 +1472,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   def testCondWhile_2(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       n = ops.convert_to_tensor(0)
       c = lambda x: math_ops.less(x, 10)
@@ -1403,6 +1485,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   def _testCondWhile_3(self, use_gpu):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session(use_gpu=use_gpu) as sess:
       p = array_ops.placeholder(dtypes.bool)
       n = constant_op.constant(0.0)
@@ -1429,6 +1514,9 @@ class ControlFlowTest(test.TestCase):
     self._testCondWhile_3(use_gpu=True)
 
   def testWhileCond_1(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       i = ops.convert_to_tensor(0, name="i")
       n = ops.convert_to_tensor(10, name="n")
@@ -1444,6 +1532,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   def testWhileCond_2(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       n = ops.convert_to_tensor(0, name="n")
       c = lambda x: math_ops.less(x, 10)
@@ -1452,6 +1543,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(10, r.eval())
 
   def testWhileCond_3(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       n = ops.convert_to_tensor(0)
       c = lambda x: math_ops.less(x, 10)
@@ -1713,6 +1807,9 @@ class ControlFlowTest(test.TestCase):
     self._testWhileGrad_ColocateGradients(colocate=True)
 
   def testWhileGrad_Square(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       v = constant_op.constant(2.0, name="v")
       c = lambda v: math_ops.less(v, 100.0)
@@ -1794,6 +1891,9 @@ class ControlFlowTest(test.TestCase):
     self._testWhileGrad_Mul(use_gpu=True, p_iters=10)
 
   def _testNestedWhileCondWhileGrad(self, use_gpu):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session(use_gpu=use_gpu):
       v = constant_op.constant(1.0)
 
@@ -1832,6 +1932,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(216.0, r[0].eval())
 
   def testWhileGradInCond(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       n = ops.convert_to_tensor(1.0, name="n")
       x = array_ops.placeholder(dtypes.float32, shape=None)
@@ -1880,6 +1983,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
   def testCondGradInNestedWhiles(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     def outer_body(i, x):
       _, x = control_flow_ops.while_loop(
           lambda j, x: j < 3, inner_body, [0, 0.0])
@@ -2193,10 +2299,16 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(1024.0, r.eval())
 
   def testWhileCondGrad_Simple(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     self._testWhileCondGrad_Simple(use_gpu=False)
     self._testWhileCondGrad_Simple(use_gpu=True)
 
   def testWhileCondGrad_UnknownShape(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session() as sess:
       v = array_ops.placeholder(dtypes.float32)
       n = ops.convert_to_tensor(100.0, name="n")
@@ -2543,6 +2655,9 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(5.0, result.eval())
 
   def testOneValueCond(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
       one = ops.convert_to_tensor(1, name="one")
@@ -2558,6 +2673,9 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual([2], i.eval(feed_dict={c: 0}))
 
   def testExampleCond(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       x = ops.convert_to_tensor([-2.0, 2.0], name="x")
       d = array_ops.placeholder(dtypes.int32, shape=[])
@@ -2573,6 +2691,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(2.0 * math.sqrt(2), i.eval(feed_dict={d: 2}))
 
   def testCase(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       x = constant_op.constant(1)
       y = constant_op.constant(2)
@@ -2625,6 +2746,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(r6.eval(), 0)
 
   def testCaseSideEffects(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session() as sess:
       v0 = variables.Variable(-1)
       v1 = variables.Variable(-1)
@@ -2660,6 +2784,9 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(sess.run([v0, v1, v2]), [0, -1, -1])
 
   def testOneOpCond(self):
+    if control_flow_ops._ENABLE_COND_V2:
+      return unittest.skip("disabled when using cond_v2")
+
     with self.test_session():
       v = variables.Variable(0)
       c = ops.convert_to_tensor(0)
diff --git a/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py b/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
index e1920eb568..41ae0b456f 100644
--- a/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
+++ b/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
@@ -188,11 +188,11 @@ class CTCGreedyDecoderTest(test.TestCase):
         ],
         dtype=np.float32)
     # Add arbitrary offset - this is fine
-    input_log_prob_matrix_0 = np.log(input_prob_matrix_0) + 2.0
+    input_prob_matrix_0 = input_prob_matrix_0 + 2.0
 
     # len max_time_steps array of batch_size x depth matrices
     inputs = ([
-        input_log_prob_matrix_0[t, :][np.newaxis, :] for t in range(seq_len_0)
+        input_prob_matrix_0[t, :][np.newaxis, :] for t in range(seq_len_0)
     ]  # Pad to max_time_steps = 8
               + 2 * [np.zeros(
                   (1, depth), dtype=np.float32)])
@@ -200,11 +200,11 @@ class CTCGreedyDecoderTest(test.TestCase):
     # batch_size length vector of sequence_lengths
     seq_lens = np.array([seq_len_0], dtype=np.int32)
 
-    # batch_size length vector of negative log probabilities
+    # batch_size length vector of log probabilities
     log_prob_truth = np.array(
         [
-            0.584855,  # output beam 0
-            0.389139  # output beam 1
+            -5.811451,  # output beam 0
+            -6.63339  # output beam 1
         ],
         np.float32)[np.newaxis, :]
 
@@ -215,11 +215,11 @@ class CTCGreedyDecoderTest(test.TestCase):
             [[0, 0], [0, 1]], dtype=np.int64), np.array(
                 [1, 0], dtype=np.int64), np.array(
                     [1, 2], dtype=np.int64)),
-        # beam 1, batch 0, three outputs decoded
+        # beam 1, batch 0, one output decoded
         (np.array(
-            [[0, 0], [0, 1], [0, 2]], dtype=np.int64), np.array(
-                [0, 1, 0], dtype=np.int64), np.array(
-                    [1, 3], dtype=np.int64)),
+            [[0, 0]], dtype=np.int64), np.array(
+                [1], dtype=np.int64), np.array(
+                    [1, 1], dtype=np.int64)),
     ]
 
     # Test correct decoding.
diff --git a/tensorflow/python/kernel_tests/distributions/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
index d8939433ce..c6bb06eab3 100644
--- a/tensorflow/python/kernel_tests/distributions/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -47,7 +47,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
   def testP(self):
     p = [0.2, 0.8]
     dist = categorical.Categorical(probs=p)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(p, dist.probs.eval())
       self.assertAllEqual([2], dist.logits.get_shape())
 
@@ -55,14 +55,14 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     p = np.array([0.2, 0.8], dtype=np.float32)
     logits = np.log(p) - 50.
     dist = categorical.Categorical(logits=logits)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([2], dist.probs.get_shape())
       self.assertAllEqual([2], dist.logits.get_shape())
       self.assertAllClose(dist.probs.eval(), p)
       self.assertAllClose(dist.logits.eval(), logits)
 
   def testShapes(self):
-    with self.test_session():
+    with self.cached_session():
       for batch_shape in ([], [1], [2, 3, 4]):
         dist = make_categorical(batch_shape, 10)
         self.assertAllEqual(batch_shape, dist.batch_shape)
@@ -108,7 +108,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertEqual(dist.dtype, dist.sample(5).dtype)
 
   def testUnknownShape(self):
-    with self.test_session():
+    with self.cached_session():
       logits = array_ops.placeholder(dtype=dtypes.float32)
       dist = categorical.Categorical(logits)
       sample = dist.sample()
@@ -124,13 +124,13 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
   def testPMFWithBatch(self):
     histograms = [[0.2, 0.8], [0.6, 0.4]]
     dist = categorical.Categorical(math_ops.log(histograms) - 50.)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(dist.prob([0, 1]).eval(), [0.2, 0.4])
 
   def testPMFNoBatch(self):
     histograms = [0.2, 0.8]
     dist = categorical.Categorical(math_ops.log(histograms) - 50.)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(dist.prob(0).eval(), 0.2)
 
   def testCDFWithDynamicEventShapeKnownNdims(self):
@@ -162,7 +162,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
         event: event_feed_two
     }
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_cdf_one = sess.run(cdf_op, feed_dict=feed_dict_one)
       actual_cdf_two = sess.run(cdf_op, feed_dict=feed_dict_two)
 
@@ -192,7 +192,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     dist = categorical.Categorical(probs=histograms)
     cdf_op = dist.cdf(event)
 
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(cdf_op.eval(), expected_cdf)
 
   def testCDFNoBatch(self):
@@ -202,7 +202,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     dist = categorical.Categorical(probs=histogram)
     cdf_op = dist.cdf(event)
 
-    with self.test_session():
+    with self.cached_session():
       self.assertAlmostEqual(cdf_op.eval(), expected_cdf)
 
   def testCDFBroadcasting(self):
@@ -228,7 +228,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     expected_cdf_result[2, 0] = 0.3
     expected_cdf_result[2, 1] = 0.75
 
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(dist.cdf(devent).eval(), expected_cdf_result)
 
   def testBroadcastWithBatchParamsAndBiggerEvent(self):
@@ -286,7 +286,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
         "norm_log_cdf": norm.log_cdf(real_event_tf),
     }
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       run_result = sess.run(to_run)
 
     self.assertAllEqual(run_result["cat_prob"].shape,
@@ -301,28 +301,28 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
   def testLogPMF(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = categorical.Categorical(logits)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(dist.log_prob([0, 1]).eval(), np.log([0.2, 0.4]))
       self.assertAllClose(dist.log_prob([0.0, 1.0]).eval(), np.log([0.2, 0.4]))
 
   def testEntropyNoBatch(self):
     logits = np.log([0.2, 0.8]) - 50.
     dist = categorical.Categorical(logits)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(dist.entropy().eval(),
                           -(0.2 * np.log(0.2) + 0.8 * np.log(0.8)))
 
   def testEntropyWithBatch(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = categorical.Categorical(logits)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(dist.entropy().eval(), [
           -(0.2 * np.log(0.2) + 0.8 * np.log(0.8)),
           -(0.6 * np.log(0.6) + 0.4 * np.log(0.4))
       ])
 
   def testEntropyGradient(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])
 
       probabilities = nn_ops.softmax(logits)
@@ -348,7 +348,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
                           res["categorical_entropy_g"])
 
   def testSample(self):
-    with self.test_session():
+    with self.cached_session():
       histograms = [[[0.2, 0.8], [0.4, 0.6]]]
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
       n = 10000
@@ -366,7 +366,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
               sample_values == 1, axis=0), atol=1e-2)
 
   def testSampleWithSampleShape(self):
-    with self.test_session():
+    with self.cached_session():
       histograms = [[[0.2, 0.8], [0.4, 0.6]]]
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
       samples = dist.sample((100, 100), seed=123)
@@ -387,7 +387,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     self.assertIsNone(grad_p)
 
   def testLogPMFBroadcasting(self):
-    with self.test_session():
+    with self.cached_session():
       # 1 x 2 x 2
       histograms = [[[0.2, 0.8], [0.4, 0.6]]]
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
@@ -415,7 +415,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
                           prob.eval())
 
   def testLogPMFShape(self):
-    with self.test_session():
+    with self.cached_session():
       # shape [1, 2, 2]
       histograms = [[[0.2, 0.8], [0.4, 0.6]]]
       dist = categorical.Categorical(math_ops.log(histograms))
@@ -441,7 +441,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual([2, 2, 2], log_prob.get_shape())
 
   def testMode(self):
-    with self.test_session():
+    with self.cached_session():
       histograms = [[[0.2, 0.8], [0.6, 0.4]]]
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
       self.assertAllEqual(dist.mode().eval(), [[1, 0]])
@@ -452,7 +452,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       exp_logits = np.exp(logits)
       return exp_logits / exp_logits.sum(axis=-1, keepdims=True)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for categories in [2, 4]:
         for batch_size in [1, 10]:
           a_logits = np.random.randn(batch_size, categories)
diff --git a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
index 1b9edcc85a..d558ca09cc 100644
--- a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
@@ -37,7 +37,7 @@ class DirichletMultinomialTest(test.TestCase):
     self._rng = np.random.RandomState(42)
 
   def testSimpleShapes(self):
-    with self.test_session():
+    with self.cached_session():
       alpha = np.random.rand(3)
       dist = ds.DirichletMultinomial(1., alpha)
       self.assertEqual(3, dist.event_shape_tensor().eval())
@@ -46,7 +46,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([]), dist.batch_shape)
 
   def testComplexShapes(self):
-    with self.test_session():
+    with self.cached_session():
       alpha = np.random.rand(3, 2, 2)
       n = [[3., 2], [4, 5], [6, 7]]
       dist = ds.DirichletMultinomial(n, alpha)
@@ -58,14 +58,14 @@ class DirichletMultinomialTest(test.TestCase):
   def testNproperty(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(n, alpha)
       self.assertEqual([1, 1], dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
   def testAlphaProperty(self):
     alpha = [[1., 2, 3]]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(1, alpha)
       self.assertEqual([1, 3], dist.concentration.get_shape())
       self.assertAllClose(alpha, dist.concentration.eval())
@@ -73,7 +73,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfNandCountsAgree(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(n, alpha, validate_args=True)
       dist.prob([2., 3, 0]).eval()
       dist.prob([3., 0, 2]).eval()
@@ -86,7 +86,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfNonIntegerCounts(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(n, alpha, validate_args=True)
       dist.prob([2., 3, 0]).eval()
       dist.prob([3., 0, 2]).eval()
@@ -104,7 +104,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfBothZeroBatches(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
-    with self.test_session():
+    with self.cached_session():
       # Both zero-batches.  No broadcast
       alpha = [1., 2]
       counts = [1., 0]
@@ -116,7 +116,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfBothZeroBatchesNontrivialN(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
-    with self.test_session():
+    with self.cached_session():
       # Both zero-batches.  No broadcast
       alpha = [1., 2]
       counts = [3., 2]
@@ -128,7 +128,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfBothZeroBatchesMultidimensionalN(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
-    with self.test_session():
+    with self.cached_session():
       alpha = [1., 2]
       counts = [3., 2]
       n = np.full([4, 3], 5., dtype=np.float32)
@@ -140,7 +140,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfAlphaStretchedInBroadcastWhenSameRank(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
-    with self.test_session():
+    with self.cached_session():
       alpha = [[1., 2]]
       counts = [[1., 0], [0., 1]]
       dist = ds.DirichletMultinomial([1.], alpha)
@@ -151,7 +151,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfAlphaStretchedInBroadcastWhenLowerRank(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
-    with self.test_session():
+    with self.cached_session():
       alpha = [1., 2]
       counts = [[1., 0], [0., 1]]
       pmf = ds.DirichletMultinomial(1., alpha).prob(counts)
@@ -161,7 +161,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
-    with self.test_session():
+    with self.cached_session():
       alpha = [[1., 2], [2., 3]]
       counts = [[1., 0]]
       pmf = ds.DirichletMultinomial([1., 1.], alpha).prob(counts)
@@ -171,7 +171,7 @@ class DirichletMultinomialTest(test.TestCase):
   def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
-    with self.test_session():
+    with self.cached_session():
       alpha = [[1., 2], [2., 3]]
       counts = [1., 0]
       pmf = ds.DirichletMultinomial(1., alpha).prob(counts)
@@ -182,7 +182,7 @@ class DirichletMultinomialTest(test.TestCase):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
     alpha = [1., 2, 3]
-    with self.test_session():
+    with self.cached_session():
       for class_num in range(3):
         counts = np.zeros([3], dtype=np.float32)
         counts[class_num] = 1
@@ -199,7 +199,7 @@ class DirichletMultinomialTest(test.TestCase):
     # DirichletMultinomial(2, alpha) is twice as much as the probability of one
     # vote falling into class k for DirichletMultinomial(1, alpha)
     alpha = [1., 2, 3]
-    with self.test_session():
+    with self.cached_session():
       for class_num in range(3):
         counts_one = np.zeros([3], dtype=np.float32)
         counts_one[class_num] = 1.
@@ -223,7 +223,7 @@ class DirichletMultinomialTest(test.TestCase):
     # Ideally we'd be able to test broadcasting but, the multinomial sampler
     # doesn't support different total counts.
     n = np.float32(5)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # batch_shape=[2], event_shape=[3]
       dist = ds.DirichletMultinomial(n, alpha)
       x = dist.sample(int(250e3), seed=1)
@@ -281,7 +281,7 @@ class DirichletMultinomialTest(test.TestCase):
         variance_entry(alpha[1], alpha_0)
     ]])
 
-    with self.test_session():
+    with self.cached_session():
       for n in ns:
         # n is shape [] and alpha is shape [2].
         dist = ds.DirichletMultinomial(n, alpha)
@@ -319,7 +319,7 @@ class DirichletMultinomialTest(test.TestCase):
         ]]],
         dtype=np.float32)
 
-    with self.test_session():
+    with self.cached_session():
       # ns is shape [4, 1], and alpha is shape [4, 3].
       dist = ds.DirichletMultinomial(ns, alpha)
       covariance = dist.covariance()
@@ -336,7 +336,7 @@ class DirichletMultinomialTest(test.TestCase):
     ns = np.random.randint(low=1, high=11, size=[3, 5, 1]).astype(np.float32)
     ns2 = np.random.randint(low=1, high=11, size=[6, 1, 1]).astype(np.float32)
 
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(ns, alpha)
       dist2 = ds.DirichletMultinomial(ns2, alpha2)
 
@@ -350,7 +350,7 @@ class DirichletMultinomialTest(test.TestCase):
     # probability 1.
     alpha = [5, 0.5]
     counts = [0., 0]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(0., alpha)
       pmf = dist.prob(counts)
       self.assertAllClose(1.0, pmf.eval())
@@ -365,7 +365,7 @@ class DirichletMultinomialTest(test.TestCase):
     # One (three sided) coin flip.  Prob[coin 3] = 0.8.
     # Note that since it was one flip, value of tau didn't matter.
     counts = [0., 0, 1]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
       self.assertAllClose(0.8, pmf.eval(), atol=1e-4)
@@ -373,7 +373,7 @@ class DirichletMultinomialTest(test.TestCase):
 
     # Two (three sided) coin flips.  Prob[coin 3] = 0.8.
     counts = [0., 0, 2]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(2., alpha)
       pmf = dist.prob(counts)
       self.assertAllClose(0.8**2, pmf.eval(), atol=1e-2)
@@ -381,7 +381,7 @@ class DirichletMultinomialTest(test.TestCase):
 
     # Three (three sided) coin flips.
     counts = [1., 0, 2]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(3., alpha)
       pmf = dist.prob(counts)
       self.assertAllClose(3 * 0.1 * 0.8 * 0.8, pmf.eval(), atol=1e-2)
@@ -396,7 +396,7 @@ class DirichletMultinomialTest(test.TestCase):
 
     # If there is only one draw, it is still a coin flip, even with small tau.
     counts = [1., 0]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
       self.assertAllClose(0.5, pmf.eval())
@@ -405,7 +405,7 @@ class DirichletMultinomialTest(test.TestCase):
     # If there are two draws, it is much more likely that they are the same.
     counts_same = [2., 0]
     counts_different = [1, 1.]
-    with self.test_session():
+    with self.cached_session():
       dist = ds.DirichletMultinomial(2., alpha)
       pmf_same = dist.prob(counts_same)
       pmf_different = dist.prob(counts_different)
@@ -414,7 +414,7 @@ class DirichletMultinomialTest(test.TestCase):
 
   def testNonStrictTurnsOffAllChecks(self):
     # Make totally invalid input.
-    with self.test_session():
+    with self.cached_session():
       alpha = [[-1., 2]]  # alpha should be positive.
       counts = [[1., 0], [0., -1]]  # counts should be non-negative.
       n = [-5.3]  # n should be a non negative integer equal to counts.sum.
@@ -422,7 +422,7 @@ class DirichletMultinomialTest(test.TestCase):
       dist.prob(counts).eval()  # Should not raise.
 
   def testSampleUnbiasedNonScalarBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = ds.DirichletMultinomial(
           total_count=5.,
           concentration=1. + 2. * self._rng.rand(4, 3, 2).astype(np.float32))
@@ -451,7 +451,7 @@ class DirichletMultinomialTest(test.TestCase):
           actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
 
   def testSampleUnbiasedScalarBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = ds.DirichletMultinomial(
           total_count=5.,
           concentration=1. + 2. * self._rng.rand(4).astype(np.float32))
diff --git a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
index b347c20db2..e35a8e1cdd 100644
--- a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
@@ -42,7 +42,7 @@ class IdentityBijectorTest(test.TestCase):
             bijector.forward_log_det_jacobian(x, event_ndims=3)))
 
   def testScalarCongruency(self):
-    with self.test_session():
+    with self.cached_session():
       bijector = identity_bijector.Identity()
       bijector_test_util.assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
diff --git a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index d0fa1fe989..e77e1117d4 100644
--- a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -58,7 +58,7 @@ class KLTest(test.TestCase):
 
     # pylint: disable=unused-argument,unused-variable
 
-    with self.test_session():
+    with self.cached_session():
       a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=False)
       kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
diff --git a/tensorflow/python/kernel_tests/distributions/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
index bfd40ba2b7..3840d7331c 100644
--- a/tensorflow/python/kernel_tests/distributions/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -34,7 +34,7 @@ class MultinomialTest(test.TestCase):
     self._rng = np.random.RandomState(42)
 
   def testSimpleShapes(self):
-    with self.test_session():
+    with self.cached_session():
       p = [.1, .3, .6]
       dist = multinomial.Multinomial(total_count=1., probs=p)
       self.assertEqual(3, dist.event_shape_tensor().eval())
@@ -43,7 +43,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([]), dist.batch_shape)
 
   def testComplexShapes(self):
-    with self.test_session():
+    with self.cached_session():
       p = 0.5 * np.ones([3, 2, 2], dtype=np.float32)
       n = [[3., 2], [4, 5], [6, 7]]
       dist = multinomial.Multinomial(total_count=n, probs=p)
@@ -55,14 +55,14 @@ class MultinomialTest(test.TestCase):
   def testN(self):
     p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
     n = [[3.], [4]]
-    with self.test_session():
+    with self.cached_session():
       dist = multinomial.Multinomial(total_count=n, probs=p)
       self.assertEqual((2, 1), dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
   def testP(self):
     p = [[0.1, 0.2, 0.7]]
-    with self.test_session():
+    with self.cached_session():
       dist = multinomial.Multinomial(total_count=3., probs=p)
       self.assertEqual((1, 3), dist.probs.get_shape())
       self.assertEqual((1, 3), dist.logits.get_shape())
@@ -71,7 +71,7 @@ class MultinomialTest(test.TestCase):
   def testLogits(self):
     p = np.array([[0.1, 0.2, 0.7]], dtype=np.float32)
     logits = np.log(p) - 50.
-    with self.test_session():
+    with self.cached_session():
       multinom = multinomial.Multinomial(total_count=3., logits=logits)
       self.assertEqual((1, 3), multinom.probs.get_shape())
       self.assertEqual((1, 3), multinom.logits.get_shape())
@@ -80,7 +80,7 @@ class MultinomialTest(test.TestCase):
 
   def testPmfUnderflow(self):
     logits = np.array([[-200, 0]], dtype=np.float32)
-    with self.test_session():
+    with self.cached_session():
       dist = multinomial.Multinomial(total_count=1., logits=logits)
       lp = dist.log_prob([1., 0.]).eval()[0]
       self.assertAllClose(-200, lp, atol=0, rtol=1e-6)
@@ -88,7 +88,7 @@ class MultinomialTest(test.TestCase):
   def testPmfandCountsAgree(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
-    with self.test_session():
+    with self.cached_session():
       dist = multinomial.Multinomial(total_count=n, probs=p, validate_args=True)
       dist.prob([2., 3, 0]).eval()
       dist.prob([3., 0, 2]).eval()
@@ -100,7 +100,7 @@ class MultinomialTest(test.TestCase):
   def testPmfNonIntegerCounts(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
-    with self.test_session():
+    with self.cached_session():
       # No errors with integer n.
       multinom = multinomial.Multinomial(
           total_count=n, probs=p, validate_args=True)
@@ -122,7 +122,7 @@ class MultinomialTest(test.TestCase):
       multinom.prob([1.0, 2.5, 1.5]).eval()
 
   def testPmfBothZeroBatches(self):
-    with self.test_session():
+    with self.cached_session():
       # Both zero-batches.  No broadcast
       p = [0.5, 0.5]
       counts = [1., 0]
@@ -131,7 +131,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesNontrivialN(self):
-    with self.test_session():
+    with self.cached_session():
       # Both zero-batches.  No broadcast
       p = [0.1, 0.9]
       counts = [3., 2]
@@ -142,7 +142,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((), pmf.get_shape())
 
   def testPmfPStretchedInBroadcastWhenSameRank(self):
-    with self.test_session():
+    with self.cached_session():
       p = [[0.1, 0.9]]
       counts = [[1., 0], [0, 1]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
@@ -150,7 +150,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((2), pmf.get_shape())
 
   def testPmfPStretchedInBroadcastWhenLowerRank(self):
-    with self.test_session():
+    with self.cached_session():
       p = [0.1, 0.9]
       counts = [[1., 0], [0, 1]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
@@ -158,7 +158,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((2), pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
-    with self.test_session():
+    with self.cached_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
       counts = [[1., 0]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
@@ -166,7 +166,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((2), pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
-    with self.test_session():
+    with self.cached_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
       counts = [1., 0]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
@@ -174,7 +174,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(pmf.get_shape(), (2))
 
   def testPmfShapeCountsStretchedN(self):
-    with self.test_session():
+    with self.cached_session():
       # [2, 2, 2]
       p = [[[0.1, 0.9], [0.1, 0.9]], [[0.7, 0.3], [0.7, 0.3]]]
       # [2, 2]
@@ -186,7 +186,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(pmf.get_shape(), (2, 2))
 
   def testPmfShapeCountsPStretchedN(self):
-    with self.test_session():
+    with self.cached_session():
       p = [0.1, 0.9]
       counts = [3., 2]
       n = np.full([4, 3], 5., dtype=np.float32)
@@ -195,7 +195,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((4, 3), pmf.get_shape())
 
   def testMultinomialMean(self):
-    with self.test_session():
+    with self.cached_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
       dist = multinomial.Multinomial(total_count=n, probs=p)
@@ -204,7 +204,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(expected_means, dist.mean().eval())
 
   def testMultinomialCovariance(self):
-    with self.test_session():
+    with self.cached_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
       dist = multinomial.Multinomial(total_count=n, probs=p)
@@ -215,7 +215,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(expected_covariances, dist.covariance().eval())
 
   def testMultinomialCovarianceBatch(self):
-    with self.test_session():
+    with self.cached_session():
       # Shape [2]
       n = [5.] * 2
       # Shape [4, 1, 2]
@@ -237,7 +237,7 @@ class MultinomialTest(test.TestCase):
     ns = np.random.randint(low=1, high=11, size=[3, 5]).astype(np.float32)
     ns2 = np.random.randint(low=1, high=11, size=[6, 1]).astype(np.float32)
 
-    with self.test_session():
+    with self.cached_session():
       dist = multinomial.Multinomial(ns, p)
       dist2 = multinomial.Multinomial(ns2, p2)
 
@@ -253,7 +253,7 @@ class MultinomialTest(test.TestCase):
                       [2.5, 4, 0.01]], dtype=np.float32)
     theta /= np.sum(theta, 1)[..., array_ops.newaxis]
     n = np.array([[10., 9.], [8., 7.], [6., 5.]], dtype=np.float32)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # batch_shape=[3, 2], event_shape=[3]
       dist = multinomial.Multinomial(n, theta)
       x = dist.sample(int(1000e3), seed=1)
@@ -289,7 +289,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.01, rtol=0.01)
 
   def testSampleUnbiasedNonScalarBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = multinomial.Multinomial(
           total_count=[7., 6., 5.],
           logits=math_ops.log(2. * self._rng.rand(4, 3, 2).astype(np.float32)))
@@ -318,7 +318,7 @@ class MultinomialTest(test.TestCase):
           actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
 
   def testSampleUnbiasedScalarBatch(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dist = multinomial.Multinomial(
           total_count=5.,
           logits=math_ops.log(2. * self._rng.rand(4).astype(np.float32)))
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 55d75cb474..0e83726760 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -663,8 +663,9 @@ class EmbeddingLookupSparseTest(test.TestCase):
         np.ones(np.sum(vals_per_batch_entry)), vals_per_batch_entry)
 
     for num_shards, combiner, dtype, ignore_weights in itertools.product(
-        [1, 5], ["sum", "mean", "sqrtn"], [dtypes.float32,
-                                           dtypes.float64], [True, False]):
+        [1, 5], ["sum", "mean", "sqrtn"],
+        [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64],
+        [True, False]):
 
       with self.test_session():
         p, params, feed_dict = _EmbeddingParams(
@@ -677,6 +678,10 @@ class EmbeddingLookupSparseTest(test.TestCase):
 
         self.assertEqual(embedding_sum.get_shape().as_list(),
                          expected_lookup_result_shape)
+        if dtype in (dtypes.float16, dtypes.bfloat16):
+          self.assertEqual(embedding_sum.dtype, dtypes.float32)
+        else:
+          self.assertEqual(embedding_sum.dtype, dtype)
 
         tf_embedding_sum = embedding_sum.eval(feed_dict=feed_dict)
 
@@ -692,7 +697,14 @@ class EmbeddingLookupSparseTest(test.TestCase):
         if combiner == "sqrtn":
           np_embedding_sum /= np.reshape(
               np.sqrt(np_weight_sq_sum), (batch_size, 1, 1))
-        self.assertAllClose(np_embedding_sum, tf_embedding_sum)
+
+        rtol = 1e-6
+        if dtype == dtypes.bfloat16:
+          rtol = 1e-2
+        elif dtype == dtypes.float16:
+          rtol = 1e-3
+        atol = rtol
+        self.assertAllClose(np_embedding_sum, tf_embedding_sum, rtol, atol)
 
   def testGradientsEmbeddingLookupSparse(self):
     vocab_size = 12
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
index 60090a1510..e1f5a6b620 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
@@ -25,6 +25,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed as random_seed_lib
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
 
@@ -100,6 +102,24 @@ class ExtractImagePatchesGradTest(test.TestCase):
           print('extract_image_patches gradient err: %.4e' % err)
           self.assertLess(err, 1e-4)
 
+  def testConstructGradientWithLargeImages(self):
+    batch_size = 4
+    height = 1024
+    width = 1024
+    ksize = 5
+    images = variable_scope.get_variable('inputs',
+                                         (batch_size, height, width, 1))
+    patches = array_ops.extract_image_patches(images,
+                                              ksizes=[1, ksize, ksize, 1],
+                                              strides=[1, 1, 1, 1],
+                                              rates=[1, 1, 1, 1],
+                                              padding='SAME')
+    # Github issue: #20146
+    # tf.extract_image_patches() gradient very slow at graph construction time
+    gradients = gradients_impl.gradients(patches, images)
+    # Won't time out.
+    self.assertIsNotNone(gradients)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 5db2e9821d..1e76ad7476 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -1075,30 +1075,13 @@ class PartitionedCallTest(test.TestCase):
       with ops.device("/cpu:2"):
         s3 = iterator_ops.Iterator.from_structure(
             (dtypes.float32,)).string_handle()
-      with ops.device(""):
-        # TODO(akshayka): This is unfortunate and brittle. It prevents
-        # `Iterator.from_structure` from assigning the iterator op to 'cpu:0'.
-        #  Remove this hack once we have a way of obtaining metadata about
-        #  function execution.
-        s4 = iterator_ops.Iterator.from_structure(
-            (dtypes.float32,)).string_handle()
-      return s1, s2, s3, s4
+      return s1, s2, s3
 
     with self.test_session(config=config, use_gpu=True) as sess:
-      with ops.device("/cpu:3"):
-        outputs = sess.run(functional_ops.partitioned_call(args=[], f=Body))
-    self.assertIn(compat.as_bytes("CPU:0"), outputs[0])
-    self.assertIn(compat.as_bytes("CPU:1"), outputs[1])
-    self.assertIn(compat.as_bytes("CPU:2"), outputs[2])
-    self.assertIn(compat.as_bytes("CPU:3"), outputs[3])
-
-    with self.test_session(config=config, use_gpu=True):
-      with ops.device("/cpu:0"):
-        outputs = sess.run(functional_ops.partitioned_call(args=[], f=Body))
+      outputs = sess.run(functional_ops.partitioned_call(args=[], f=Body))
     self.assertIn(compat.as_bytes("CPU:0"), outputs[0])
     self.assertIn(compat.as_bytes("CPU:1"), outputs[1])
     self.assertIn(compat.as_bytes("CPU:2"), outputs[2])
-    self.assertIn(compat.as_bytes("CPU:0"), outputs[3])
 
   def testAssignAddResourceVariable(self):
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
index 612a50bcec..99497914f2 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
@@ -191,7 +191,7 @@ class NonSquareLinearOperatorCompositionTest(
         linalg.LinearOperatorFullMatrix(rng.rand(2, 4, 5))
     ]
     operator = linalg.LinearOperatorComposition(operators)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual((2, 3, 5), operator.shape_tensor().eval())
 
   def test_shape_tensors_when_only_dynamically_available(self):
@@ -206,7 +206,7 @@ class NonSquareLinearOperatorCompositionTest(
         linalg.LinearOperatorFullMatrix(mat_ph_2)
     ]
     operator = linalg.LinearOperatorComposition(operators)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual(
           (1, 2, 3, 5), operator.shape_tensor().eval(feed_dict=feed_dict))
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 83cc8c483f..52861ae84a 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -52,7 +52,7 @@ class LinearOperatorDiagTest(
 
   def test_assert_positive_definite_raises_for_zero_eigenvalue(self):
     # Matrix with one positive eigenvalue and one zero eigenvalue.
-    with self.test_session():
+    with self.cached_session():
       diag = [1.0, 0.0]
       operator = linalg.LinearOperatorDiag(diag)
 
@@ -62,7 +62,7 @@ class LinearOperatorDiagTest(
         operator.assert_positive_definite().run()
 
   def test_assert_positive_definite_raises_for_negative_real_eigvalues(self):
-    with self.test_session():
+    with self.cached_session():
       diag_x = [1.0, -2.0]
       diag_y = [0., 0.]  # Imaginary eigenvalues should not matter.
       diag = math_ops.complex(diag_x, diag_y)
@@ -74,7 +74,7 @@ class LinearOperatorDiagTest(
         operator.assert_positive_definite().run()
 
   def test_assert_positive_definite_does_not_raise_if_pd_and_complex(self):
-    with self.test_session():
+    with self.cached_session():
       x = [1., 2.]
       y = [1., 0.]
       diag = math_ops.complex(x, y)  # Re[diag] > 0.
@@ -83,14 +83,14 @@ class LinearOperatorDiagTest(
 
   def test_assert_non_singular_raises_if_zero_eigenvalue(self):
     # Singlular matrix with one positive eigenvalue and one zero eigenvalue.
-    with self.test_session():
+    with self.cached_session():
       diag = [1.0, 0.0]
       operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True)
       with self.assertRaisesOpError("Singular operator"):
         operator.assert_non_singular().run()
 
   def test_assert_non_singular_does_not_raise_for_complex_nonsingular(self):
-    with self.test_session():
+    with self.cached_session():
       x = [1., 0.]
       y = [0., 1.]
       diag = math_ops.complex(x, y)
@@ -98,7 +98,7 @@ class LinearOperatorDiagTest(
       linalg.LinearOperatorDiag(diag).assert_non_singular().run()
 
   def test_assert_self_adjoint_raises_if_diag_has_complex_part(self):
-    with self.test_session():
+    with self.cached_session():
       x = [1., 0.]
       y = [0., 1.]
       diag = math_ops.complex(x, y)
@@ -107,7 +107,7 @@ class LinearOperatorDiagTest(
         operator.assert_self_adjoint().run()
 
   def test_assert_self_adjoint_does_not_raise_for_diag_with_zero_imag(self):
-    with self.test_session():
+    with self.cached_session():
       x = [1., 0.]
       y = [0., 0.]
       diag = math_ops.complex(x, y)
@@ -123,7 +123,7 @@ class LinearOperatorDiagTest(
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.matmul cannot handle.
     # In particular, tf.matmul does not broadcast.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = random_ops.random_normal(shape=(2, 2, 3, 4))
 
       # This LinearOperatorDiag will be broadcast to (2, 2, 3, 3) during solve
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
index 1a40a29ec6..8373b5263f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
@@ -65,7 +65,7 @@ class SquareLinearOperatorFullMatrixTest(
     self.assertTrue(operator.is_square)
 
   def test_assert_non_singular_raises_if_cond_too_big_but_finite(self):
-    with self.test_session():
+    with self.cached_session():
       tril = linear_operator_test_util.random_tril_matrix(
           shape=(50, 50), dtype=np.float32)
       diag = np.logspace(-2, 2, 50).astype(np.float32)
@@ -80,7 +80,7 @@ class SquareLinearOperatorFullMatrixTest(
         operator.assert_non_singular().run()
 
   def test_assert_non_singular_raises_if_cond_infinite(self):
-    with self.test_session():
+    with self.cached_session():
       matrix = [[1., 1.], [1., 1.]]
       # We don't pass the is_self_adjoint hint here, which means we take the
       # generic code path.
@@ -91,14 +91,14 @@ class SquareLinearOperatorFullMatrixTest(
   def test_assert_self_adjoint(self):
     matrix = [[0., 1.], [0., 1.]]
     operator = linalg.LinearOperatorFullMatrix(matrix)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("not equal to its adjoint"):
         operator.assert_self_adjoint().run()
 
   def test_assert_positive_definite(self):
     matrix = [[1., 1.], [1., 1.]]
     operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=True)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("Cholesky decomposition was not success"):
         operator.assert_positive_definite().run()
 
@@ -158,7 +158,7 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
     matrix = [[1., 1.], [1., 1.]]
     operator = linalg.LinearOperatorFullMatrix(
         matrix, is_self_adjoint=True, is_positive_definite=True)
-    with self.test_session():
+    with self.cached_session():
       # Cholesky decomposition may fail, so the error is not specific to
       # non-singular.
       with self.assertRaisesOpError(""):
@@ -168,7 +168,7 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
     matrix = [[0., 1.], [0., 1.]]
     operator = linalg.LinearOperatorFullMatrix(
         matrix, is_self_adjoint=True, is_positive_definite=True)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("not equal to its adjoint"):
         operator.assert_self_adjoint().run()
 
@@ -176,7 +176,7 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
     matrix = [[1., 1.], [1., 1.]]
     operator = linalg.LinearOperatorFullMatrix(
         matrix, is_self_adjoint=True, is_positive_definite=True)
-    with self.test_session():
+    with self.cached_session():
       # Cholesky decomposition may fail, so the error is not specific to
       # non-singular.
       with self.assertRaisesOpError(""):
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 35dcf4417c..0c3c6b390f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -57,24 +57,24 @@ class LinearOperatorIdentityTest(
     return operator, mat
 
   def test_assert_positive_definite(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_positive_definite().run()  # Should not fail
 
   def test_assert_non_singular(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_non_singular().run()  # Should not fail
 
   def test_assert_self_adjoint(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_self_adjoint().run()  # Should not fail
 
   def test_float16_matmul(self):
     # float16 cannot be tested by base test class because tf.matrix_solve does
     # not work with float16.
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows=2, dtype=dtypes.float16)
       x = rng.randn(2, 3).astype(np.float16)
@@ -106,7 +106,7 @@ class LinearOperatorIdentityTest(
       linalg_lib.LinearOperatorIdentity(num_rows=2, batch_shape=[-2])
 
   def test_non_scalar_num_rows_raises_dynamic(self):
-    with self.test_session():
+    with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows, assert_proper_shapes=True)
@@ -114,7 +114,7 @@ class LinearOperatorIdentityTest(
         operator.to_dense().eval(feed_dict={num_rows: [2]})
 
   def test_negative_num_rows_raises_dynamic(self):
-    with self.test_session():
+    with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows, assert_proper_shapes=True)
@@ -122,7 +122,7 @@ class LinearOperatorIdentityTest(
         operator.to_dense().eval(feed_dict={num_rows: -2})
 
   def test_non_1d_batch_shape_raises_dynamic(self):
-    with self.test_session():
+    with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True)
@@ -130,7 +130,7 @@ class LinearOperatorIdentityTest(
         operator.to_dense().eval(feed_dict={batch_shape: 2})
 
   def test_negative_batch_shape_raises_dynamic(self):
-    with self.test_session():
+    with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True)
@@ -147,7 +147,7 @@ class LinearOperatorIdentityTest(
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
 
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows, assert_proper_shapes=True)
       y = operator.matmul(x)
@@ -158,7 +158,7 @@ class LinearOperatorIdentityTest(
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = random_ops.random_normal(shape=(1, 2, 3, 4))
       operator = linalg_lib.LinearOperatorIdentity(num_rows=3, dtype=x.dtype)
 
@@ -172,7 +172,7 @@ class LinearOperatorIdentityTest(
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
       operator = linalg_lib.LinearOperatorIdentity(num_rows=3, dtype=x.dtype)
 
@@ -188,7 +188,7 @@ class LinearOperatorIdentityTest(
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Given this x and LinearOperatorIdentity shape of (2, 1, 3, 3), the
       # broadcast shape of operator and 'x' is (2, 2, 3, 4)
       x = random_ops.random_normal(shape=(1, 2, 3, 4))
@@ -209,7 +209,7 @@ class LinearOperatorIdentityTest(
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Given this x and LinearOperatorIdentity shape of (2, 1, 3, 3), the
       # broadcast shape of operator and 'x' is (2, 2, 3, 4)
       x = array_ops.placeholder(dtypes.float32)
@@ -287,39 +287,39 @@ class LinearOperatorScaledIdentityTest(
     return operator, matrix
 
   def test_assert_positive_definite_does_not_raise_when_positive(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=1.)
       operator.assert_positive_definite().run()  # Should not fail
 
   def test_assert_positive_definite_raises_when_negative(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=-1.)
       with self.assertRaisesOpError("not positive definite"):
         operator.assert_positive_definite().run()
 
   def test_assert_non_singular_does_not_raise_when_non_singular(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=[1., 2., 3.])
       operator.assert_non_singular().run()  # Should not fail
 
   def test_assert_non_singular_raises_when_singular(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=[1., 2., 0.])
       with self.assertRaisesOpError("was singular"):
         operator.assert_non_singular().run()
 
   def test_assert_self_adjoint_does_not_raise_when_self_adjoint(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=[1. + 0J])
       operator.assert_self_adjoint().run()  # Should not fail
 
   def test_assert_self_adjoint_raises_when_not_self_adjoint(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=[1. + 1J])
       with self.assertRaisesOpError("not self-adjoint"):
@@ -328,7 +328,7 @@ class LinearOperatorScaledIdentityTest(
   def test_float16_matmul(self):
     # float16 cannot be tested by base test class because tf.matrix_solve does
     # not work with float16.
-    with self.test_session():
+    with self.cached_session():
       multiplier = rng.rand(3).astype(np.float16)
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=multiplier)
@@ -353,7 +353,7 @@ class LinearOperatorScaledIdentityTest(
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
 
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows, multiplier=[1., 2], assert_proper_shapes=True)
       y = operator.matmul(x)
@@ -364,7 +364,7 @@ class LinearOperatorScaledIdentityTest(
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Given this x and LinearOperatorScaledIdentity shape of (2, 1, 3, 3), the
       # broadcast shape of operator and 'x' is (2, 2, 3, 4)
       x = random_ops.random_normal(shape=(1, 2, 3, 4))
@@ -392,7 +392,7 @@ class LinearOperatorScaledIdentityTest(
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Given this x and LinearOperatorScaledIdentity shape of (3, 3), the
       # broadcast shape of operator and 'x' is (1, 2, 3, 4), which is the same
       # shape as x.
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index e26b946151..7e81c9c6c4 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -70,7 +70,7 @@ class KroneckerDenseTest(test.TestCase):
         [10., 15., -2., -3.],
         [5., 10., -1., -2.]], dtype=dtypes.float32)
 
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(_kronecker_dense([x, y]).eval(), z.eval())
       self.assertAllClose(_kronecker_dense([y, x]).eval(), w.eval())
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
index 0e38dbd48d..61268607a4 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
@@ -256,7 +256,7 @@ class LinearOpearatorLowRankUpdateBroadcastsShape(test.TestCase):
 
     # domain_dimension is 3
     self.assertAllEqual([2, 3, 3], operator.shape)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllEqual([2, 3, 3], operator.to_dense().eval().shape)
 
   def test_dynamic_shape_broadcasts_up_from_operator_to_other_args(self):
@@ -274,7 +274,7 @@ class LinearOpearatorLowRankUpdateBroadcastsShape(test.TestCase):
         u_shape_ph: [2, 3, 2],  # batch_shape = [2]
     }
 
-    with self.test_session():
+    with self.cached_session():
       shape_tensor = operator.shape_tensor().eval(feed_dict=feed_dict)
       self.assertAllEqual([2, 3, 3], shape_tensor)
       dense = operator.to_dense().eval(feed_dict=feed_dict)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
index b389e0cbdf..eb4bff915b 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
@@ -51,7 +51,7 @@ class LinearOperatorLowerTriangularTest(
 
   def test_assert_non_singular(self):
     # Singlular matrix with one positive eigenvalue and one zero eigenvalue.
-    with self.test_session():
+    with self.cached_session():
       tril = [[1., 0.], [1., 0.]]
       operator = linalg.LinearOperatorLowerTriangular(tril)
       with self.assertRaisesOpError("Singular operator"):
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
index 8e9f0150a2..819347343b 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
@@ -108,7 +108,7 @@ class LinearOperatorTest(test.TestCase):
     self.assertAllEqual(3, operator.range_dimension)
 
   def test_all_shape_methods_defined_by_the_one_method_shape(self):
-    with self.test_session():
+    with self.cached_session():
       shape = (1, 2, 3, 4)
       operator = LinearOperatorShape(shape)
 
@@ -131,7 +131,7 @@ class LinearOperatorTest(test.TestCase):
   def test_generic_to_dense_method_non_square_matrix_static(self):
     matrix = rng.randn(2, 3, 4)
     operator = LinearOperatorMatmulSolve(matrix)
-    with self.test_session():
+    with self.cached_session():
       operator_dense = operator.to_dense()
       self.assertAllEqual((2, 3, 4), operator_dense.get_shape())
       self.assertAllClose(matrix, operator_dense.eval())
@@ -140,7 +140,7 @@ class LinearOperatorTest(test.TestCase):
     matrix = rng.randn(2, 3, 4)
     matrix_ph = array_ops.placeholder(dtypes.float64)
     operator = LinearOperatorMatmulSolve(matrix_ph)
-    with self.test_session():
+    with self.cached_session():
       operator_dense = operator.to_dense()
       self.assertAllClose(
           matrix, operator_dense.eval(feed_dict={matrix_ph: matrix}))
@@ -149,7 +149,7 @@ class LinearOperatorTest(test.TestCase):
     matrix = [[1., 0], [0., 2.]]
     operator = LinearOperatorMatmulSolve(matrix)
     x = [1., 1.]
-    with self.test_session():
+    with self.cached_session():
       y = operator.matvec(x)
       self.assertAllEqual((2,), y.get_shape())
       self.assertAllClose([1., 2.], y.eval())
@@ -158,7 +158,7 @@ class LinearOperatorTest(test.TestCase):
     matrix = [[1., 0], [0., 2.]]
     operator = LinearOperatorMatmulSolve(matrix)
     y = [1., 1.]
-    with self.test_session():
+    with self.cached_session():
       x = operator.solvevec(y)
       self.assertAllEqual((2,), x.get_shape())
       self.assertAllClose([1., 1 / 2.], x.eval())
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index 7b291e29de..86847d38c2 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -36,7 +36,7 @@ class AssertZeroImagPartTest(test.TestCase):
 
   def test_real_tensor_doesnt_raise(self):
     x = ops.convert_to_tensor([0., 2, 3])
-    with self.test_session():
+    with self.cached_session():
       # Should not raise.
       linear_operator_util.assert_zero_imag_part(x, message="ABC123").run()
 
@@ -44,7 +44,7 @@ class AssertZeroImagPartTest(test.TestCase):
     x = ops.convert_to_tensor([1., 0, 3])
     y = ops.convert_to_tensor([0., 0, 0])
     z = math_ops.complex(x, y)
-    with self.test_session():
+    with self.cached_session():
       # Should not raise.
       linear_operator_util.assert_zero_imag_part(z, message="ABC123").run()
 
@@ -52,7 +52,7 @@ class AssertZeroImagPartTest(test.TestCase):
     x = ops.convert_to_tensor([1., 2, 0])
     y = ops.convert_to_tensor([1., 2, 0])
     z = math_ops.complex(x, y)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("ABC123"):
         linear_operator_util.assert_zero_imag_part(z, message="ABC123").run()
 
@@ -61,7 +61,7 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
 
   def test_nonzero_real_tensor_doesnt_raise(self):
     x = ops.convert_to_tensor([1., 2, 3])
-    with self.test_session():
+    with self.cached_session():
       # Should not raise.
       linear_operator_util.assert_no_entries_with_modulus_zero(
           x, message="ABC123").run()
@@ -70,14 +70,14 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
     x = ops.convert_to_tensor([1., 0, 3])
     y = ops.convert_to_tensor([1., 2, 0])
     z = math_ops.complex(x, y)
-    with self.test_session():
+    with self.cached_session():
       # Should not raise.
       linear_operator_util.assert_no_entries_with_modulus_zero(
           z, message="ABC123").run()
 
   def test_zero_real_tensor_raises(self):
     x = ops.convert_to_tensor([1., 0, 3])
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("ABC123"):
         linear_operator_util.assert_no_entries_with_modulus_zero(
             x, message="ABC123").run()
@@ -86,7 +86,7 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
     x = ops.convert_to_tensor([1., 2, 0])
     y = ops.convert_to_tensor([1., 2, 0])
     z = math_ops.complex(x, y)
-    with self.test_session():
+    with self.cached_session():
       with self.assertRaisesOpError("ABC123"):
         linear_operator_util.assert_no_entries_with_modulus_zero(
             z, message="ABC123").run()
@@ -103,7 +103,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
     tensor, = linear_operator_util.broadcast_matrix_batch_dims([arr])
     self.assertTrue(isinstance(tensor, ops.Tensor))
 
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(arr, tensor.eval())
 
   def test_static_dims_broadcast(self):
@@ -118,7 +118,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
 
     x_bc, y_bc = linear_operator_util.broadcast_matrix_batch_dims([x, y])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape())
       self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape())
       x_bc_, y_bc_ = sess.run([x_bc, y_bc])
@@ -137,7 +137,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
 
     x_bc, y_bc = linear_operator_util.broadcast_matrix_batch_dims([x, y])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape())
       self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape())
       x_bc_, y_bc_ = sess.run([x_bc, y_bc])
@@ -159,7 +159,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
 
     x_bc, y_bc = linear_operator_util.broadcast_matrix_batch_dims([x_ph, y_ph])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x_bc_, y_bc_ = sess.run([x_bc, y_bc], feed_dict={x_ph: x, y_ph: y})
       self.assertAllClose(x_bc_expected, x_bc_)
       self.assertAllClose(y_bc_expected, y_bc_)
@@ -179,7 +179,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
 
     x_bc, y_bc = linear_operator_util.broadcast_matrix_batch_dims([x_ph, y_ph])
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x_bc_, y_bc_ = sess.run([x_bc, y_bc], feed_dict={x_ph: x, y_ph: y})
       self.assertAllClose(x_bc_expected, x_bc_)
       self.assertAllClose(y_bc_expected, y_bc_)
@@ -203,7 +203,7 @@ class CholeskySolveWithBroadcastTest(test.TestCase):
     rhs = rng.rand(2, 3, 7)
     chol_broadcast = chol + np.zeros((2, 1, 1))
 
-    with self.test_session():
+    with self.cached_session():
       result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.cholesky_solve(chol_broadcast, rhs)
@@ -219,7 +219,7 @@ class CholeskySolveWithBroadcastTest(test.TestCase):
     chol_ph = array_ops.placeholder(dtypes.float64)
     rhs_ph = array_ops.placeholder(dtypes.float64)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       result, expected = sess.run(
           [
               linear_operator_util.cholesky_solve_with_broadcast(
@@ -242,7 +242,7 @@ class MatmulWithBroadcastTest(test.TestCase):
     y = rng.rand(3, 7)
     y_broadcast = y + np.zeros((2, 1, 1))
 
-    with self.test_session():
+    with self.cached_session():
       result = linear_operator_util.matmul_with_broadcast(x, y)
       self.assertAllEqual((2, 1, 7), result.get_shape())
       expected = math_ops.matmul(x, y_broadcast)
@@ -258,7 +258,7 @@ class MatmulWithBroadcastTest(test.TestCase):
     x_ph = array_ops.placeholder(dtypes.float64)
     y_ph = array_ops.placeholder(dtypes.float64)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       result, expected = sess.run(
           [
               linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
@@ -279,7 +279,7 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
     rhs = rng.rand(2, 3, 7)
     matrix_broadcast = matrix + np.zeros((2, 1, 1))
 
-    with self.test_session():
+    with self.cached_session():
       result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
@@ -295,7 +295,7 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
     matrix_ph = array_ops.placeholder(dtypes.float64)
     rhs_ph = array_ops.placeholder(dtypes.float64)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       result, expected = sess.run(
           [
               linear_operator_util.matrix_solve_with_broadcast(
@@ -317,7 +317,7 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
     rhs = rng.rand(3, 7)
     rhs_broadcast = rhs + np.zeros((2, 1, 1))
 
-    with self.test_session():
+    with self.cached_session():
       result = linear_operator_util.matrix_triangular_solve_with_broadcast(
           matrix, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
@@ -333,7 +333,7 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
     matrix_ph = array_ops.placeholder(dtypes.float64)
     rhs_ph = array_ops.placeholder(dtypes.float64)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       result, expected = sess.run(
           [
               linear_operator_util.matrix_triangular_solve_with_broadcast(
@@ -359,7 +359,7 @@ class DomainDimensionStubOperator(object):
 class AssertCompatibleMatrixDimensionsTest(test.TestCase):
 
   def test_compatible_dimensions_do_not_raise(self):
-    with self.test_session():
+    with self.cached_session():
       x = ops.convert_to_tensor(rng.rand(2, 3, 4))
       operator = DomainDimensionStubOperator(3)
       # Should not raise
@@ -367,7 +367,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
           operator, x).run()  # pyformat: disable
 
   def test_incompatible_dimensions_raise(self):
-    with self.test_session():
+    with self.cached_session():
       x = ops.convert_to_tensor(rng.rand(2, 4, 4))
       operator = DomainDimensionStubOperator(3)
       with self.assertRaisesOpError("Incompatible matrix dimensions"):
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
index 8f60b55e0a..f0556304ad 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -73,7 +73,7 @@ class LinearOperatorZerosTest(
       operator.assert_non_singular()
 
   def test_assert_self_adjoint(self):
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorZeros(num_rows=2)
       operator.assert_self_adjoint().run()  # Should not fail
 
@@ -108,7 +108,7 @@ class LinearOperatorZerosTest(
       linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=[-2])
 
   def test_non_scalar_num_rows_raises_dynamic(self):
-    with self.test_session():
+    with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
       operator = linalg_lib.LinearOperatorZeros(
           num_rows, assert_proper_shapes=True)
@@ -116,7 +116,7 @@ class LinearOperatorZerosTest(
         operator.to_dense().eval(feed_dict={num_rows: [2]})
 
   def test_negative_num_rows_raises_dynamic(self):
-    with self.test_session():
+    with self.cached_session():
       n = array_ops.placeholder(dtypes.int32)
       operator = linalg_lib.LinearOperatorZeros(
           num_rows=n, assert_proper_shapes=True)
@@ -129,7 +129,7 @@ class LinearOperatorZerosTest(
         operator.to_dense().eval(feed_dict={n: -2})
 
   def test_non_1d_batch_shape_raises_dynamic(self):
-    with self.test_session():
+    with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
       operator = linalg_lib.LinearOperatorZeros(
           num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True)
@@ -137,7 +137,7 @@ class LinearOperatorZerosTest(
         operator.to_dense().eval(feed_dict={batch_shape: 2})
 
   def test_negative_batch_shape_raises_dynamic(self):
-    with self.test_session():
+    with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
       operator = linalg_lib.LinearOperatorZeros(
           num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True)
@@ -154,7 +154,7 @@ class LinearOperatorZerosTest(
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
 
-    with self.test_session():
+    with self.cached_session():
       operator = linalg_lib.LinearOperatorZeros(
           num_rows, assert_proper_shapes=True)
       y = operator.matmul(x)
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index bf82e08551..3193222262 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -421,6 +421,31 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                                  "Invalid data type at index 0"):
       self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testZerosLike(self):
+    for dtype in (dtypes.uint8, dtypes.uint16, dtypes.int8, dtypes.int16,
+                  dtypes.int32, dtypes.int64, dtypes.float16, dtypes.float32,
+                  dtypes.float64, dtypes.complex64, dtypes.complex128,
+                  dtypes.bool):
+      l_empty = list_ops.empty_tensor_list(
+          element_dtype=dtype, element_shape=scalar_shape())
+      l_empty_zeros = array_ops.zeros_like(l_empty)
+      t_empty_zeros = list_ops.tensor_list_stack(
+          l_empty_zeros, element_dtype=dtype)
+
+      l_full = list_ops.tensor_list_push_back(l_empty,
+                                              math_ops.cast(0, dtype=dtype))
+      l_full = list_ops.tensor_list_push_back(l_full,
+                                              math_ops.cast(1, dtype=dtype))
+      l_full_zeros = array_ops.zeros_like(l_full)
+      t_full_zeros = list_ops.tensor_list_stack(
+          l_full_zeros, element_dtype=dtype)
+
+      self.assertAllEqual(self.evaluate(t_empty_zeros), [])
+      self.assertAllEqual(
+          self.evaluate(t_full_zeros), np.zeros(
+              (2,), dtype=dtype.as_numpy_dtype))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
index 24edc4f59f..723a15fbd1 100644
--- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.linalg import linalg_impl
 from tensorflow.python.platform import test
 
 
@@ -39,7 +40,7 @@ class LogarithmOpTest(test.TestCase):
     inp = x.astype(np_type)
     with self.test_session(use_gpu=True):
       # Verify that expm(logm(A)) == A.
-      tf_ans = gen_linalg_ops.matrix_exponential(
+      tf_ans = linalg_impl.matrix_exponential(
           gen_linalg_ops.matrix_logarithm(inp))
       out = tf_ans.eval()
       self.assertAllClose(inp, out, rtol=1e-4, atol=1e-3)
@@ -98,16 +99,25 @@ class LogarithmOpTest(test.TestCase):
     self._verifyLogarithmComplex(np.empty([0, 2, 2], dtype=np.complex64))
     self._verifyLogarithmComplex(np.empty([2, 0, 0], dtype=np.complex64))
 
-  def testRandomSmallAndLarge(self):
+  def testRandomSmallAndLargeComplex64(self):
     np.random.seed(42)
-    for dtype in np.complex64, np.complex128:
-      for batch_dims in [(), (1,), (3,), (2, 2)]:
-        for size in 8, 31, 32:
-          shape = batch_dims + (size, size)
-          matrix = np.random.uniform(
-              low=-1.0, high=1.0,
-              size=np.prod(shape)).reshape(shape).astype(dtype)
-          self._verifyLogarithmComplex(matrix)
+    for batch_dims in [(), (1,), (3,), (2, 2)]:
+      for size in 8, 31, 32:
+        shape = batch_dims + (size, size)
+        matrix = np.random.uniform(
+            low=-1.0, high=1.0,
+            size=np.prod(shape)).reshape(shape).astype(np.complex64)
+        self._verifyLogarithmComplex(matrix)
+
+  def testRandomSmallAndLargeComplex128(self):
+    np.random.seed(42)
+    for batch_dims in [(), (1,), (3,), (2, 2)]:
+      for size in 8, 31, 32:
+        shape = batch_dims + (size, size)
+        matrix = np.random.uniform(
+            low=-1.0, high=1.0,
+            size=np.prod(shape)).reshape(shape).astype(np.complex128)
+        self._verifyLogarithmComplex(matrix)
 
   def testConcurrentExecutesWithoutError(self):
     with self.test_session(use_gpu=True) as sess:
diff --git a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py b/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
index dd67919f69..e14894cf56 100644
--- a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
+++ b/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
@@ -182,6 +182,19 @@ class ParameterizedTruncatedNormalTest(test.TestCase):
   def testSmallStddev(self):
     self.validateKolmogorovSmirnov([10**5], 0.0, 0.1, 0.05, 0.10)
 
+  def testSamplingWithSmallStdDevFarFromBound(self):
+    sample_op = random_ops.parameterized_truncated_normal(
+        shape=(int(1e5),), means=0.8, stddevs=0.05, minvals=-1., maxvals=1.)
+
+    with self.test_session(use_gpu=True) as sess:
+      samples = sess.run(sample_op)
+      # 0. is more than 16 standard deviations from the mean, and
+      # should have a likelihood < 1e-57.
+      # TODO(jjhunt)  Sampler is still numerically unstable in this case,
+      # numbers less than 0 should never observed.
+      no_neg_samples = np.sum(samples < 0.)
+      self.assertLess(no_neg_samples, 2.)
+
 
 # Benchmarking code
 def parameterized_vs_naive(shape, num_iters, use_gpu=False):
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index f5c6255c34..15d5702252 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -31,6 +33,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
 
 
 class PartitionerCreatorsTest(test.TestCase):
@@ -594,6 +597,38 @@ class PartitionedVariablesTestCase(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllClose(value.eval(), var_x.as_tensor().eval())
 
+  def testMetaGraphSaveLoad(self):
+    save_prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    save_graph = ops.Graph()
+    with save_graph.as_default(), self.test_session(
+        graph=save_graph) as session:
+      partitioner = partitioned_variables.fixed_size_partitioner(5, axis=0)
+      with variable_scope.variable_scope("root", partitioner=partitioner):
+        v0 = variable_scope.get_variable(
+            "v0", dtype=dtypes.float32, shape=(10, 10))
+        v0_list = v0._get_variable_list()
+        v0_part = v0._get_partitions()
+        self.assertEqual(len(v0_list), 5)
+        self.assertAllEqual(v0_part, (5, 1))
+        variables.global_variables_initializer().run()
+
+        save_graph.get_collection_ref("partvar").append(v0)
+        saver = saver_lib.Saver()
+        save_graph.finalize()
+        save_path = saver.save(sess=session, save_path=save_prefix)
+        previous_value = session.run(
+            save_graph.get_tensor_by_name(v0.name + ":0"))
+
+    restore_graph = ops.Graph()
+    with restore_graph.as_default(), self.test_session(
+        graph=restore_graph) as session:
+      saver = saver_lib.import_meta_graph(save_path + ".meta")
+      saver.restore(sess=session, save_path=save_path)
+      v0, = save_graph.get_collection_ref("partvar")
+      self.assertIsInstance(v0, variables.PartitionedVariable)
+      self.assertAllEqual(
+          previous_value,
+          session.run(restore_graph.get_tensor_by_name(v0.name + ":0")))
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/random/random_crop_test.py b/tensorflow/python/kernel_tests/random/random_crop_test.py
index 6028be1228..8ded522320 100644
--- a/tensorflow/python/kernel_tests/random/random_crop_test.py
+++ b/tensorflow/python/kernel_tests/random/random_crop_test.py
@@ -30,12 +30,12 @@ class RandomCropTest(test.TestCase):
     # No random cropping is performed since the size is value.shape.
     for shape in (2, 1, 1), (2, 1, 3), (4, 5, 3):
       value = np.arange(0, np.prod(shape), dtype=np.int32).reshape(shape)
-      with self.test_session():
+      with self.cached_session():
         crop = random_ops.random_crop(value, shape).eval()
         self.assertAllEqual(crop, value)
 
   def testContains(self):
-    with self.test_session():
+    with self.cached_session():
       shape = (3, 5, 7)
       target = (2, 3, 4)
       value = np.random.randint(1000000, size=shape)
@@ -57,7 +57,7 @@ class RandomCropTest(test.TestCase):
     single = [1, 1, 1]
     value = np.arange(size).reshape(shape)
 
-    with self.test_session():
+    with self.cached_session():
       crop = random_ops.random_crop(value, single, seed=7)
       counts = np.zeros(size, dtype=np.int32)
       for _ in range(num_samples):
diff --git a/tensorflow/python/kernel_tests/random/random_gamma_test.py b/tensorflow/python/kernel_tests/random/random_gamma_test.py
index aa40228dc1..d969944493 100644
--- a/tensorflow/python/kernel_tests/random/random_gamma_test.py
+++ b/tensorflow/python/kernel_tests/random/random_gamma_test.py
@@ -256,7 +256,7 @@ class RandomGammaTest(test.TestCase):
   def testPositive(self):
     n = int(10e3)
     for dt in [dtypes.float16, dtypes.float32, dtypes.float64]:
-      with self.test_session():
+      with self.cached_session():
         x = random_ops.random_gamma(shape=[n], alpha=0.001, dtype=dt, seed=0)
         self.assertEqual(0, math_ops.reduce_sum(math_ops.cast(
             math_ops.less_equal(x, 0.), dtype=dtypes.int64)).eval())
diff --git a/tensorflow/python/kernel_tests/random/random_grad_test.py b/tensorflow/python/kernel_tests/random/random_grad_test.py
index c1d455b785..d89056c485 100644
--- a/tensorflow/python/kernel_tests/random/random_grad_test.py
+++ b/tensorflow/python/kernel_tests/random/random_grad_test.py
@@ -49,7 +49,7 @@ class AddLeadingUnitDimensionsTest(test.TestCase):
     x = array_ops.placeholder(dtypes.float32)
     num_dimensions = array_ops.placeholder(dtypes.int32)
     ret = random_grad.add_leading_unit_dimensions(x, num_dimensions)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       ret_val = sess.run(ret, {x: np.ones([2, 2]), num_dimensions: 2})
     self.assertAllEqual(ret_val.shape, [1, 1, 2, 2])
 
@@ -99,7 +99,7 @@ class RandomGammaGradTest(test.TestCase):
 
     alpha_val = np.ones([1, 2])
     beta_val = np.ones([2, 1])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       grads_alpha_val, grads_beta_val = sess.run(
           [grads_alpha, grads_beta],
           {alpha: alpha_val, beta: beta_val, shape: [2, 1]})
diff --git a/tensorflow/python/kernel_tests/random/random_poisson_test.py b/tensorflow/python/kernel_tests/random/random_poisson_test.py
index afdf71e652..15ab95cdb7 100644
--- a/tensorflow/python/kernel_tests/random/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random/random_poisson_test.py
@@ -137,7 +137,7 @@ class RandomPoissonTest(test.TestCase):
         self.assertGreaterEqual(np.linalg.norm(diff.eval()), 1)
 
   def testZeroShape(self):
-    with self.test_session():
+    with self.cached_session():
       rnd = random_ops.random_poisson([], [], seed=12345)
       self.assertEqual([0], rnd.get_shape().as_list())
       self.assertAllClose(np.array([], dtype=np.float32), rnd.eval())
@@ -186,7 +186,7 @@ class RandomPoissonTest(test.TestCase):
 
   def testDTypeCombinationsV2(self):
     """Tests random_poisson_v2() for all supported dtype combinations."""
-    with self.test_session():
+    with self.cached_session():
       for lam_dt in _SUPPORTED_DTYPES:
         for out_dt in _SUPPORTED_DTYPES:
           random_ops.random_poisson(
diff --git a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
index b7a79f239c..0d85a072d4 100644
--- a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
+++ b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
@@ -46,7 +46,7 @@ class RandomShuffleQueueTest(test.TestCase):
     tf_logging.error("Finished: %s", self._testMethodName)
 
   def testEnqueue(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 5, dtypes_lib.float32)
       enqueue_op = q.enqueue((10.0,))
       self.assertAllEqual(0, q.size().eval())
@@ -54,7 +54,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertAllEqual(1, q.size().eval())
 
   def testEnqueueWithShape(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(
           10, 5, dtypes_lib.float32, shapes=tensor_shape.TensorShape([3, 2]))
       enqueue_correct_op = q.enqueue(([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],))
@@ -64,7 +64,7 @@ class RandomShuffleQueueTest(test.TestCase):
         q.enqueue(([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],))
 
   def testEnqueueManyWithShape(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(
           10, 5, [dtypes_lib.int32, dtypes_lib.int32], shapes=[(), (2,)])
       q.enqueue_many([[1, 2, 3, 4], [[1, 1], [2, 2], [3, 3], [4, 4]]]).run()
@@ -76,7 +76,7 @@ class RandomShuffleQueueTest(test.TestCase):
       q2.enqueue_many(([[1, 2, 3]],))
 
   def testScalarShapes(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           10, 0, [dtypes_lib.int32, dtypes_lib.int32], shapes=[(), (1,)])
       q.enqueue_many([[1, 2, 3, 4], [[5], [6], [7], [8]]]).run()
@@ -93,7 +93,7 @@ class RandomShuffleQueueTest(test.TestCase):
                             results)
 
   def testParallelEnqueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
       enqueue_ops = [q.enqueue((x,)) for x in elems]
@@ -119,7 +119,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, results)
 
   def testParallelDequeue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
       enqueue_ops = [q.enqueue((x,)) for x in elems]
@@ -143,7 +143,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, results)
 
   def testDequeue(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0]
       enqueue_ops = [q.enqueue((x,)) for x in elems]
@@ -156,7 +156,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, vals)
 
   def testEnqueueAndBlockingDequeue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(3, 0, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0]
       enqueue_ops = [q.enqueue((x,)) for x in elems]
@@ -185,7 +185,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, results)
 
   def testMultiEnqueueAndDequeue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           10, 0, (dtypes_lib.int32, dtypes_lib.float32))
       elems = [(5, 10.0), (10, 20.0), (15, 30.0)]
@@ -202,12 +202,12 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, results)
 
   def testQueueSizeEmpty(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 5, dtypes_lib.float32)
       self.assertEqual(0, q.size().eval())
 
   def testQueueSizeAfterEnqueueAndDequeue(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue()
@@ -220,7 +220,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual([0], size.eval())
 
   def testEnqueueMany(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -234,7 +234,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems + elems, results)
 
   def testEmptyEnqueueMany(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 5, dtypes_lib.float32)
       empty_t = constant_op.constant(
           [], dtype=dtypes_lib.float32, shape=[0, 2, 3])
@@ -246,7 +246,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual(0, size_t.eval())
 
   def testEmptyDequeueMany(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, shapes=())
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
@@ -256,7 +256,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual([], dequeued_t.eval().tolist())
 
   def testEmptyDequeueUpTo(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, shapes=())
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_up_to(0)
@@ -266,7 +266,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual([], dequeued_t.eval().tolist())
 
   def testEmptyDequeueManyWithNoShape(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       enqueue_op = q.enqueue((constant_op.constant(
           [10.0, 20.0], shape=(1, 2)),))
@@ -287,7 +287,7 @@ class RandomShuffleQueueTest(test.TestCase):
         dequeued_t.eval()
 
   def testEmptyDequeueUpToWithNoShape(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       enqueue_op = q.enqueue((constant_op.constant(
           [10.0, 20.0], shape=(1, 2)),))
@@ -308,7 +308,7 @@ class RandomShuffleQueueTest(test.TestCase):
         dequeued_t.eval()
 
   def testMultiEnqueueMany(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           10, 0, (dtypes_lib.float32, dtypes_lib.int32))
       float_elems = [10.0, 20.0, 30.0, 40.0]
@@ -327,7 +327,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(expected, results)
 
   def testDequeueMany(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -340,7 +340,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, results)
 
   def testDequeueUpToNoBlocking(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -353,7 +353,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, results)
 
   def testMultiDequeueMany(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           10, 0, (dtypes_lib.float32, dtypes_lib.int32), shapes=((), (2,)))
       float_elems = [
@@ -387,7 +387,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(zip(float_elems, int_elems), results)
 
   def testMultiDequeueUpToNoBlocking(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           10, 0, (dtypes_lib.float32, dtypes_lib.int32), shapes=((), (2,)))
       float_elems = [
@@ -422,7 +422,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(zip(float_elems, int_elems), results)
 
   def testHighDimension(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.int32, (
           (4, 4, 4, 4)))
       elems = np.array([[[[[x] * 4] * 4] * 4] * 4 for x in range(10)], np.int32)
@@ -433,7 +433,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(dequeued_t.eval().tolist(), elems.tolist())
 
   def testParallelEnqueueMany(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           1000, 0, dtypes_lib.float32, shapes=())
       elems = [10.0 * x for x in range(100)]
@@ -453,7 +453,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(dequeued_t.eval(), elems * 10)
 
   def testParallelDequeueMany(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           1000, 0, dtypes_lib.float32, shapes=())
       elems = [10.0 * x for x in range(1000)]
@@ -476,7 +476,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, dequeued_elems)
 
   def testParallelDequeueUpTo(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           1000, 0, dtypes_lib.float32, shapes=())
       elems = [10.0 * x for x in range(1000)]
@@ -499,7 +499,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, dequeued_elems)
 
   def testParallelDequeueUpToRandomPartition(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dequeue_sizes = [random.randint(50, 150) for _ in xrange(10)]
       total_elements = sum(dequeue_sizes)
       q = data_flow_ops.RandomShuffleQueue(
@@ -527,7 +527,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, dequeued_elems)
 
   def testBlockingDequeueMany(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -554,7 +554,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, dequeued_elems)
 
   def testBlockingDequeueUpTo(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -581,7 +581,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, dequeued_elems)
 
   def testDequeueManyWithTensorParameter(self):
-    with self.test_session():
+    with self.cached_session():
       # Define a first queue that contains integer counts.
       dequeue_counts = [random.randint(1, 10) for _ in range(100)]
       count_q = data_flow_ops.RandomShuffleQueue(100, 0, dtypes_lib.int32)
@@ -607,7 +607,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, dequeued_elems)
 
   def testDequeueUpToWithTensorParameter(self):
-    with self.test_session():
+    with self.cached_session():
       # Define a first queue that contains integer counts.
       dequeue_counts = [random.randint(1, 10) for _ in range(100)]
       count_q = data_flow_ops.RandomShuffleQueue(100, 0, dtypes_lib.int32)
@@ -633,7 +633,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elems, dequeued_elems)
 
   def testDequeueFromClosedQueue(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 2, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -652,7 +652,7 @@ class RandomShuffleQueueTest(test.TestCase):
         dequeued_t.eval()
 
   def testBlockingDequeueFromClosedQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       min_size = 2
       q = data_flow_ops.RandomShuffleQueue(10, min_size, dtypes_lib.float32)
       elems = [10.0, 20.0, 30.0, 40.0]
@@ -690,7 +690,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual(len(results), 4)
 
   def testBlockingDequeueFromClosedEmptyQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       close_op = q.close()
       dequeued_t = q.dequeue()
@@ -715,7 +715,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual(len(finished), 1)
 
   def testBlockingDequeueManyFromClosedQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -751,7 +751,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual(len(progress), 2)
 
   def testBlockingDequeueUpToFromClosedQueueReturnsRemainder(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -778,7 +778,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(results, elems)
 
   def testBlockingDequeueUpToSmallerThanMinAfterDequeue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(
           capacity=10,
           min_after_dequeue=2,
@@ -811,7 +811,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(results, elems)
 
   def testBlockingDequeueManyFromClosedQueueWithElementsRemaining(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -845,7 +845,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual(len(results), 4)
 
   def testBlockingDequeueManyFromClosedEmptyQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 5, dtypes_lib.float32, ((),))
       close_op = q.close()
       dequeued_t = q.dequeue_many(4)
@@ -865,7 +865,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeue_thread.join()
 
   def testBlockingDequeueUpToFromClosedEmptyQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(10, 5, dtypes_lib.float32, ((),))
       close_op = q.close()
       dequeued_t = q.dequeue_up_to(4)
@@ -885,7 +885,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeue_thread.join()
 
   def testEnqueueToClosedQueue(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 4, dtypes_lib.float32)
       enqueue_op = q.enqueue((10.0,))
       close_op = q.close()
@@ -898,7 +898,7 @@ class RandomShuffleQueueTest(test.TestCase):
         enqueue_op.run()
 
   def testEnqueueManyToClosedQueue(self):
-    with self.test_session():
+    with self.cached_session():
       q = data_flow_ops.RandomShuffleQueue(10, 5, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -912,7 +912,7 @@ class RandomShuffleQueueTest(test.TestCase):
         enqueue_op.run()
 
   def testBlockingEnqueueToFullQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(4, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -940,7 +940,7 @@ class RandomShuffleQueueTest(test.TestCase):
       thread.join()
 
   def testBlockingEnqueueManyToFullQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(4, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -974,7 +974,7 @@ class RandomShuffleQueueTest(test.TestCase):
       thread.join()
 
   def testBlockingEnqueueToClosedQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(4, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0, 40.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -1019,7 +1019,7 @@ class RandomShuffleQueueTest(test.TestCase):
       thread1.join()
 
   def testBlockingEnqueueManyToClosedQueue(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(4, 0, dtypes_lib.float32, ((),))
       elems = [10.0, 20.0, 30.0]
       enqueue_op = q.enqueue_many((elems,))
@@ -1067,7 +1067,7 @@ class RandomShuffleQueueTest(test.TestCase):
         sess.run(blocking_enqueue_op)
 
   def testSharedQueueSameSession(self):
-    with self.test_session():
+    with self.cached_session():
       q1 = data_flow_ops.RandomShuffleQueue(
           1, 0, dtypes_lib.float32, ((),), shared_name="shared_queue")
       q1.enqueue((10.0,)).run()
@@ -1104,7 +1104,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual(q2_size_t.eval(), 0)
 
   def testSharedQueueSameSessionGraphSeedNone(self):
-    with self.test_session():
+    with self.cached_session():
       q1 = data_flow_ops.RandomShuffleQueue(
           1,
           0,
@@ -1127,7 +1127,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual(q2_size_t.eval(), 1)
 
   def testIncompatibleSharedQueueErrors(self):
-    with self.test_session():
+    with self.cached_session():
       q_a_1 = data_flow_ops.RandomShuffleQueue(
           10, 5, dtypes_lib.float32, shared_name="q_a")
       q_a_2 = data_flow_ops.RandomShuffleQueue(
@@ -1193,7 +1193,7 @@ class RandomShuffleQueueTest(test.TestCase):
         q_h_2.queue_ref.op.run()
 
   def testSelectQueue(self):
-    with self.test_session():
+    with self.cached_session():
       num_queues = 10
       qlist = list()
       for _ in xrange(num_queues):
@@ -1207,7 +1207,7 @@ class RandomShuffleQueueTest(test.TestCase):
         self.assertEqual(q.dequeue().eval(), 10.0)
 
   def testSelectQueueOutOfRange(self):
-    with self.test_session():
+    with self.cached_session():
       q1 = data_flow_ops.RandomShuffleQueue(10, 0, dtypes_lib.float32)
       q2 = data_flow_ops.RandomShuffleQueue(15, 0, dtypes_lib.float32)
       enq_q = data_flow_ops.RandomShuffleQueue.from_list(3, [q1, q2])
@@ -1235,7 +1235,7 @@ class RandomShuffleQueueTest(test.TestCase):
       sess.run(enqueue_many_op)
 
   def testResetOfBlockingOperation(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q_empty = data_flow_ops.RandomShuffleQueue(5, 0, dtypes_lib.float32, (
           (),))
       dequeue_op = q_empty.dequeue()
@@ -1267,7 +1267,7 @@ class RandomShuffleQueueTest(test.TestCase):
         t.join()
 
   def testDequeueManyInDifferentOrders(self):
-    with self.test_session():
+    with self.cached_session():
       # Specify seeds to make the test deterministic
       # (https://en.wikipedia.org/wiki/Taxicab_number).
       q1 = data_flow_ops.RandomShuffleQueue(
@@ -1301,7 +1301,7 @@ class RandomShuffleQueueTest(test.TestCase):
           self.assertNotEqual(results[i], results[j])
 
   def testDequeueUpToInDifferentOrders(self):
-    with self.test_session():
+    with self.cached_session():
       # Specify seeds to make the test deterministic
       # (https://en.wikipedia.org/wiki/Taxicab_number).
       q1 = data_flow_ops.RandomShuffleQueue(
@@ -1335,7 +1335,7 @@ class RandomShuffleQueueTest(test.TestCase):
           self.assertNotEqual(results[i], results[j])
 
   def testDequeueInDifferentOrders(self):
-    with self.test_session():
+    with self.cached_session():
       # Specify seeds to make the test deterministic
       # (https://en.wikipedia.org/wiki/Taxicab_number).
       q1 = data_flow_ops.RandomShuffleQueue(
@@ -1371,7 +1371,7 @@ class RandomShuffleQueueTest(test.TestCase):
           self.assertNotEqual(results[i], results[j])
 
   def testBigEnqueueMany(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(5, 0, dtypes_lib.int32, ((),))
       elem = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
       enq = q.enqueue_many((elem,))
@@ -1416,7 +1416,7 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertItemsEqual(elem, results)
 
   def testBigDequeueMany(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(2, 0, dtypes_lib.int32, ((),))
       elem = np.arange(4, dtype=np.int32)
       enq_list = [q.enqueue((e,)) for e in elem]
diff --git a/tensorflow/python/kernel_tests/regex_replace_op_test.py b/tensorflow/python/kernel_tests/regex_replace_op_test.py
index 6739ac3224..f0e84b8fca 100644
--- a/tensorflow/python/kernel_tests/regex_replace_op_test.py
+++ b/tensorflow/python/kernel_tests/regex_replace_op_test.py
@@ -18,54 +18,104 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
+from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
-class RegexReplaceOpTest(test.TestCase):
+@parameterized.parameters(
+    (gen_string_ops.regex_replace),
+    (gen_string_ops.static_regex_replace))
+class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
+
+  def testForwarding(self, op):
+    with self.test_session():
+      # Generate an input that is uniquely consumed by the regex op.
+      # This exercises code paths which are optimized for this case
+      # (e.g., using forwarding).
+      inp = string_ops.substr(
+          constant_op.constant(["AbCdEfG",
+                                "HiJkLmN"], dtypes.string),
+          pos=0,
+          len=5)
+      stripped = op(inp, "\\p{Ll}", ".").eval()
+      self.assertAllEqual([b"A.C.E", b"H.J.L"], stripped)
 
-  def testRemovePrefix(self):
+  def testRemovePrefix(self, op):
     values = ["a:foo", "a:bar", "a:foo", "b:baz", "b:qux", "ca:b"]
     with self.test_session():
       input_vector = constant_op.constant(values, dtypes.string)
-      stripped = string_ops.regex_replace(
-          input_vector, "^(a:|b:)", "", replace_global=False).eval()
+      stripped = op(input_vector, "^(a:|b:)", "", replace_global=False).eval()
       self.assertAllEqual([b"foo", b"bar", b"foo", b"baz", b"qux", b"ca:b"],
                           stripped)
 
-  def testRegexReplace(self):
+  def testRegexReplace(self, op):
     values = ["aba\naba", "abcdabcde"]
     with self.test_session():
       input_vector = constant_op.constant(values, dtypes.string)
-      stripped = string_ops.regex_replace(input_vector, "a.*a", "(\\0)").eval()
+      stripped = op(input_vector, "a.*a", "(\\0)").eval()
       self.assertAllEqual([b"(aba)\n(aba)", b"(abcda)bcde"], stripped)
 
-  def testEmptyMatch(self):
+  def testEmptyMatch(self, op):
     values = ["abc", "1"]
     with self.test_session():
       input_vector = constant_op.constant(values, dtypes.string)
-      stripped = string_ops.regex_replace(input_vector, "", "x").eval()
+      stripped = op(input_vector, "", "x").eval()
       self.assertAllEqual([b"xaxbxcx", b"x1x"], stripped)
 
-  def testInvalidPattern(self):
+  def testInvalidPattern(self, op):
     values = ["abc", "1"]
     with self.test_session():
       input_vector = constant_op.constant(values, dtypes.string)
       invalid_pattern = "A["
-      replace = string_ops.regex_replace(input_vector, invalid_pattern, "x")
+      replace = op(input_vector, invalid_pattern, "x")
       with self.assertRaisesOpError("Invalid pattern"):
         replace.eval()
 
-  def testGlobal(self):
+  def testGlobal(self, op):
     values = ["ababababab", "abcabcabc", ""]
     with self.test_session():
       input_vector = constant_op.constant(values, dtypes.string)
-      stripped = string_ops.regex_replace(input_vector, "ab", "abc",
-                                          True).eval()
+      stripped = op(input_vector, "ab", "abc", True).eval()
       self.assertAllEqual([b"abcabcabcabcabc", b"abccabccabcc", b""], stripped)
 
 
+def as_string(s):
+  return s
+
+
+def as_tensor(s):
+  return constant_op.constant(s, dtypes.string)
+
+
+class RegexReplaceTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters(
+      (as_string, as_tensor),
+      (as_tensor, as_string),
+      (as_tensor, as_tensor))
+  def testRegexReplaceDelegation(self, pattern_fn, rewrite_fn):
+    with compat.forward_compatibility_horizon(2018, 10, 11):
+      with self.test_session():
+        input_vector = constant_op.constant("foo", dtypes.string)
+        pattern = pattern_fn("[a-z]")
+        replace = rewrite_fn(".")
+        op = string_ops.regex_replace(input_vector, pattern, replace)
+        self.assertTrue(op.name.startswith("RegexReplace"))
+
+  def testStaticRegexReplaceDelegation(self):
+    with compat.forward_compatibility_horizon(2018, 10, 11):
+      with self.test_session():
+        input_vector = constant_op.constant("foo", dtypes.string)
+        pattern = "[a-z]"
+        replace = "."
+        op = string_ops.regex_replace(input_vector, pattern, replace)
+        self.assertTrue(op.name.startswith("StaticRegexReplace"))
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index b1ef46f2a1..d0ed08933d 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -17,7 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
 import gc
+import os
+import pickle
 
 import numpy as np
 
@@ -106,6 +109,34 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v = resource_variable_ops.ResourceVariable(False, name="bool_test")
       self.assertAllEqual(bool(v), False)
 
+  def testEagerDeepCopy(self):
+    with context.eager_mode():
+      init_value = np.ones((4, 4, 4))
+      variable = resource_variable_ops.ResourceVariable(init_value,
+                                                        name="init")
+
+      copied_variable = copy.deepcopy(variable)
+      copied_variable.assign(4 * np.ones((4, 4, 4)))
+
+      # Copying the variable should create a new underlying tensor with distinct
+      # values.
+      self.assertFalse(np.allclose(variable.numpy(), copied_variable.numpy()))
+
+  def testGraphDeepCopy(self):
+    with self.test_session():
+      init_value = np.ones((4, 4, 4))
+      variable = resource_variable_ops.ResourceVariable(init_value,
+                                                        name="init")
+      with self.assertRaises(NotImplementedError):
+        copy.deepcopy(variable)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testStridedSliceAssign(self):
+    v = resource_variable_ops.ResourceVariable([1.0, 2.0])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(v[0].assign(2.0))
+    self.assertAllEqual(self.evaluate(v), [2.0, 2.0])
+
   def testDifferentAssignGraph(self):
     with ops.Graph().as_default():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -233,6 +264,18 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
     self.assertEqual(self.evaluate(read), [[5]])
 
+  def testEagerPickle(self):
+    with context.eager_mode():
+      tmp_dir = self.get_temp_dir()
+      fname = os.path.join(tmp_dir, "var.pickle")
+      with open(fname, "wb") as f:
+        v = resource_variable_ops.ResourceVariable(10.0)
+        pickle.dump(v, f)
+
+      with open(fname, "rb") as f:
+        v = pickle.load(f)
+        self.assertAllEqual(v.numpy(), 10.0)
+
   @test_util.run_in_graph_and_eager_modes
   def testScatterDiv(self):
     handle = resource_variable_ops.var_handle_op(
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index e32d7c4e67..78f2993d27 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -44,11 +45,13 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_lib
 import tensorflow.python.ops.data_flow_grad  # pylint: disable=unused-import
+from tensorflow.python.ops.losses import losses
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 import tensorflow.python.ops.sparse_grad  # pylint: disable=unused-import
 import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
+from tensorflow.python.training import training
 
 
 class Plus1RNNCell(rnn_cell_impl.RNNCell):
@@ -250,12 +253,44 @@ class RNNTest(test.TestCase):
     self.assertAllEqual(4, state[0])
     self.assertAllEqual([[[1]], [[2]], [[3]], [[4]]], state[1])
 
+  def testCellGetInitialState(self):
+    cell = rnn_cell_impl.BasicRNNCell(5)
+    with self.assertRaisesRegexp(
+        ValueError, "batch_size and dtype cannot be None"):
+      cell.get_initial_state(None, None, None)
+
+    inputs = array_ops.placeholder(dtypes.float32, shape=(None, 4, 1))
+    with self.assertRaisesRegexp(
+        ValueError, "batch size from input tensor is different from"):
+      cell.get_initial_state(inputs=inputs, batch_size=50, dtype=None)
+
+    with self.assertRaisesRegexp(
+        ValueError, "batch size from input tensor is different from"):
+      cell.get_initial_state(
+          inputs=inputs, batch_size=constant_op.constant(50), dtype=None)
+
+    with self.assertRaisesRegexp(
+        ValueError, "dtype from input tensor is different from"):
+      cell.get_initial_state(inputs=inputs, batch_size=None, dtype=dtypes.int16)
+
+    initial_state = cell.get_initial_state(
+        inputs=inputs, batch_size=None, dtype=None)
+    self.assertEqual(initial_state.shape.as_list(), [None, 5])
+    self.assertEqual(initial_state.dtype, inputs.dtype)
+
+    batch = array_ops.shape(inputs)[0]
+    dtype = inputs.dtype
+    initial_state = cell.get_initial_state(None, batch, dtype)
+    self.assertEqual(initial_state.shape.as_list(), [None, 5])
+    self.assertEqual(initial_state.dtype, inputs.dtype)
+
   def _assert_cell_builds(self, cell_class, dtype, batch_size, in_size,
                           out_size):
     cell = cell_class(out_size, dtype=dtype)
     in_shape = tensor_shape.TensorShape((batch_size, in_size))
     cell.build(in_shape)
-    state_output = cell.zero_state(batch_size, dtype)
+    state_output = cell.get_initial_state(
+        inputs=None, batch_size=batch_size, dtype=dtype)
     cell_output, _ = cell(array_ops.zeros(in_shape, dtype), state_output)
     self.assertAllEqual([batch_size, out_size], cell_output.shape.as_list())
 
@@ -278,12 +313,228 @@ class RNNTest(test.TestCase):
     self._assert_cell_builds(contrib_rnn.IndyLSTMCell, f32, 5, 7, 3)
     self._assert_cell_builds(contrib_rnn.IndyLSTMCell, f64, 5, 7, 3)
 
+  def testRNNWithKerasSimpleRNNCell(self):
+    with self.test_session() as sess:
+      input_shape = 10
+      output_shape = 5
+      timestep = 4
+      batch = 100
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train)
+      cell = keras.layers.SimpleRNNCell(output_shape)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape))
+
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(state.shape.as_list(), [None, output_shape])
+      loss = losses.softmax_cross_entropy(predict, state)
+      train_op = training.GradientDescentOptimizer(0.001).minimize(loss)
+
+      sess.run([variables_lib.global_variables_initializer()])
+      _, outputs, state = sess.run(
+          [train_op, outputs, state], {inputs: x_train, predict: y_train})
+
+      self.assertEqual(len(outputs), batch)
+      self.assertEqual(len(state), batch)
+
+  def testRNNWithKerasGRUCell(self):
+    with self.test_session() as sess:
+      input_shape = 10
+      output_shape = 5
+      timestep = 4
+      batch = 100
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train)
+      cell = keras.layers.GRUCell(output_shape)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape))
+
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(state.shape.as_list(), [None, output_shape])
+      loss = losses.softmax_cross_entropy(predict, state)
+      train_op = training.GradientDescentOptimizer(0.001).minimize(loss)
+
+      sess.run([variables_lib.global_variables_initializer()])
+      _, outputs, state = sess.run(
+          [train_op, outputs, state], {inputs: x_train, predict: y_train})
+
+      self.assertEqual(len(outputs), batch)
+      self.assertEqual(len(state), batch)
+
+  def testRNNWithKerasLSTMCell(self):
+    with self.test_session() as sess:
+      input_shape = 10
+      output_shape = 5
+      timestep = 4
+      batch = 100
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train)
+      cell = keras.layers.LSTMCell(output_shape)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape))
+
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(len(state), 2)
+      self.assertEqual(state[0].shape.as_list(), [None, output_shape])
+      self.assertEqual(state[1].shape.as_list(), [None, output_shape])
+      loss = losses.softmax_cross_entropy(predict, state[0])
+      train_op = training.GradientDescentOptimizer(0.001).minimize(loss)
+
+      sess.run([variables_lib.global_variables_initializer()])
+      _, outputs, state = sess.run(
+          [train_op, outputs, state], {inputs: x_train, predict: y_train})
+
+      self.assertEqual(len(outputs), batch)
+      self.assertEqual(len(state), 2)
+      self.assertEqual(len(state[0]), batch)
+      self.assertEqual(len(state[1]), batch)
+
+  def testRNNWithStackKerasCell(self):
+    with self.test_session() as sess:
+      input_shape = 10
+      output_shape = 5
+      timestep = 4
+      batch = 100
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train)
+      cell = keras.layers.StackedRNNCells(
+          [keras.layers.LSTMCell(2 * output_shape),
+           keras.layers.LSTMCell(output_shape)])
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape))
+
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(len(state), 4)
+      self.assertEqual(state[0].shape.as_list(), [None, 2 * output_shape])
+      self.assertEqual(state[1].shape.as_list(), [None, 2 * output_shape])
+      self.assertEqual(state[2].shape.as_list(), [None, output_shape])
+      self.assertEqual(state[3].shape.as_list(), [None, output_shape])
+      loss = losses.softmax_cross_entropy(predict, state[2])
+      train_op = training.GradientDescentOptimizer(0.001).minimize(loss)
+
+      sess.run([variables_lib.global_variables_initializer()])
+      _, outputs, state = sess.run(
+          [train_op, outputs, state], {inputs: x_train, predict: y_train})
+
+      self.assertEqual(len(outputs), batch)
+      self.assertEqual(len(state), 4)
+      for s in state:
+        self.assertEqual(len(s), batch)
+
+  def testStaticRNNWithKerasSimpleRNNCell(self):
+    with self.test_session() as sess:
+      input_shape = 10
+      output_shape = 5
+      timestep = 4
+      batch = 100
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      x_train = np.transpose(x_train, (1, 0, 2))
+      y_train = keras.utils.to_categorical(y_train)
+      cell = keras.layers.SimpleRNNCell(output_shape)
+
+      inputs = [array_ops.placeholder(
+          dtypes.float32, shape=(None, input_shape))] * timestep
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape))
+
+      outputs, state = rnn.static_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      self.assertEqual(len(outputs), timestep)
+      self.assertEqual(outputs[0].shape.as_list(), [None, output_shape])
+      self.assertEqual(state.shape.as_list(), [None, output_shape])
+      loss = losses.softmax_cross_entropy(predict, state)
+      train_op = training.GradientDescentOptimizer(0.001).minimize(loss)
+
+      sess.run([variables_lib.global_variables_initializer()])
+      feed_dict = {i: d for i, d in zip(inputs, x_train)}
+      feed_dict[predict] = y_train
+      _, outputs, state = sess.run(
+          [train_op, outputs, state], feed_dict)
+
+      self.assertEqual(len(outputs), timestep)
+      self.assertEqual(len(outputs[0]), batch)
+      self.assertEqual(len(state), batch)
+
+  def testKerasAndTFRNNLayerOutputComparison(self):
+    input_shape = 10
+    output_shape = 5
+    timestep = 4
+    batch = 20
+    (x_train, _), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(timestep, input_shape),
+        num_classes=output_shape)
+    fix_weights_generator = keras.layers.SimpleRNNCell(output_shape)
+    fix_weights_generator.build((None, input_shape))
+    weights = fix_weights_generator.get_weights()
+
+    with self.test_session(graph=ops_lib.Graph()) as sess:
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      cell = keras.layers.SimpleRNNCell(output_shape)
+      tf_out, tf_state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      cell.set_weights(weights)
+      [tf_out, tf_state] = sess.run([tf_out, tf_state], {inputs: x_train})
+    with self.test_session(graph=ops_lib.Graph()) as sess:
+      k_input = keras.Input(shape=(timestep, input_shape),
+                            dtype=dtypes.float32)
+      cell = keras.layers.SimpleRNNCell(output_shape)
+      layer = keras.layers.RNN(cell, return_sequences=True, return_state=True)
+      keras_out = layer(k_input)
+      cell.set_weights(weights)
+      k_out, k_state = sess.run(keras_out, {k_input: x_train})
+    self.assertAllClose(tf_out, k_out)
+    self.assertAllClose(tf_state, k_state)
+
   def testBasicLSTMCellInterchangeWithLSTMCell(self):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       basic_cell = rnn_cell_impl.BasicLSTMCell(1)
       basic_cell(array_ops.ones([1, 1]),
-                 state=basic_cell.zero_state(batch_size=1,
-                                             dtype=dtypes.float32))
+                 state=basic_cell.get_initial_state(inputs=None,
+                                                    batch_size=1,
+                                                    dtype=dtypes.float32))
       self.evaluate([v.initializer for v in basic_cell.variables])
       self.evaluate(basic_cell._bias.assign([10.] * 4))
       save = saver.Saver()
@@ -293,22 +544,21 @@ class RNNTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       lstm_cell = rnn_cell_impl.LSTMCell(1, name="basic_lstm_cell")
       lstm_cell(array_ops.ones([1, 1]),
-                state=lstm_cell.zero_state(batch_size=1,
-                                           dtype=dtypes.float32))
+                state=lstm_cell.get_initial_state(inputs=None,
+                                                  batch_size=1,
+                                                  dtype=dtypes.float32))
       self.evaluate([v.initializer for v in lstm_cell.variables])
       save = saver.Saver()
       save.restore(sess, save_path)
       self.assertAllEqual([10.] * 4, self.evaluate(lstm_cell._bias))
 
   def testRNNCellSerialization(self):
-    for cell in  [
+    for cell in [
         rnn_cell_impl.LSTMCell(32, use_peepholes=True, cell_clip=True),
         rnn_cell_impl.BasicLSTMCell(32, dtype=dtypes.float32),
-        # TODO(scottzhu): GRU and BasicRNN cell are not compatible with Keras.
-        # rnn_cell_impl.BasicRNNCell(
-        #     32, activation="relu", dtype=dtypes.float32),
-        # rnn_cell_impl.GRUCell(
-        #     32, kernel_initializer="ones", dtype=dtypes.float32)
+        rnn_cell_impl.BasicRNNCell(32, activation="relu", dtype=dtypes.float32),
+        rnn_cell_impl.GRUCell(
+            32, kernel_initializer="ones", dtype=dtypes.float32)
     ]:
       with self.test_session():
         x = keras.Input((None, 5))
@@ -326,11 +576,13 @@ class RNNTest(test.TestCase):
         # not visible as a Keras layer, and also has a name conflict with
         # keras.LSTMCell and GRUCell.
         layer = keras.layers.RNN.from_config(
-            config, custom_objects={
-                # "BasicRNNCell": rnn_cell_impl.BasicRNNCell,
-                # "GRUCell": rnn_cell_impl.GRUCell,
+            config,
+            custom_objects={
+                "BasicRNNCell": rnn_cell_impl.BasicRNNCell,
+                "GRUCell": rnn_cell_impl.GRUCell,
                 "LSTMCell": rnn_cell_impl.LSTMCell,
-                "BasicLSTMCell": rnn_cell_impl.BasicLSTMCell})
+                "BasicLSTMCell": rnn_cell_impl.BasicLSTMCell
+            })
         y = layer(x)
         model = keras.models.Model(x, y)
         model.set_weights(weights)
diff --git a/tensorflow/python/kernel_tests/softplus_op_test.py b/tensorflow/python/kernel_tests/softplus_op_test.py
index b8e7c50a37..c0269db9ae 100644
--- a/tensorflow/python/kernel_tests/softplus_op_test.py
+++ b/tensorflow/python/kernel_tests/softplus_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
@@ -121,9 +122,12 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) third-order gradient err = ", err)
     self.assertLess(err, 5e-5)
 
-  def testWarnInts(self):
-    # Running the op triggers address sanitizer errors, so we just make it
-    nn_ops.softplus(constant_op.constant(7))
+  def testNoInts(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "No OpKernel was registered to support Op 'Softplus'"):
+        nn_ops.softplus(constant_op.constant(7)).eval()
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/softsign_op_test.py b/tensorflow/python/kernel_tests/softsign_op_test.py
index 371f86ff15..a5247ce08d 100644
--- a/tensorflow/python/kernel_tests/softsign_op_test.py
+++ b/tensorflow/python/kernel_tests/softsign_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -65,11 +66,12 @@ class SoftsignTest(test.TestCase):
     print("softsign (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
-  def testWarnInts(self):
-    # NOTE(irving): Actually I don't know how to intercept the warning, but
-    # let's make sure it runs.  I promised I've looked, and there was a warning.
+  def testNoInts(self):
     with self.test_session():
-      nn_ops.softsign(constant_op.constant(7)).eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "No OpKernel was registered to support Op 'Softsign'"):
+        nn_ops.softsign(constant_op.constant(7)).eval()
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py b/tensorflow/python/kernel_tests/string_length_op_test.py
index 2c1f099360..075a3204ad 100644
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
+++ b/tensorflow/python/kernel_tests/string_length_op_test.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,28 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-r"""Train a ConvNet on MNIST using K-FAC.
-
-Train on single machine. See `convnet.train_mnist_single_machine` for details.
-"""
+"""Tests for string_length_op."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
 
-from absl import flags
-import tensorflow as tf
-
-from tensorflow.contrib.kfac.examples import convnet
 
-FLAGS = flags.FLAGS
-flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
+class StringLengthOpTest(test.TestCase):
 
+  def testStringLength(self):
+    strings = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]
 
-def main(unused_argv):
-  convnet.train_mnist_single_machine(FLAGS.data_dir, num_epochs=200)
+    with self.test_session() as sess:
+      lengths = string_ops.string_length(strings)
+      values = sess.run(lengths)
+      self.assertAllEqual(values, [[[1, 2], [3, 4], [5, 6]]])
 
 
 if __name__ == "__main__":
-  tf.app.run(main=main)
+  test.main()
diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index e20daccb28..b6a0f45adc 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -58,14 +58,28 @@ class StringSplitOpTest(test.TestCase):
       self.assertAllEqual(shape, [3, 5])
 
   def testStringSplitEmptyToken(self):
-    strings = [" hello ", "", "world "]
+    strings = ["", " a", "b ", " c", " ", " d ", "  e", "f  ", "  g  ", "  "]
 
     with self.test_session() as sess:
       tokens = string_ops.string_split(strings)
       indices, values, shape = sess.run(tokens)
-      self.assertAllEqual(indices, [[0, 0], [2, 0]])
-      self.assertAllEqual(values, [b"hello", b"world"])
-      self.assertAllEqual(shape, [3, 1])
+      self.assertAllEqual(
+          indices,
+          [[1, 0], [2, 0], [3, 0], [5, 0], [6, 0], [7, 0], [8, 0]])
+      self.assertAllEqual(values, [b"a", b"b", b"c", b"d", b"e", b"f", b"g"])
+      self.assertAllEqual(shape, [10, 1])
+
+  def testStringSplitOnSetEmptyToken(self):
+    strings = ["", " a", "b ", " c", " ", " d ", ". e", "f .", " .g. ", " ."]
+
+    with self.test_session() as sess:
+      tokens = string_ops.string_split(strings, delimiter=" .")
+      indices, values, shape = sess.run(tokens)
+      self.assertAllEqual(
+          indices,
+          [[1, 0], [2, 0], [3, 0], [5, 0], [6, 0], [7, 0], [8, 0]])
+      self.assertAllEqual(values, [b"a", b"b", b"c", b"d", b"e", b"f", b"g"])
+      self.assertAllEqual(shape, [10, 1])
 
   def testStringSplitWithDelimiter(self):
     strings = ["hello|world", "hello world"]
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 0b3a396d6b..9dcdaa61ed 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -359,6 +360,23 @@ class TemplateTest(test.TestCase):
     self.assertEqual(2, len(tmpl1._checkpoint_dependencies))
     self.assertEqual("nested", tmpl1._checkpoint_dependencies[0].name)
     self.assertEqual("nested_1", tmpl1._checkpoint_dependencies[1].name)
+    model = training.Model()
+    model.template = tmpl1
+    self.assertEqual(model.variables, [v1, v2])
+    self.assertEqual(model.trainable_variables, [v1, v2])
+    self.assertEqual(len(model.non_trainable_variables), 0)
+    model.templates = [tmpl2]
+    self.assertEqual(model.variables, [v1, v2, v5, v6])
+    self.assertEqual(model.trainable_variables, [v1, v2, v5, v6])
+    self.assertEqual(len(model.non_trainable_variables), 0)
+    # Make sure losses, layers, and updates aren't broken by having a Template
+    # in the mix, which does not expose any updates or losses.
+    self.assertEqual([], model.layers)
+    self.assertEqual([], model.updates)
+    self.assertEqual([], model.losses)
+    self.assertEqual([], model.templates.layers)
+    self.assertEqual([], model.templates.updates)
+    self.assertEqual([], model.templates.losses)
 
   @test_util.run_in_graph_and_eager_modes
   def test_nested_templates_with_defun(self):
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index fa7c6a0f8a..d5f0726106 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -76,7 +76,7 @@ class TopKTest(test.TestCase):
         for result_index, src_index in np.ndenumerate(indices):
           value = values[result_index]
           expected_value = np_inputs[result_index[0], src_index]
-          np.testing.utils.assert_almost_equal(value, expected_value)
+          np.testing.assert_almost_equal(value, expected_value)
 
         # Check that if two elements are equal, the lower-index element appears
         # first.
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index ae2a0ab29a..b736b12416 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -335,7 +335,7 @@ class VariableScopeTest(test.TestCase):
         # reuse=True is for now only supported when eager execution is disabled.
         if not context.executing_eagerly():
           v = variable_scope.get_variable("v",
-                                          [])  # "v" is alredy there, reused
+                                          [])  # "v" is already there, reused
           losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
           self.assertEqual(3, len(losses))  # No new loss added.
 
@@ -389,6 +389,18 @@ class VariableScopeTest(test.TestCase):
       sess.run(v0.initializer)
       sess.run(add)
 
+  def testEnableResourceVariables(self):
+    old = variable_scope._DEFAULT_USE_RESOURCE
+    try:
+      variable_scope.enable_resource_variables()
+      self.assertTrue(isinstance(variables_lib.Variable(1.0),
+                                 resource_variable_ops.ResourceVariable))
+      variable_scope.disable_resource_variables()
+      self.assertFalse(isinstance(variables_lib.Variable(1.0),
+                                  resource_variable_ops.ResourceVariable))
+    finally:
+      variable_scope._DEFAULT_USE_RESOURCE = old
+
   def testControlFlow(self):
     with self.test_session() as sess:
       v0 = variable_scope.get_variable(
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index ab08865532..3ba880d7a1 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -262,11 +262,13 @@ class Layer(base_layer.Layer):
         use_resource = (use_resource or
                         self._use_resource_variables or
                         scope.use_resource)
+        if initializer is None:
+          initializer = scope.initializer
         variable = super(Layer, self).add_weight(
             name,
             shape,
             dtype=dtypes.as_dtype(dtype),
-            initializer=initializer or scope.initializer,
+            initializer=initializer,
             trainable=trainable,
             constraint=constraint,
             partitioner=partitioner,
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index 625320b48b..d61d3b6dba 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -264,7 +264,7 @@ class ConvTest(test.TestCase):
       self.assertEqual(len(variables.trainable_variables()), 2)
 
   def testFunctionalConv2DInitializerFromScope(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with variable_scope.variable_scope(
           'scope', initializer=init_ops.ones_initializer()):
         height, width = 7, 9
@@ -647,7 +647,7 @@ class SeparableConv2DTest(test.TestCase):
       self.assertEqual(len(variables.trainable_variables()), 3)
 
   def testFunctionalConv2DInitializerFromScope(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with variable_scope.variable_scope(
           'scope', initializer=init_ops.ones_initializer()):
         height, width = 7, 9
@@ -882,7 +882,7 @@ class Conv2DTransposeTest(test.TestCase):
       self.assertEqual(len(variables.trainable_variables()), 2)
 
   def testFunctionalConv2DTransposeInitializerFromScope(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with variable_scope.variable_scope(
           'scope', initializer=init_ops.ones_initializer()):
         height, width = 7, 9
@@ -1061,7 +1061,7 @@ class Conv3DTransposeTest(test.TestCase):
       self.assertEqual(len(variables.trainable_variables()), 2)
 
   def testFunctionalConv3DTransposeInitializerFromScope(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       with variable_scope.variable_scope(
           'scope', initializer=init_ops.ones_initializer()):
         depth, height, width = 5, 7, 9
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 040c1cddc0..46009a30ac 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -60,7 +60,7 @@ class DenseTest(test.TestCase):
     self.assertEqual(dense.name, 'dense_2')
 
   def testVariableInput(self):
-    with self.test_session():
+    with self.cached_session():
       v = variable_scope.get_variable(
           'X', initializer=init_ops.zeros_initializer(), shape=(1, 1))
       x = core_layers.Dense(1)(v)
@@ -221,7 +221,7 @@ class DenseTest(test.TestCase):
     self.assertListEqual(dense.losses, loss_keys)
 
   def testFunctionalDense(self):
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((5, 3), seed=1)
       outputs = core_layers.dense(
           inputs, 2, activation=nn_ops.relu, name='my_dense')
@@ -240,7 +240,7 @@ class DenseTest(test.TestCase):
 
   # TODO(alive): get this to  work in eager mode.
   def testFunctionalDenseTwiceReuse(self):
-    with self.test_session():
+    with self.cached_session():
       inputs = random_ops.random_uniform((5, 3), seed=1)
       core_layers.dense(inputs, 2, name='my_dense')
       vars1 = variables.trainable_variables()
@@ -250,7 +250,7 @@ class DenseTest(test.TestCase):
 
   # TODO(alive): get this to  work in eager mode.
   def testFunctionalDenseTwiceReuseFromScope(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('scope'):
         inputs = random_ops.random_uniform((5, 3), seed=1)
         core_layers.dense(inputs, 2, name='my_dense')
@@ -262,7 +262,8 @@ class DenseTest(test.TestCase):
 
   def testFunctionalDenseInitializerFromScope(self):
     with variable_scope.variable_scope(
-        'scope', initializer=init_ops.ones_initializer()), self.test_session():
+        'scope',
+        initializer=init_ops.ones_initializer()), self.cached_session():
       inputs = random_ops.random_uniform((5, 3), seed=1)
       core_layers.dense(inputs, 2)
       variables.global_variables_initializer().run()
@@ -305,7 +306,7 @@ class DenseTest(test.TestCase):
     self.assertEqual(called[0], 2)
 
   def testFunctionalDenseInScope(self):
-    with self.test_session():
+    with self.cached_session():
       with variable_scope.variable_scope('test'):
         inputs = random_ops.random_uniform((5, 3), seed=1)
         core_layers.dense(inputs, 2, name='my_dense')
@@ -391,7 +392,7 @@ class DropoutTest(test.TestCase):
     self.assertAllClose(np.ones((5, 3)), np_output)
 
   def testDynamicLearningPhase(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       dp = core_layers.Dropout(0.5, seed=1)
       inputs = array_ops.ones((5, 5))
       training = array_ops.placeholder(dtype='bool')
@@ -424,7 +425,7 @@ class DropoutTest(test.TestCase):
     self.assertAllClose(np_output[:, 0, :], np_output[:, 1, :])
 
   def testFunctionalDropout(self):
-    with self.test_session():
+    with self.cached_session():
       inputs = array_ops.ones((5, 5))
       dropped = core_layers.dropout(inputs, 0.5, training=True, seed=1)
       variables.global_variables_initializer().run()
@@ -435,7 +436,7 @@ class DropoutTest(test.TestCase):
       self.assertAllClose(np.ones((5, 5)), np_output)
 
   def testDynamicRate(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       rate = array_ops.placeholder(dtype='float32', name='rate')
       dp = core_layers.Dropout(rate, name='dropout')
       inputs = array_ops.ones((5, 5))
@@ -450,7 +451,7 @@ class DropoutTest(test.TestCase):
 class FlattenTest(test.TestCase):
 
   def testCreateFlatten(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder(shape=(None, 2, 3), dtype='float32')
       y = core_layers.Flatten()(x)
       np_output = sess.run(y, feed_dict={x: np.zeros((3, 2, 3))})
@@ -484,7 +485,7 @@ class FlattenTest(test.TestCase):
       core_layers.Flatten()(x)
 
   def testFlattenUnknownAxes(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = array_ops.placeholder(shape=(5, None, None), dtype='float32')
       y = core_layers.Flatten()(x)
       np_output = sess.run(y, feed_dict={x: np.zeros((5, 2, 3))})
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index e147f348b0..a72d147a0b 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -72,7 +72,7 @@ class BNTest(test.TestCase):
              dtype=dtypes.float32):
     ops.reset_default_graph()
     graph = ops.get_default_graph()
-    with self.test_session(graph=graph, use_gpu=use_gpu) as sess:
+    with self.session(graph=graph, use_gpu=use_gpu) as sess:
       image = array_ops.placeholder(dtype=dtype, shape=shape)
       loss, train_op, saver = self._simple_model(image, is_fused, freeze_mode)
       if restore:
@@ -94,7 +94,7 @@ class BNTest(test.TestCase):
     dtype = image_val.dtype
     ops.reset_default_graph()
     graph = ops.get_default_graph()
-    with self.test_session(graph=graph, use_gpu=use_gpu) as sess:
+    with self.session(graph=graph, use_gpu=use_gpu) as sess:
       image = array_ops.placeholder(dtype=dtype, shape=shape)
       loss, _, saver = self._simple_model(image, is_fused, True)
       saver.restore(sess, checkpoint_path)
@@ -319,7 +319,7 @@ class BNTest(test.TestCase):
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
 
@@ -361,7 +361,7 @@ class BNTest(test.TestCase):
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
@@ -442,7 +442,7 @@ class BNTest(test.TestCase):
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
@@ -482,7 +482,7 @@ class BNTest(test.TestCase):
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
@@ -522,7 +522,7 @@ class BNTest(test.TestCase):
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
@@ -563,7 +563,7 @@ class BNTest(test.TestCase):
       training = array_ops.placeholder(dtype='bool')
       outputs = bn.apply(inputs, training=training)
 
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         # Test training with placeholder learning phase.
         sess.run(variables.global_variables_initializer())
         np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
@@ -603,7 +603,7 @@ class BNTest(test.TestCase):
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
@@ -644,7 +644,7 @@ class BNTest(test.TestCase):
     outputs_training = bn.apply(inputs, training=True)
     outputs_infer = bn.apply(inputs, training=False)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
@@ -694,7 +694,7 @@ class BNTest(test.TestCase):
     beta = all_vars['bn/beta:0']
     gamma = all_vars['bn/gamma:0']
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
       np_gamma, np_beta = sess.run([gamma, beta])
@@ -756,7 +756,7 @@ class BNTest(test.TestCase):
     beta = all_vars['bn/beta:0']
     gamma = all_vars['bn/gamma:0']
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
       for _ in range(100):
@@ -1254,7 +1254,7 @@ class BNTest(test.TestCase):
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
 
@@ -1294,7 +1294,7 @@ class BNTest(test.TestCase):
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Test training with placeholder learning phase.
       sess.run(variables.global_variables_initializer())
 
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index fc02d6de0e..6189503d8f 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -398,7 +398,7 @@ Status ConvertNdarrayToTensor(PyObject* obj, Tensor* ret) {
       TF_RETURN_IF_ERROR(NumericNpDTypeToTfDType(PyArray_TYPE(input), &dtype));
       CHECK(DataTypeCanUseMemcpy(dtype));
       if (reinterpret_cast<intptr_t>(PyArray_DATA(input)) %
-              EIGEN_MAX_ALIGN_BYTES !=
+              std::max(1, EIGEN_MAX_ALIGN_BYTES) !=
           0) {
         Tensor t(dtype, shape);
         StringPiece p = t.tensor_data();
diff --git a/tensorflow/python/lib/core/py_util.h b/tensorflow/python/lib/core/py_util.h
index 44dfe7ba21..a9f39d3946 100644
--- a/tensorflow/python/lib/core/py_util.h
+++ b/tensorflow/python/lib/core/py_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
-#define TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -24,4 +24,4 @@ namespace tensorflow {
 string PyExceptionFetch();
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_
diff --git a/tensorflow/python/lib/io/file_io.i b/tensorflow/python/lib/io/file_io.i
index 891a7b0fd0..0aa08ea3d1 100644
--- a/tensorflow/python/lib/io/file_io.i
+++ b/tensorflow/python/lib/io/file_io.i
@@ -42,7 +42,7 @@ inline void FileExists(const string& filename, TF_Status* out_status) {
 inline void FileExists(const tensorflow::StringPiece& filename,
     TF_Status* out_status) {
   tensorflow::Status status =
-      tensorflow::Env::Default()->FileExists(filename.ToString());
+      tensorflow::Env::Default()->FileExists(string(filename));
   if (!status.ok()) {
     Set_TF_Status_from_Status(out_status, status);
   }
diff --git a/tensorflow/python/lib/io/py_record_writer.cc b/tensorflow/python/lib/io/py_record_writer.cc
index 3c64813735..e4e5268b0f 100644
--- a/tensorflow/python/lib/io/py_record_writer.cc
+++ b/tensorflow/python/lib/io/py_record_writer.cc
@@ -52,10 +52,17 @@ PyRecordWriter::~PyRecordWriter() {
   file_.reset();
 }
 
-bool PyRecordWriter::WriteRecord(tensorflow::StringPiece record) {
-  if (writer_ == nullptr) return false;
+void PyRecordWriter::WriteRecord(tensorflow::StringPiece record,
+                                 TF_Status* out_status) {
+  if (writer_ == nullptr) {
+    TF_SetStatus(out_status, TF_FAILED_PRECONDITION,
+                 "Writer not initialized or previously closed");
+    return;
+  }
   Status s = writer_->WriteRecord(record);
-  return s.ok();
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+  }
 }
 
 void PyRecordWriter::Flush(TF_Status* out_status) {
diff --git a/tensorflow/python/lib/io/py_record_writer.h b/tensorflow/python/lib/io/py_record_writer.h
index 9d66c031d4..61a4960ee6 100644
--- a/tensorflow/python/lib/io/py_record_writer.h
+++ b/tensorflow/python/lib/io/py_record_writer.h
@@ -43,7 +43,7 @@ class PyRecordWriter {
                              TF_Status* out_status);
   ~PyRecordWriter();
 
-  bool WriteRecord(tensorflow::StringPiece record);
+  void WriteRecord(tensorflow::StringPiece record, TF_Status* out_status);
   void Flush(TF_Status* out_status);
   void Close(TF_Status* out_status);
 
diff --git a/tensorflow/python/lib/io/python_io.py b/tensorflow/python/lib/io/python_io.py
index aec12ab3ea..404423ce07 100644
--- a/tensorflow/python/lib/io/python_io.py
+++ b/tensorflow/python/lib/io/python_io.py
@@ -15,7 +15,7 @@
 
 """Python functions for directly manipulating TFRecord-formatted files.
 
-See the @{$python/python_io} guide.
+See the [Python IO](https://tensorflow.org/api_guides/python/python_io) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 941d6cd67c..2b3e986f6b 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -125,8 +125,8 @@ class TFRecordWriter(object):
     Args:
       record: str
     """
-    # TODO(sethtroisi): Failures are currently swallowed, change that.
-    self._writer.WriteRecord(record)
+    with errors.raise_exception_on_not_ok_status() as status:
+      self._writer.WriteRecord(record, status)
 
   def flush(self):
     """Flush the file."""
diff --git a/tensorflow/python/lib/io/tf_record_test.py b/tensorflow/python/lib/io/tf_record_test.py
index 4743c037ec..b853b64ae4 100644
--- a/tensorflow/python/lib/io/tf_record_test.py
+++ b/tensorflow/python/lib/io/tf_record_test.py
@@ -358,12 +358,12 @@ class TFRecordWriterCloseAndFlushTests(test.TestCase):
     with self.assertRaises(errors_impl.FailedPreconditionError):
       self._writer.flush()
 
-  def testWriteAfterClose(self):
+  def testWriteAfterCloseIsError(self):
     self._writer.write(self._Record(0))
     self._writer.close()
 
-    # TODO(sethtroisi): No way to know this failed, changed that.
-    self._writer.write(self._Record(1))
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      self._writer.write(self._Record(1))
 
 
 class TFRecordWriterCloseAndFlushGzipTests(TFRecordWriterCloseAndFlushTests):
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index a2b5f77f91..6ae869b89e 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from math import ceil
-
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -734,7 +732,6 @@ def _QuantizeAndDequantizeV3Grad(_, grad):
 
 @ops.RegisterGradient("ExtractImagePatches")
 def _ExtractImagePatchesGrad(op, grad):
-
   batch_size, rows_in, cols_in, channels = [
       dim.value for dim in op.inputs[0].get_shape()
   ]
@@ -742,28 +739,45 @@ def _ExtractImagePatchesGrad(op, grad):
   batch_size = input_bhwc[0]
   channels = input_bhwc[3]
 
+  # Create indices matrix for input tensor.
+  # Note that 0 is preserved for padding location,
+  # so indices for input start from 1 to 1 + rows_in * cols_in.
+  input_indices_num = 1 + rows_in * cols_in
+  input_idx = array_ops.reshape(math_ops.range(1, input_indices_num,
+                                               dtype=ops.dtypes.int64),
+                                (1, rows_in, cols_in, 1))
+  input_idx_patched = gen_array_ops.extract_image_patches(
+      input_idx,
+      op.get_attr("ksizes"),
+      op.get_attr("strides"),
+      op.get_attr("rates"),
+      op.get_attr("padding"))
+
+  # Create indices matrix for output tensor.
   _, rows_out, cols_out, _ = [dim.value for dim in op.outputs[0].get_shape()]
   _, ksize_r, ksize_c, _ = op.get_attr("ksizes")
-  _, stride_r, stride_h, _ = op.get_attr("strides")
-  _, rate_r, rate_c, _ = op.get_attr("rates")
-  padding = op.get_attr("padding")
-
-  ksize_r_eff = ksize_r + (ksize_r - 1) * (rate_r - 1)
-  ksize_c_eff = ksize_c + (ksize_c - 1) * (rate_c - 1)
-
-  if padding == b"SAME":
-    rows_out = int(ceil(rows_in / stride_r))
-    cols_out = int(ceil(cols_in / stride_h))
-    pad_rows = ((rows_out - 1) * stride_r + ksize_r_eff - rows_in) // 2
-    pad_cols = ((cols_out - 1) * stride_h + ksize_c_eff - cols_in) // 2
-
-  elif padding == b"VALID":
-    rows_out = int(ceil((rows_in - ksize_r_eff + 1) / stride_r))
-    cols_out = int(ceil((cols_in - ksize_c_eff + 1) / stride_h))
-    pad_rows = (rows_out - 1) * stride_r + ksize_r_eff - rows_in
-    pad_cols = (cols_out - 1) * stride_h + ksize_c_eff - cols_in
-
-  pad_rows, pad_cols = max(0, pad_rows), max(0, pad_cols)
+  # Indices for output start from 0.
+  output_indices_num = rows_out * cols_out * ksize_r * ksize_c
+  output_idx = array_ops.reshape(math_ops.range(output_indices_num,
+                                                dtype=ops.dtypes.int64),
+                                 (1, rows_out, cols_out, ksize_r * ksize_c))
+
+  # Construct mapping table for indices: (input -> output).
+  idx_matrix = array_ops.concat(
+      [array_ops.expand_dims(input_idx_patched, axis=-1),
+       array_ops.expand_dims(output_idx, axis=-1)],
+      axis=-1)
+  idx_map = array_ops.reshape(idx_matrix, (-1, 2))
+
+  sp_shape = (input_indices_num, output_indices_num)
+  sp_mat_full = sparse_tensor.SparseTensor(
+      idx_map,
+      array_ops.ones([output_indices_num], dtype=grad.dtype),
+      sp_shape)
+  # Remove all padding locations [0, :].
+  sp_mat = sparse_ops.sparse_slice(sp_mat_full,
+                                   (1, 0),
+                                   (input_indices_num - 1, output_indices_num))
 
   grad_expanded = array_ops.transpose(
       array_ops.reshape(
@@ -771,27 +785,6 @@ def _ExtractImagePatchesGrad(op, grad):
       (1, 2, 3, 4, 0, 5))
   grad_flat = array_ops.reshape(grad_expanded, (-1, batch_size * channels))
 
-  row_steps = range(0, rows_out * stride_r, stride_r)
-  col_steps = range(0, cols_out * stride_h, stride_h)
-
-  idx = []
-  for i in range(rows_out):
-    for j in range(cols_out):
-      r_low, c_low = row_steps[i] - pad_rows, col_steps[j] - pad_cols
-      r_high, c_high = r_low + ksize_r_eff, c_low + ksize_c_eff
-
-      idx.extend([(r * (cols_in) + c, i * (cols_out * ksize_r * ksize_c) + j *
-                   (ksize_r * ksize_c) + ri * (ksize_c) + ci)
-                  for (ri, r) in enumerate(range(r_low, r_high, rate_r))
-                  for (ci, c) in enumerate(range(c_low, c_high, rate_c))
-                  if 0 <= r and r < rows_in and 0 <= c and c < cols_in])
-
-  sp_shape = (rows_in * cols_in, rows_out * cols_out * ksize_r * ksize_c)
-
-  sp_mat = sparse_tensor.SparseTensor(
-      array_ops.constant(idx, dtype=ops.dtypes.int64),
-      array_ops.ones((len(idx),), dtype=grad.dtype), sp_shape)
-
   jac = sparse_ops.sparse_tensor_dense_matmul(sp_mat, grad_flat)
 
   grad_out = array_ops.reshape(jac, (rows_in, cols_in, batch_size, channels))
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index a917f51087..66bc4df18c 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -15,7 +15,7 @@
 # Tests for this file live in python/kernel_tests/array_ops_test.py
 """Support for manipulating tensors.
 
-See the @{$python/array_ops} guide.
+See the [Array Ops](https://tensorflow.org/api_guides/python/array_ops) guide.
 """
 
 from __future__ import absolute_import
@@ -712,10 +712,7 @@ def strided_slice(input_,
         new_axis_mask=new_axis_mask,
         shrink_axis_mask=shrink_axis_mask)
 
-  if not context.executing_eagerly():
-    # TODO(apassos) In eager mode assignment will be done by overriding
-    # __setitem__ instead.
-    op.assign = assign
+  op.assign = assign
   return op
 
 
@@ -2662,6 +2659,76 @@ def gather(params, indices, validate_indices=None, name=None, axis=0):
 gather.__doc__ = gen_array_ops.gather_v2.__doc__
 
 
+@tf_export("batch_gather")
+def batch_gather(params, indices, name=None):
+  """Gather slices from `params` according to `indices` with leading batch dims.
+
+  This operation assumes that the leading dimensions of `indices` are dense,
+  and the gathers on the axis corresponding to the last dimension of `indices`.
+  More concretely it computes:
+
+  result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]]
+
+  Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM],
+  `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be
+  a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`.
+
+  In the case in which indices is a 1D tensor, this operation is equivalent to
+  `tf.gather`.
+
+  See also `tf.gather` and `tf.gather_nd`.
+
+  Args:
+    params: A Tensor. The tensor from which to gather values.
+    indices: A Tensor. Must be one of the following types: int32, int64. Index
+        tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the
+        last dimension of `indices` itself.
+    name: A name for the operation (optional).
+
+  Returns:
+    A Tensor. Has the same type as `params`.
+
+  Raises:
+    ValueError: if `indices` has an unknown shape.
+  """
+
+  with ops.name_scope(name):
+    indices = ops.convert_to_tensor(indices, name="indices")
+    params = ops.convert_to_tensor(params, name="params")
+    indices_shape = shape(indices)
+    params_shape = shape(params)
+    ndims = indices.shape.ndims
+    if ndims is None:
+      raise ValueError("batch_gather does not allow indices with unknown "
+                       "shape.")
+    batch_indices = indices
+    accum_dim_value = 1
+    for dim in range(ndims-1, 0, -1):
+      dim_value = params_shape[dim-1]
+      accum_dim_value *= params_shape[dim]
+      dim_indices = gen_math_ops._range(0, dim_value, 1)
+      dim_indices *= accum_dim_value
+      dim_shape = stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim),
+                        axis=0)
+      batch_indices += reshape(dim_indices, dim_shape)
+
+    flat_indices = reshape(batch_indices, [-1])
+    outer_shape = params_shape[ndims:]
+    flat_inner_shape = gen_math_ops.prod(
+        params_shape[:ndims], [0], False)
+
+    flat_params = reshape(
+        params, concat([[flat_inner_shape], outer_shape], axis=0))
+    flat_result = gather(flat_params, flat_indices)
+    result = reshape(flat_result, concat([indices_shape, outer_shape], axis=0))
+    final_shape = indices.get_shape()[:ndims-1].merge_with(
+        params.get_shape()[:ndims -1])
+    final_shape = final_shape.concatenate(indices.get_shape()[ndims-1])
+    final_shape = final_shape.concatenate(params.get_shape()[ndims:])
+    result.set_shape(final_shape)
+    return result
+
+
 # Define quantize_v2 here in order to make name the second-to-last attribute,
 # because round_mode was added later.
 @tf_export("quantize_v2")
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 375a5ec2c3..c5a0f2949e 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -15,7 +15,8 @@
 # pylint: disable=g-short-docstring-punctuation
 """Asserts and Boolean Checks.
 
-See the @{$python/check_ops} guide.
+See the [Asserts and
+checks](https://tensorflow.org/api_guides/python/check_ops) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/clip_ops_test.py b/tensorflow/python/ops/clip_ops_test.py
index 7d8dc90491..444cd0f62c 100644
--- a/tensorflow/python/ops/clip_ops_test.py
+++ b/tensorflow/python/ops/clip_ops_test.py
@@ -30,7 +30,7 @@ class ClipOpsTest(test.TestCase):
     super(ClipOpsTest, self).__init__(method_name)
 
   def _testClipByNorm(self, inputs, max_norm, expected):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       input_op = constant_op.constant(inputs)
       clipped = clip_ops.clip_by_norm(input_op, max_norm)
       check_op = numerics.add_check_numerics_ops()
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index 76173e0f30..75a1a53eb7 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -24,7 +24,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
-from tensorflow.python.framework import function
+from tensorflow.python.eager import function
 from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.ops import gradients_impl
 
diff --git a/tensorflow/python/ops/cond_v2_impl.py b/tensorflow/python/ops/cond_v2_impl.py
index b3dacff6d6..c4e9c982b5 100644
--- a/tensorflow/python/ops/cond_v2_impl.py
+++ b/tensorflow/python/ops/cond_v2_impl.py
@@ -27,14 +27,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python import pywrap_tensorflow as c_api
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_functional_ops
-from tensorflow.python.util import compat
 
 
 # The following modules cannot be imported directly because they cause circular
@@ -57,46 +56,27 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
     name = "cond"
 
   with ops.name_scope(name) as scope:
-    # Identify if there is a caller device, & get the innermost if possible.
-    # pylint: disable=protected-access
-    device_funcs = ops.get_default_graph()._device_functions_outer_to_inner
-    caller_device = device_funcs[-1] if device_funcs else None
-
-    caller_colocation_stack = ops.get_default_graph()._colocation_stack
-    caller_container = ops.get_default_graph()._container
-    caller_collection_ref = ops.get_default_graph()._collections
-
     with ops.name_scope(None):
       # Find the outer most graph for uniquing function names.
       # TODO(jpienaar): Make this work in eager mode.
       graph = ops.get_default_graph()
-      while isinstance(graph, _function._FuncGraph):
-        graph = graph._outer_graph
+      while isinstance(graph, _function.FuncGraph):
+        graph = graph.outer_graph
 
       true_name = graph.unique_name(("%strue" % scope).replace("/", "_"))
       false_name = graph.unique_name(("%sfalse" % scope).replace("/", "_"))
-    # pylint: enable=protected-access
+
     true_graph = _function.func_graph_from_py_func(
-        true_fn, [], [],
-        name=true_name,
-        device=caller_device,
-        colocation_stack=caller_colocation_stack,
-        collections_ref=caller_collection_ref,
-        container=caller_container)
+        true_name, true_fn, [], {})
     false_graph = _function.func_graph_from_py_func(
-        false_fn, [], [],
-        name=false_name,
-        device=caller_device,
-        colocation_stack=caller_colocation_stack,
-        collections_ref=caller_collection_ref,
-        container=caller_container)
+        false_name, false_fn, [], {})
     _check_same_outputs(true_graph, false_graph)
 
     # Add inputs to true_graph and false_graph to make them match. Note that
     # this modifies true_graph and false_graph.
     cond_inputs = _make_inputs_match(true_graph, false_graph,
-                                     true_graph.extra_inputs,
-                                     false_graph.extra_inputs)
+                                     true_graph.external_captures,
+                                     false_graph.external_captures)
 
     # Add all intermediate tensors as function outputs so they're available for
     # the gradient computation.
@@ -148,8 +128,8 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   true_graph, false_graph = _get_func_graphs(op)
   # Note: op.graph != ops.get_default_graph() when we are computing the gradient
   # of a nested cond.
-  assert true_graph._outer_graph == op.graph
-  assert false_graph._outer_graph == op.graph
+  assert true_graph.outer_graph == op.graph
+  assert false_graph.outer_graph == op.graph
 
   # Create grad functions that compute the gradient of the true/false forward
   # graphs. These functions will capture tensors from the forward pass
@@ -164,14 +144,13 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
 
   # Resolve references to forward graph tensors in grad graphs and ensure
   # they are in-scope, i.e., belong to one of outer graphs of the grad graph.
-  true_grad_extra_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
-  false_grad_extra_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
+  true_grad_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
+  false_grad_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
 
   # Make the inputs to true_grad_graph and false_grad_graph match. Note that
   # this modifies true_grad_graph and false_grad_graph.
   grad_inputs = _make_inputs_match(true_grad_graph, false_grad_graph,
-                                   true_grad_extra_inputs,
-                                   false_grad_extra_inputs)
+                                   true_grad_inputs, false_grad_inputs)
 
   # Add all intermediate tensors as function outputs so they're available for
   # higher-order gradient computations.
@@ -211,8 +190,8 @@ def _get_func_graphs(if_op):
   """
   def _get_func_graph_for_branch(branch_name):
     """Generates and returns a _FuncGraph for the given branch."""
-    extra_inputs = if_op.inputs[1:]  # First input is pred.
-    input_shapes = [t.shape for t in extra_inputs]
+    inputs = if_op.inputs[1:]  # First input is pred.
+    input_shapes = [t.shape for t in inputs]
     func_name = if_op.get_attr(branch_name).name
     fdef = if_op.graph._get_function(func_name).definition
     # `if_op.graph` may not be the same as `ops.get_default_graph()` e.g.
@@ -224,9 +203,8 @@ def _get_func_graphs(if_op):
     with if_op.graph.as_default():
       func_graph = _function_def_to_graph.function_def_to_graph(
           fdef, input_shapes)
-    func_graph.extra_inputs = extra_inputs
-    func_graph.extra_args = func_graph.inputs
-    func_graph._captured = dict(zip(extra_inputs, func_graph.inputs))
+    func_graph.captures = collections.OrderedDict(zip(inputs,
+                                                      func_graph.inputs))
     # Set the if op so that the gradient code can use it.
     func_graph._if = if_op
     return func_graph
@@ -282,12 +260,12 @@ def _grad_fn(func_graph, grads):
 
 def _create_grad_func(func_graph, grads, name):
   """Returns the _FuncGraph representation of _grad_fn."""
-  return _function.func_graph_from_py_func(lambda: _grad_fn(func_graph, grads),
-                                           [], [], name)
+  return _function.func_graph_from_py_func(
+      name, lambda: _grad_fn(func_graph, grads), [], {})
 
 
 def _resolve_grad_inputs(cond_graph, grad_graph):
-  """Returns the tensors to pass as `extra_inputs` to `grad_graph`.
+  """Returns the tensors to pass as inputs to `grad_graph`.
 
   The `grad_graph` may have external references to
   1. Its outer graph containing the input gradients. These references are kept
@@ -305,10 +283,10 @@ def _resolve_grad_inputs(cond_graph, grad_graph):
   Returns:
     A list of inputs tensors to be passed to grad_graph.
   """
-  new_extra_inputs = []
+  new_inputs = []
 
-  for t in grad_graph.extra_inputs:
-    if t.graph != grad_graph._outer_graph:
+  for t in grad_graph.external_captures:
+    if t.graph != grad_graph.outer_graph:
       # `t` is a tensor in `cond_graph` or one of its ancestors. We bubble this
       # tensor to the least common ancestor of the `cond_graph` and
       # `grad_graph` so that it is "in-scope" for `grad_graph`.
@@ -316,19 +294,19 @@ def _resolve_grad_inputs(cond_graph, grad_graph):
       # common ancestor once and re-use.
       assert _is_ancestor(cond_graph, t.graph)
       while not _is_ancestor(grad_graph, t.graph):
-        assert isinstance(t.graph, _function._FuncGraph)
-        if t in t.graph.extra_args:
-          # TODO(srbs): Consider building a map of extra_args -> extra_inputs.
-          # instead of searching for `t` twice.
-          t = t.graph.extra_inputs[t.graph.extra_args.index(t)]
+        assert isinstance(t.graph, _function.FuncGraph)
+        if t in t.graph.internal_captures:
+          # TODO(srbs): Consider building a map of internal_captures ->
+          # external_captures instead of searching for `t` twice.
+          t = t.graph.external_captures[t.graph.internal_captures.index(t)]
         else:
           # Note: All intermediate tensors are output by the If op.
           # TODO(srbs): .index() calls may be expensive. Optimize.
           t = t.graph._if.outputs[t.graph.outputs.index(t)]
       assert _is_ancestor(grad_graph, t.graph)
-    new_extra_inputs.append(t)
+    new_inputs.append(t)
 
-  return new_extra_inputs
+  return new_inputs
 
 
 def _create_new_tf_function(func_graph):
@@ -340,26 +318,9 @@ def _create_new_tf_function(func_graph):
   Returns:
     The name of the new TF_Function.
   """
-  c_func = c_api.TF_GraphToFunction_wrapper(
-      func_graph._c_graph,
-      compat.as_str(func_graph.name),
-      False,  # append_hash_to_fn_name
-      None,  # opers
-      [t._as_tf_output() for t in func_graph.inputs],
-      [t._as_tf_output() for t in func_graph.outputs],
-      [],
-      None,  # opts
-      None)  # description
-  _ = c_api_util.ScopedTFFunction(c_func)
-
-  # TODO(b/109833212): this sucks, we're serializing the TF_Function*,
-  # deserializing it into a Python FunctionDef, then reserializing it to create
-  # a new TF_Function that we add to the graph.
-  fdef = _function.function_def_from_tf_function(c_func)
-  defined_func = _function._from_definition(fdef)
-  defined_func._sub_functions = func_graph._functions
-  defined_func.add_to_graph(func_graph._outer_graph)
-
+  func = _function._EagerDefinedFunction(
+      func_graph.name, func_graph, func_graph.inputs, func_graph.outputs, {})
+  func.add_to_graph(func_graph.outer_graph)
   return func_graph.name
 
 
@@ -421,21 +382,20 @@ def _pad_params(true_graph, false_graph, true_params, false_params):
   return new_true_params, new_false_inputs
 
 
-def _make_inputs_match(true_graph, false_graph, true_extra_inputs,
-                       false_extra_inputs):
+def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
   """Modifies true_graph and false_graph so they have the same input signature.
 
   This method reorders and/or adds parameters to true_graph and false_graph so
-  they have the same input signature, and updates the 'inputs', 'extra_inputs',
-  and '_captured' fields of both graphs accordingly. It uses the input tensors
-  from the outer graph to avoid duplicating shared arguments.
+  they have the same input signature, and updates the 'inputs' and 'captured'
+  fields of both graphs accordingly. It uses the input tensors from the outer
+  graph to avoid duplicating shared arguments.
 
   Args:
     true_graph: function._FuncGraph
     false_graph: function._FuncGraph
-    true_extra_inputs: a list of Tensors in the outer graph. The inputs for
+    true_inputs: a list of Tensors in the outer graph. The inputs for
       true_graph.
-    false_extra_inputs: a list of Tensors in the outer graph. The inputs for
+    false_inputs: a list of Tensors in the outer graph. The inputs for
       false_graph.
 
   Returns:
@@ -444,12 +404,12 @@ def _make_inputs_match(true_graph, false_graph, true_extra_inputs,
     false_inputs.
   """
   shared_inputs, true_only_inputs, false_only_inputs = _separate_unique_inputs(
-      true_extra_inputs, false_extra_inputs)
+      true_inputs, false_inputs)
 
   new_inputs = shared_inputs + true_only_inputs + false_only_inputs
 
-  true_input_to_param = dict(zip(true_extra_inputs, true_graph.inputs))
-  false_input_to_param = dict(zip(false_extra_inputs, false_graph.inputs))
+  true_input_to_param = dict(zip(true_inputs, true_graph.inputs))
+  false_input_to_param = dict(zip(false_inputs, false_graph.inputs))
 
   true_graph.inputs = (
       [true_input_to_param[t] for t in shared_inputs] +
@@ -462,14 +422,10 @@ def _make_inputs_match(true_graph, false_graph, true_extra_inputs,
       [false_input_to_param[t] for t in false_only_inputs])
 
   # Rewrite the _FuncGraphs' state to reflect the new inputs.
-  true_graph.extra_inputs = new_inputs
-  false_graph.extra_inputs = new_inputs
-
-  true_graph.extra_args = true_graph.inputs
-  false_graph.extra_args = false_graph.inputs
-
-  true_graph._captured = dict(zip(new_inputs, true_graph.inputs))
-  false_graph._captured = dict(zip(new_inputs, false_graph.inputs))
+  true_graph.captures = collections.OrderedDict(zip(new_inputs,
+                                                    true_graph.inputs))
+  false_graph.captures = collections.OrderedDict(zip(new_inputs,
+                                                     false_graph.inputs))
 
   return new_inputs
 
@@ -506,10 +462,10 @@ def _get_grad_fn_name(func_graph):
   counter = 1
   has_conflict = True
   while has_conflict:
-    curr_graph = func_graph._outer_graph
+    curr_graph = func_graph.outer_graph
     has_conflict = curr_graph._is_function(name)
-    while not has_conflict and isinstance(curr_graph, _function._FuncGraph):
-      curr_graph = curr_graph._outer_graph
+    while not has_conflict and isinstance(curr_graph, _function.FuncGraph):
+      curr_graph = curr_graph.outer_graph
       has_conflict = curr_graph._is_function(name)
     if has_conflict:
       name = "%s_%s" % (base_name, counter)
@@ -534,6 +490,6 @@ def _check_same_outputs(true_graph, false_graph):
 def _is_ancestor(graph, maybe_ancestor):
   if maybe_ancestor == graph:
     return True
-  if isinstance(graph, _function._FuncGraph):
-    return _is_ancestor(graph._outer_graph, maybe_ancestor)
+  if isinstance(graph, _function.FuncGraph):
+    return _is_ancestor(graph.outer_graph, maybe_ancestor)
   return False
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index f84ff4ddf0..e3c1aa3d5a 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -14,7 +14,8 @@
 # ==============================================================================
 """Control Flow Operations.
 
-See the @{$python/control_flow_ops} guide.
+See the [Control
+Flow](https://tensorflow.org/api_guides/python/control_flow_ops) guide.
 """
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -1965,8 +1966,12 @@ def cond(pred,
   `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
   `false_fn` must have the same non-zero number and type of outputs.
 
-  Note that the conditional execution applies only to the operations defined in
-  `true_fn` and `false_fn`. Consider the following simple program:
+  **WARNING**: Any Tensors or Operations created outside of `true_fn` and
+  `false_fn` will be executed regardless of which branch is selected at runtime.
+
+  Although this behavior is consistent with the dataflow model of TensorFlow,
+  it has frequently surprised users who expected a lazier semantics.
+  Consider the following simple program:
 
   ```python
   z = tf.multiply(a, b)
@@ -1977,8 +1982,6 @@ def cond(pred,
   operation will not be executed. Since `z` is needed for at least one
   branch of the `cond`, the `tf.multiply` operation is always executed,
   unconditionally.
-  Although this behavior is consistent with the dataflow model of TensorFlow,
-  it has occasionally surprised some users who expected a lazier semantics.
 
   Note that `cond` calls `true_fn` and `false_fn` *exactly once* (inside the
   call to `cond`, and not at all during `Session.run()`). `cond`
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 153548ae92..2c42176158 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -153,7 +153,7 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
       const_with_dep = control_flow_ops.with_dependencies(
           (increment_counter, constant_op.constant(42)),
           constant_op.constant(7))
-      with self.test_session():
+      with self.cached_session():
         variables.global_variables_initializer().run()
         self.assertEquals(0, counter.eval())
         self.assertEquals(7, const_with_dep.eval())
@@ -167,7 +167,7 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
       const_with_dep = control_flow_ops.with_dependencies(
           [increment_counter, constant_op.constant(42)],
           constant_op.constant(7))
-      with self.test_session():
+      with self.cached_session():
         variables.global_variables_initializer().run()
         self.assertEquals(0, counter.eval())
         self.assertEquals(7, const_with_dep.eval())
@@ -177,7 +177,7 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
 class SwitchTestCase(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesWithDenseShape(self):
-    with self.test_session():
+    with self.cached_session():
       data = ops.IndexedSlices(
           constant_op.constant([1, 2, 3]),
           constant_op.constant([0, 1]),
@@ -208,7 +208,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
                        constant_op.constant(0.0)])
       optimizer = momentum.MomentumOptimizer(0.1, 0.9)
       train_op = optimizer.minimize(cost)
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         sess.run(variables.global_variables_initializer())
         for _ in range(10):
           sess.run([train_op])
@@ -231,7 +231,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       _, cost = control_flow_ops.while_loop(
           cond, body, [constant_op.constant(0),
                        constant_op.constant(0.0)])
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual(10.0, cost.eval())
 
@@ -268,7 +268,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       static_grads = math_ops.segment_sum(static_grads.values,
                                           static_grads.indices)
 
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
 
@@ -280,7 +280,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesWithShapeGradientInWhileLoop(self):
     for dtype in [dtypes.float32, dtypes.float64]:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         num_steps = 9
 
         inputs = array_ops.placeholder(dtype=dtype, shape=[num_steps])
@@ -309,7 +309,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesWithDynamicShapeGradientInWhileLoop(self):
     for dtype in [dtypes.float32, dtypes.float64]:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         inputs = array_ops.placeholder(dtype=dtype)
         initial_outputs = tensor_array_ops.TensorArray(
             dtype=dtype, dynamic_size=True, size=1)
@@ -335,7 +335,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         self.assertAllEqual(grad, [1] * 3)
 
   def testGradientThroughSingleBranchOutsideOfContext(self):
-    with self.test_session():
+    with self.cached_session():
       x = constant_op.constant(2.)
       s = constant_op.constant(True)
       x_false, x_true = control_flow_ops.switch(x, s)
@@ -434,7 +434,7 @@ class CondTest(test_util.TensorFlowTestCase):
 class ContextTest(test_util.TensorFlowTestCase):
 
   def testCondContext(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       x = constant_op.constant(2)
       y = constant_op.constant(5)
       control_flow_ops.cond(
@@ -448,7 +448,7 @@ class ContextTest(test_util.TensorFlowTestCase):
               control_flow_ops.CondContext.from_proto(c.to_proto()).to_proto())
 
   def _testWhileContextHelper(self, maximum_iterations=None):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       i = constant_op.constant(0)
       c = lambda i: math_ops.less(i, 10)
       b = lambda i: math_ops.add(i, 1)
@@ -469,7 +469,7 @@ class ContextTest(test_util.TensorFlowTestCase):
     self._testWhileContextHelper(maximum_iterations=10)
 
   def testControlContextImportScope(self):
-    with self.test_session():
+    with self.cached_session():
       constant_op.constant(0, name="a")
       constant_op.constant(2, name="test_scope/a")
       b1 = constant_op.constant(1, name="b")
@@ -562,7 +562,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     output_case = control_flow_ops.case([(condition, fn_true)], fn_false,
                                         strict=strict)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       variables.global_variables_initializer().run()
       true_feed_dict = {condition: True}
       true_feed_dict.update(feed_dict)
@@ -884,7 +884,7 @@ class CaseTest(test_util.TensorFlowTestCase):
                   (math_ops.equal(x, 2), lambda: constant_op.constant(4))]
     default = lambda: constant_op.constant(6)
     output = control_flow_ops.case(conditions, default, exclusive=True)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 6)
@@ -896,7 +896,7 @@ class CaseTest(test_util.TensorFlowTestCase):
                   (math_ops.equal(x, 2), lambda: constant_op.constant(6))]
     default = lambda: constant_op.constant(8)
     output = control_flow_ops.case(conditions, default, exclusive=True)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 8)
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
@@ -909,7 +909,7 @@ class CaseTest(test_util.TensorFlowTestCase):
                   (math_ops.equal(x, 2), lambda: constant_op.constant(6))]
     default = lambda: constant_op.constant(8)
     output = control_flow_ops.case(conditions, default, exclusive=False)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 8)
@@ -920,7 +920,7 @@ class CaseTest(test_util.TensorFlowTestCase):
                   (math_ops.equal(x, 2), lambda: constant_op.constant(4)),
                   (math_ops.equal(x, 3), lambda: constant_op.constant(6))]
     output = control_flow_ops.case(conditions, exclusive=True)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 6)
@@ -931,7 +931,7 @@ class CaseTest(test_util.TensorFlowTestCase):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2))]
     output = control_flow_ops.case(conditions, exclusive=True)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
diff --git a/tensorflow/python/ops/dequantize_op_test.py b/tensorflow/python/ops/dequantize_op_test.py
index 31338db0dd..13e50273d8 100644
--- a/tensorflow/python/ops/dequantize_op_test.py
+++ b/tensorflow/python/ops/dequantize_op_test.py
@@ -32,7 +32,7 @@ class DequantizeOpTest(test.TestCase):
     super(DequantizeOpTest, self).__init__(method_name)
 
   def _testDequantizeOp(self, inputs, min_range, max_range, dtype):
-    with self.test_session():
+    with self.cached_session():
       input_op = constant_op.constant(inputs, shape=[len(inputs)], dtype=dtype)
       dequantized = array_ops.dequantize(input_op, min_range, max_range)
       tf_ans = dequantized.eval()
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 7b9e7de145..f97fca47ea 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -427,6 +427,8 @@ def embedding_lookup_sparse(params,
 
     embeddings = embedding_lookup(
         params, ids, partition_strategy=partition_strategy, max_norm=max_norm)
+    if embeddings.dtype in (dtypes.float16, dtypes.bfloat16):
+      embeddings = math_ops.to_float(embeddings)
     if not ignore_weights:
       weights = sp_weights.values
       if weights.dtype != embeddings.dtype:
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 4ecc74675a..a6be82673f 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -15,7 +15,8 @@
 
 """Functional operations.
 
-See the @{$python/functional_ops} guide.
+See the [Higher Order
+Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/gradient_checker_test.py b/tensorflow/python/ops/gradient_checker_test.py
index b0ecdc6a50..fbb84b9018 100644
--- a/tensorflow/python/ops/gradient_checker_test.py
+++ b/tensorflow/python/ops/gradient_checker_test.py
@@ -76,7 +76,7 @@ class GradientCheckerTest(test.TestCase):
 
   def testAddCustomized(self):
     np.random.seed(3)  # Fix seed to avoid flakiness
-    with self.test_session():
+    with self.cached_session():
       # a test case for Add operation
       size = (2, 3)
       x1 = constant_op.constant(
@@ -94,7 +94,7 @@ class GradientCheckerTest(test.TestCase):
 
   def testGather(self):
     np.random.seed(4)  # Fix seed to avoid flakiness
-    with self.test_session():
+    with self.cached_session():
       p_shape = (4, 2)
       p_size = 8
       index_values = [1, 3]
@@ -111,7 +111,7 @@ class GradientCheckerTest(test.TestCase):
 
   def testNestedGather(self):
     np.random.seed(5)  # Fix seed to avoid flakiness
-    with self.test_session():
+    with self.cached_session():
       p_shape = (8, 2)
       p_size = 16
       index_values = [1, 3, 5, 6]
@@ -131,7 +131,7 @@ class GradientCheckerTest(test.TestCase):
     assert error < 1e-4
 
   def testComplexMul(self):
-    with self.test_session():
+    with self.cached_session():
       size = ()
       c = constant_op.constant(5 + 7j, dtype=dtypes.complex64)
       x = constant_op.constant(11 - 13j, dtype=dtypes.complex64)
@@ -145,7 +145,7 @@ class GradientCheckerTest(test.TestCase):
           gradient_checker.compute_gradient_error(x, size, y, size), 2e-4)
 
   def testComplexConj(self):
-    with self.test_session():
+    with self.cached_session():
       size = ()
       x = constant_op.constant(11 - 13j, dtype=dtypes.complex64)
       y = math_ops.conj(x)
@@ -158,7 +158,7 @@ class GradientCheckerTest(test.TestCase):
           gradient_checker.compute_gradient_error(x, size, y, size), 2e-5)
 
   def testEmptySucceeds(self):
-    with self.test_session():
+    with self.cached_session():
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.identity(x)
       for grad in gradient_checker.compute_gradient(x, (0, 3), y, (0, 3)):
@@ -168,7 +168,7 @@ class GradientCheckerTest(test.TestCase):
 
   def testEmptyFails(self):
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g):
+      with self.session(graph=g):
         x = array_ops.placeholder(dtypes.float32)
         with g.gradient_override_map({"Identity": "BadGrad"}):
           y = array_ops.identity(x)
@@ -180,7 +180,7 @@ class GradientCheckerTest(test.TestCase):
 
   def testNaNGradFails(self):
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g):
+      with self.session(graph=g):
         x = array_ops.placeholder(dtypes.float32)
         with g.gradient_override_map({"Identity": "NaNGrad"}):
           y = array_ops.identity(x)
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index d02fcf4ee2..fa9910b351 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -159,7 +159,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
   def testBoundaryContinue(self):
     # Test that we differentiate both 'x' and 'y' correctly when x is a
     # predecessor of y.
-    with self.test_session():
+    with self.cached_session():
       x = constant(1.0)
       y = x * 2.0
       z = y * 3.0
@@ -168,7 +168,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(6.0, grads[0].eval())
 
   def testAggregationMethodAccumulateN(self):
-    with self.test_session():
+    with self.cached_session():
       x = constant(1.0)
       y = x * 2.0
       z = y + y + y + y + y + y + y + y + y + y
@@ -181,7 +181,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(10.0, grads[1].eval())
 
   def testAggregationMethodAddN(self):
-    with self.test_session():
+    with self.cached_session():
       x = constant(1.0)
       y = x * 2.0
       z = y + y + y + y + y + y + y + y + y + y
@@ -192,7 +192,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(10.0, grads[1].eval())
 
   def testAggregationMethodTree(self):
-    with self.test_session():
+    with self.cached_session():
       x = constant(1.0)
       y = x * 2.0
       z = y + y + y + y + y + y + y + y + y + y
@@ -232,7 +232,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
           array_ops.placeholder(dtypes.int32))
       dx, = gradients.gradients(y, x, grad_ys=dy)
       # The IndexedSlices gradient of tf.identity is the identity map.
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         vdx, vdy = sess.run(
             [dx, dy], feed_dict={x: [1.0], dy.indices: [0], dy.values: [2.0]})
       self.assertEqual(vdx, vdy)
@@ -276,7 +276,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertIsNotNone(gradient)
 
   def testDependentYs(self):
-    with self.test_session():
+    with self.cached_session():
       x = constant_op.constant(3.0)
       y = math_ops.square(x)
       y1 = math_ops.square(y)
@@ -291,7 +291,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(17502.0, g[0].eval())
 
   def testPartialDerivatives(self):
-    with self.test_session():
+    with self.cached_session():
       x = constant_op.constant(1.)
       y = 2 * x
       z = x + y
@@ -341,7 +341,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
                           constants=constants, variables=variables_))
 
     # evaluate all tensors in one call to session.run for speed
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       results = sess.run([(case["grad1"], case["grad2"]) for case in cases])
 
     for (npgrad1, npgrad2), case in zip(results, cases):
@@ -378,7 +378,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
     y = f(x, b)
     grads = gradients.gradients(y, [x, b])
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       return sess.run(grads)
 
   def testFunctionGradientsBasic(self):
@@ -401,7 +401,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
       # Build gradient graph (should add SymbolicGradient node for function).
       grads = gradients.gradients(y, [x, b1])
 
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertAllEqual([40.0], sess.run(grads)[0])
         self.assertAllEqual([10.0], sess.run(grads)[1])
 
@@ -448,7 +448,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
         return g[0]
 
       f = Foo()
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertEqual(sess.run(f), 2.0)
 
   def testGradientOfCaptured(self):
@@ -462,7 +462,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
         return g[0]
 
       f = Foo()
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertEqual(sess.run(f), 2.0)
 
   def testCapturedResourceVariable(self):
@@ -476,7 +476,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
         return g[0]
 
       f = Foo()
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         sess.run(variables.global_variables_initializer())
         self.assertEqual(sess.run(f), 2.0)
 
@@ -501,7 +501,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
         return Inner()
 
       x1_grad, x2_grad = Outer()
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         # 1.0 + None + 2.0 + 1.0 = 4.0
         self.assertEqual(sess.run(x1_grad), 4.0)
         # None + 1.0 + 1.0 + None = 2.0
@@ -524,7 +524,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
         return Inner()
 
       z_grad = Outer()
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         self.assertEqual(sess.run(z_grad), 3.0)
 
 
@@ -667,7 +667,7 @@ class HessianTest(test_util.TensorFlowTestCase):
 class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesToTensor(self):
-    with self.test_session():
+    with self.cached_session():
       np_val = np.random.rand(4, 4, 4, 4).astype(np.float32)
       c = constant_op.constant(np_val)
       c_sparse = math_ops._as_indexed_slices(c)
@@ -676,7 +676,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
       self.assertAllClose(np_val, c_dense.eval())
 
   def testIndexedSlicesToTensorList(self):
-    with self.test_session():
+    with self.cached_session():
       numpy_list = []
       dense_list = []
       sparse_list = []
@@ -692,7 +692,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
       self.assertAllClose(packed_dense.eval(), packed_sparse.eval())
 
   def testInt64Indices(self):
-    with self.test_session():
+    with self.cached_session():
       np_val = np.random.rand(4, 4, 4, 4).astype(np.float32)
       c = constant_op.constant(np_val)
       c_sparse = math_ops._as_indexed_slices(c)
@@ -938,7 +938,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       F(x)
 
   def testRVGradientsDynamicCond(self):
-    with self.test_session():
+    with self.cached_session():
       alpha = resource_variable_ops.ResourceVariable(
           np.random.random((1,)),
           dtype="float32")
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index e86a8e5a5b..7291e05685 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 # pylint: disable=g-short-docstring-punctuation
 """Histograms.
-
-Please see @{$python/histogram_ops} guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/histogram_ops_test.py b/tensorflow/python/ops/histogram_ops_test.py
index 2e57ae8a2d..1ba805dbb4 100644
--- a/tensorflow/python/ops/histogram_ops_test.py
+++ b/tensorflow/python/ops/histogram_ops_test.py
@@ -35,7 +35,7 @@ class BinValuesFixedWidth(test.TestCase):
     value_range = [0.0, 5.0]
     values = []
     expected_bins = []
-    with self.test_session():
+    with self.cached_session():
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
@@ -47,7 +47,7 @@ class BinValuesFixedWidth(test.TestCase):
     value_range = [0.0, 5.0]
     values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
     expected_bins = [0, 0, 1, 2, 4, 4]
-    with self.test_session():
+    with self.cached_session():
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5, dtype=dtypes.int64)
       self.assertEqual(dtypes.int32, bins.dtype)
@@ -59,7 +59,7 @@ class BinValuesFixedWidth(test.TestCase):
     value_range = np.float64([0.0, 5.0])
     values = np.float64([-1.0, 0.0, 1.5, 2.0, 5.0, 15])
     expected_bins = [0, 0, 1, 2, 4, 4]
-    with self.test_session():
+    with self.cached_session():
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
@@ -72,7 +72,7 @@ class BinValuesFixedWidth(test.TestCase):
     values = constant_op.constant(
         [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]], shape=(2, 3))
     expected_bins = [[0, 0, 1], [2, 4, 4]]
-    with self.test_session():
+    with self.cached_session():
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py
index 75d00c8ed1..fddde75f6b 100644
--- a/tensorflow/python/ops/image_grad_test.py
+++ b/tensorflow/python/ops/image_grad_test.py
@@ -108,7 +108,7 @@ class ResizeBilinearOpTest(test.TestCase):
 
     x = np.arange(0, 4).reshape(in_shape).astype(np.float32)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       input_tensor = constant_op.constant(x, shape=in_shape)
       resize_out = image_ops.resize_bilinear(input_tensor, out_shape[1:3])
       self.assertEqual(out_shape, list(resize_out.get_shape()))
@@ -122,7 +122,7 @@ class ResizeBilinearOpTest(test.TestCase):
 
     x = np.arange(0, 6).reshape(in_shape).astype(np.float32)
 
-    with self.test_session():
+    with self.cached_session():
       input_tensor = constant_op.constant(x, shape=in_shape)
       resize_out = image_ops.resize_bilinear(input_tensor, out_shape[1:3])
       err = gradient_checker.compute_gradient_error(
@@ -135,7 +135,7 @@ class ResizeBilinearOpTest(test.TestCase):
 
     x = np.arange(0, 24).reshape(in_shape).astype(np.float32)
 
-    with self.test_session():
+    with self.cached_session():
       input_tensor = constant_op.constant(x, shape=in_shape)
       resize_out = image_ops.resize_bilinear(input_tensor, out_shape[1:3])
       err = gradient_checker.compute_gradient_error(
@@ -165,7 +165,7 @@ class ResizeBilinearOpTest(test.TestCase):
     out_shape = [1, 2, 3, 1]
     x = np.arange(0, 24).reshape(in_shape)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for dtype in [np.float16, np.float32, np.float64]:
         input_tensor = constant_op.constant(x.astype(dtype), shape=in_shape)
         resize_out = image_ops.resize_bilinear(input_tensor, out_shape[1:3])
@@ -190,7 +190,7 @@ class ResizeBicubicOpTest(test.TestCase):
     x = np.arange(0, 4).reshape(in_shape).astype(np.float32)
 
     for align_corners in [True, False]:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         input_tensor = constant_op.constant(x, shape=in_shape)
         resize_out = image_ops.resize_bicubic(input_tensor, out_shape[1:3],
                                               align_corners=align_corners)
@@ -206,7 +206,7 @@ class ResizeBicubicOpTest(test.TestCase):
     x = np.arange(0, 6).reshape(in_shape).astype(np.float32)
 
     for align_corners in [True, False]:
-      with self.test_session():
+      with self.cached_session():
         input_tensor = constant_op.constant(x, shape=in_shape)
         resize_out = image_ops.resize_bicubic(input_tensor, out_shape[1:3],
                                               align_corners=align_corners)
@@ -221,7 +221,7 @@ class ResizeBicubicOpTest(test.TestCase):
     x = np.arange(0, 24).reshape(in_shape).astype(np.float32)
 
     for align_corners in [True, False]:
-      with self.test_session():
+      with self.cached_session():
         input_tensor = constant_op.constant(x, shape=in_shape)
         resize_out = image_ops.resize_bicubic(input_tensor, out_shape[1:3],
                                               align_corners=align_corners)
@@ -235,7 +235,7 @@ class ResizeBicubicOpTest(test.TestCase):
 
     x = np.arange(0, 24).reshape(in_shape).astype(np.uint8)
 
-    with self.test_session():
+    with self.cached_session():
       input_tensor = constant_op.constant(x, shape=in_shape)
       resize_out = image_ops.resize_bicubic(input_tensor, out_shape[1:3])
       grad = gradients_impl.gradients(input_tensor, [resize_out])
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 343531ac55..3de46e7cf3 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -16,7 +16,7 @@
 # pylint: disable=g-short-docstring-punctuation
 """Image processing and decoding ops.
 
-See the @{$python/image} guide.
+See the [Images](https://tensorflow.org/api_guides/python/image) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 2c61bb232a..f7502c4018 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -238,7 +238,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
   def test_adjust_gamma_one(self):
     """Same image should be returned for gamma equal to one"""
-    with self.test_session():
+    with self.cached_session():
       x_data = np.random.uniform(0, 255, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
 
@@ -252,7 +252,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
   def test_adjust_gamma_less_zero(self):
     """White image should be returned for gamma equal to zero"""
-    with self.test_session():
+    with self.cached_session():
       x_data = np.random.uniform(0, 255, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
 
@@ -270,7 +270,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
   def test_adjust_gamma_less_zero_tensor(self):
     """White image should be returned for gamma equal to zero"""
-    with self.test_session():
+    with self.cached_session():
       x_data = np.random.uniform(0, 255, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
 
@@ -290,7 +290,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
   def test_adjust_gamma_zero(self):
     """White image should be returned for gamma equal to zero"""
-    with self.test_session():
+    with self.cached_session():
       x_data = np.random.uniform(0, 255, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
 
@@ -308,7 +308,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
   def test_adjust_gamma_less_one(self):
     """Verifying the output with expected results for gamma
     correction with gamma equal to half"""
-    with self.test_session():
+    with self.cached_session():
       x_np = np.arange(0, 255, 4, np.uint8).reshape(8, 8)
       y = image_ops.adjust_gamma(x_np, gamma=0.5)
       y_tf = np.trunc(y.eval())
@@ -329,7 +329,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
   def test_adjust_gamma_greater_one(self):
     """Verifying the output with expected results for gamma
     correction with gamma equal to two"""
-    with self.test_session():
+    with self.cached_session():
       x_np = np.arange(0, 255, 4, np.uint8).reshape(8, 8)
       y = image_ops.adjust_gamma(x_np, gamma=2)
       y_tf = np.trunc(y.eval())
@@ -2367,7 +2367,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
     for opt in self.OPTIONS:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         image = constant_op.constant(img_np, shape=img_shape)
         y = image_ops.resize_images(image, [height, width], opt)
         yshape = array_ops.shape(y)
@@ -3076,7 +3076,7 @@ class JpegTest(test_util.TensorFlowTestCase):
         self.assertLess(error, 4)
 
   def testCropAndDecodeJpeg(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Encode it, then decode it, then encode it
       base = "tensorflow/core/lib/jpeg/testdata"
       jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
@@ -3102,7 +3102,7 @@ class JpegTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(image1_crop, image2)
 
   def testCropAndDecodeJpegWithInvalidCropWindow(self):
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       # Encode it, then decode it, then encode it
       base = "tensorflow/core/lib/jpeg/testdata"
       jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
@@ -3577,7 +3577,7 @@ class FormatTest(test_util.TensorFlowTestCase):
         "png": functools.partial(image_ops.decode_png, channels=3),
         "gif": lambda s: array_ops.squeeze(image_ops.decode_gif(s), axis=0),
     }
-    with self.test_session():
+    with self.cached_session():
       for path in paths:
         contents = io_ops.read_file(os.path.join(prefix, path)).eval()
         images = {}
@@ -3592,7 +3592,7 @@ class FormatTest(test_util.TensorFlowTestCase):
 
   def testError(self):
     path = "tensorflow/core/lib/gif/testdata/scan.gif"
-    with self.test_session():
+    with self.cached_session():
       for decode in image_ops.decode_jpeg, image_ops.decode_png:
         with self.assertRaisesOpError(r"Got 12 frames"):
           decode(io_ops.read_file(path)).eval()
@@ -3606,7 +3606,7 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
     scores_np = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3]
     max_output_size_np = 3
     iou_threshold_np = 0.5
-    with self.test_session():
+    with self.cached_session():
       boxes = constant_op.constant(boxes_np)
       scores = constant_op.constant(scores_np)
       max_output_size = constant_op.constant(max_output_size_np)
@@ -3686,7 +3686,7 @@ class NonMaxSuppressionPaddedTest(test_util.TensorFlowTestCase):
     # The output shape of the padded operation must be fully defined.
     self.assertEqual(selected_indices_padded.shape.is_fully_defined(), True)
     self.assertEqual(selected_indices.shape.is_fully_defined(), False)
-    with self.test_session():
+    with self.cached_session():
       self.assertAllClose(selected_indices_padded.eval(), [3, 0, 5, 0, 0])
       self.assertEqual(num_valid_padded.eval(), 3)
       self.assertAllClose(selected_indices.eval(), [3, 0, 5])
@@ -4035,7 +4035,7 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
     expected_dx = np.reshape([[2, 1, -2, 0], [-1, -2, 1, 0]], shape)
 
     dy, dx = image_ops.image_gradients(img)
-    with self.test_session():
+    with self.cached_session():
       actual_dy = dy.eval()
       actual_dx = dx.eval()
       self.assertAllClose(expected_dy, actual_dy)
diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py
index f6fffa9079..6a1fe17119 100644
--- a/tensorflow/python/ops/init_ops_test.py
+++ b/tensorflow/python/ops/init_ops_test.py
@@ -55,7 +55,7 @@ class InitializersTest(test.TestCase):
 
   def test_uniform(self):
     tensor_shape = (9, 6, 7)
-    with self.test_session():
+    with self.cached_session():
       self._runner(
           init_ops.RandomUniform(minval=-1, maxval=1, seed=124),
           tensor_shape,
@@ -65,7 +65,7 @@ class InitializersTest(test.TestCase):
 
   def test_normal(self):
     tensor_shape = (8, 12, 99)
-    with self.test_session():
+    with self.cached_session():
       self._runner(
           init_ops.RandomNormal(mean=0, stddev=1, seed=153),
           tensor_shape,
@@ -74,7 +74,7 @@ class InitializersTest(test.TestCase):
 
   def test_truncated_normal(self):
     tensor_shape = (12, 99, 7)
-    with self.test_session():
+    with self.cached_session():
       self._runner(
           init_ops.TruncatedNormal(mean=0, stddev=1, seed=126),
           tensor_shape,
@@ -84,7 +84,7 @@ class InitializersTest(test.TestCase):
 
   def test_constant(self):
     tensor_shape = (5, 6, 4)
-    with self.test_session():
+    with self.cached_session():
       self._runner(
           init_ops.Constant(2),
           tensor_shape,
@@ -94,7 +94,7 @@ class InitializersTest(test.TestCase):
 
   def test_lecun_uniform(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(1. / fan_in)
       self._runner(
@@ -105,7 +105,7 @@ class InitializersTest(test.TestCase):
 
   def test_glorot_uniform_initializer(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / (fan_in + fan_out))
       self._runner(
@@ -116,7 +116,7 @@ class InitializersTest(test.TestCase):
 
   def test_he_uniform(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / fan_in)
       self._runner(
@@ -127,7 +127,7 @@ class InitializersTest(test.TestCase):
 
   def test_lecun_normal(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(1. / fan_in)
       self._runner(
@@ -138,7 +138,7 @@ class InitializersTest(test.TestCase):
 
   def test_glorot_normal_initializer(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / (fan_in + fan_out))
       self._runner(
@@ -149,7 +149,7 @@ class InitializersTest(test.TestCase):
 
   def test_he_normal(self):
     tensor_shape = (5, 6, 4, 2)
-    with self.test_session():
+    with self.cached_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
       std = np.sqrt(2. / fan_in)
       self._runner(
@@ -160,11 +160,11 @@ class InitializersTest(test.TestCase):
 
   def test_Orthogonal(self):
     tensor_shape = (20, 20)
-    with self.test_session():
+    with self.cached_session():
       self._runner(init_ops.Orthogonal(seed=123), tensor_shape, target_mean=0.)
 
   def test_Identity(self):
-    with self.test_session():
+    with self.cached_session():
       tensor_shape = (3, 4, 5)
       with self.assertRaises(ValueError):
         self._runner(
@@ -182,13 +182,13 @@ class InitializersTest(test.TestCase):
 
   def test_Zeros(self):
     tensor_shape = (4, 5)
-    with self.test_session():
+    with self.cached_session():
       self._runner(
           init_ops.Zeros(), tensor_shape, target_mean=0., target_max=0.)
 
   def test_Ones(self):
     tensor_shape = (4, 5)
-    with self.test_session():
+    with self.cached_session():
       self._runner(init_ops.Ones(), tensor_shape, target_mean=1., target_max=1.)
 
 
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index b5274ef2ed..fbc1350c61 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -16,7 +16,8 @@
 # pylint: disable=line-too-long
 """Inputs and Readers.
 
-See the @{$python/io_ops} guide.
+See the [Inputs and
+Readers](https://tensorflow.org/api_guides/python/io_ops) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index fb51fbc626..561a341cf3 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -22,6 +22,7 @@ import collections
 import functools
 import six
 
+from tensorflow.python.compat import compat as fwd_compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -299,6 +300,7 @@ class HashTable(InitializableLookupTableBase):
         self._value_shape))
     return exported_keys, exported_values
 
+
 class TableInitializerBase(object):
   """Base class for lookup table initializers."""
 
@@ -370,8 +372,13 @@ class KeyValueTensorInitializer(TableInitializerBase):
         # Ensure a unique name when eager execution is enabled to avoid spurious
         # sharing issues.
         scope += str(ops.uid())
-      init_op = gen_lookup_ops.initialize_table_v2(
-          table.table_ref, self._keys, self._values, name=scope)
+      if fwd_compat.forward_compatible(2018, 9, 19):
+        init_op = gen_lookup_ops.lookup_table_import_v2(
+            table.table_ref, self._keys, self._values, name=scope)
+      else:
+        # To maintain forward compatibiltiy, use the old implementation.
+        init_op = gen_lookup_ops.initialize_table_v2(
+            table.table_ref, self._keys, self._values, name=scope)
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     return init_op
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 51fb4cbac8..806539747e 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -193,7 +193,7 @@ def compute_weighted_loss(
     gradient, you need to apply `tf.stop_gradient` to `weights` before
     passing them to `compute_weighted_loss`.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -266,7 +266,7 @@ def absolute_difference(
       `labels` or if the shape of `weights` is invalid or if `labels`
       or `predictions` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -317,7 +317,7 @@ def cosine_distance(
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `axis`, `labels`, `predictions` or `weights` is `None`.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -369,7 +369,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
     ValueError: If the shapes of `logits` and `labels` don't match or
       if `labels` or `logits` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -437,7 +437,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
       if the shape of `weights` is invalid.  Also if `labels` or
      `predictions` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -503,7 +503,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
       if the shape of `weights` is invalid.  Also if `labels` or `predictions`
       is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -571,7 +571,7 @@ def mean_pairwise_squared_error(
       if the shape of `weights` is invalid.  Also if `labels` or `predictions`
       is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -654,7 +654,7 @@ def mean_squared_error(
       if the shape of `weights` is invalid.  Also if `labels` or `predictions`
       is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -711,7 +711,7 @@ def sigmoid_cross_entropy(
       `multi_class_labels` or if the shape of `weights` is invalid, or if
       `weights` is None.  Also if `multi_class_labels` or `logits` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -777,7 +777,7 @@ def softmax_cross_entropy(
       or if the shape of `weights` is invalid or if `weights` is None.  Also if
       `onehot_labels` or `logits` is None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
@@ -894,7 +894,7 @@ def sparse_softmax_cross_entropy(
     ValueError: If the shapes of `logits`, `labels`, and `weights` are
       incompatible, or if any of them are None.
 
-  @compatbility(eager)
+  @compatibility(eager)
   The `loss_collection` argument is ignored when executing eagerly. Consider
   holding on to the return value or collecting losses via a `tf.keras.Model`.
   @end_compatibility
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 2a7a2fd51f..8e11c4bce1 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -972,9 +972,9 @@ def _RealDivGrad(op, grad):
                   grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), ry), sy))
 
 
-@ops.RegisterGradient("UnsafeDiv")
-def _UnsafeDivGrad(op, grad):
-  """UnsafeDiv op gradient."""
+@ops.RegisterGradient("DivNoNan")
+def _DivNoNanGrad(op, grad):
+  """DivNoNan op gradient."""
   x = op.inputs[0]
   y = op.inputs[1]
   sx = array_ops.shape(x)
@@ -983,10 +983,10 @@ def _UnsafeDivGrad(op, grad):
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(
-      math_ops.reduce_sum(math_ops.unsafe_div(grad, y), rx), sx),
+      math_ops.reduce_sum(math_ops.div_no_nan(grad, y), rx), sx),
           array_ops.reshape(
               math_ops.reduce_sum(
-                  grad * math_ops.unsafe_div(math_ops.unsafe_div(-x, y), y),
+                  grad * math_ops.div_no_nan(math_ops.div_no_nan(-x, y), y),
                   ry), sy))
 
 
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index f9bb60e7fe..7110e0958c 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -102,14 +102,14 @@ class MinOrMaxGradientTest(test.TestCase):
   def testMinGradient(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     outputs = math_ops.reduce_min(array_ops.concat([inputs, inputs], 0))
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(inputs, [1], outputs, [])
       self.assertLess(error, 1e-4)
 
   def testMaxGradient(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     outputs = math_ops.reduce_max(array_ops.concat([inputs, inputs], 0))
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(inputs, [1], outputs, [])
       self.assertLess(error, 1e-4)
 
@@ -119,14 +119,14 @@ class MaximumOrMinimumGradientTest(test.TestCase):
   def testMaximumGradient(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
     outputs = math_ops.maximum(inputs, 3.0)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
       self.assertLess(error, 1e-4)
 
   def testMinimumGradient(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
     outputs = math_ops.minimum(inputs, 2.0)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
       self.assertLess(error, 1e-4)
 
@@ -137,7 +137,7 @@ class ProdGradientTest(test.TestCase):
     inputs = constant_op.constant([[1., 2.], [3., 4.]],
                                   dtype=dtypes.float32)
     outputs = math_ops.reduce_prod(inputs)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(
           inputs, inputs.get_shape().as_list(),
           outputs, outputs.get_shape().as_list())
@@ -147,7 +147,7 @@ class ProdGradientTest(test.TestCase):
     inputs = constant_op.constant([[1., 2.], [3., 4.]],
                                   dtype=dtypes.float32)
     outputs = math_ops.reduce_prod(inputs, -1)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(
           inputs, inputs.get_shape().as_list(),
           outputs, outputs.get_shape().as_list())
@@ -158,7 +158,7 @@ class ProdGradientTest(test.TestCase):
       inputs = constant_op.constant([[1 + 3j, 2 - 1j], [3j, 4]],
                                     dtype=dtype)
       outputs = math_ops.reduce_prod(inputs)
-      with self.test_session():
+      with self.cached_session():
         error = gradient_checker.compute_gradient_error(
             inputs, inputs.get_shape().as_list(),
             outputs, outputs.get_shape().as_list())
@@ -169,7 +169,7 @@ class ProdGradientTest(test.TestCase):
       inputs = constant_op.constant([[1 + 3j, 2 - 1j], [3j, 4]],
                                     dtype=dtype)
       outputs = math_ops.reduce_prod(inputs, -1)
-      with self.test_session():
+      with self.cached_session():
         error = gradient_checker.compute_gradient_error(
             inputs, inputs.get_shape().as_list(),
             outputs, outputs.get_shape().as_list())
@@ -182,7 +182,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
     data = constant_op.constant([1.0, 2.0, 3.0], dtype=dtypes.float32)
     segment_ids = constant_op.constant([0, 0, 1], dtype=dtypes.int64)
     segment_min = math_ops.segment_min(data, segment_ids)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(data, [3], segment_min,
                                                       [2])
       self.assertLess(error, 1e-4)
@@ -191,7 +191,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
     data = constant_op.constant([1.0, 2.0, 3.0], dtype=dtypes.float32)
     segment_ids = constant_op.constant([0, 0, 1], dtype=dtypes.int64)
     segment_max = math_ops.segment_max(data, segment_ids)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(data, [3], segment_max,
                                                       [2])
       self.assertLess(error, 1e-4)
@@ -201,7 +201,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
     data = array_ops.concat([inputs, inputs], 0)
     segment_ids = constant_op.constant([0, 0], dtype=dtypes.int64)
     segment_min = math_ops.segment_min(data, segment_ids)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(inputs, [1], segment_min,
                                                       [1])
       self.assertLess(error, 1e-4)
@@ -211,7 +211,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
     data = array_ops.concat([inputs, inputs], 0)
     segment_ids = constant_op.constant([0, 0], dtype=dtypes.int64)
     segment_max = math_ops.segment_max(data, segment_ids)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(inputs, [1], segment_max,
                                                       [1])
       self.assertLess(error, 1e-4)
@@ -225,18 +225,19 @@ class FloorModGradientTest(test.TestCase):
     ns = constant_op.constant([17.], dtype=dtypes.float32)
     inputs = constant_op.constant([131.], dtype=dtypes.float32)
     floor_mod = math_ops.floormod(inputs, ns)
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(inputs, [1],
                                                       floor_mod, [1])
       self.assertLess(error, 1e-4)
 
 
-class UnsafeDivGradientTest(test.TestCase):
+class DivNoNanGradientTest(test.TestCase):
 
   def testBasicGradient(self):
-    inputs = constant_op.constant(np.arange(-3, 3), dtype=dtypes.float32)
-    outputs = math_ops.unsafe_div(inputs, 1 + math_ops.abs(inputs))
-    with self.test_session():
+    inputs = constant_op.constant(np.arange(-3, 3),
+                                  dtype=dtypes.float32)
+    outputs = math_ops.div_no_nan(inputs, 1 + math_ops.abs(inputs))
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(
           inputs,
           inputs.get_shape().as_list(), outputs,
@@ -244,10 +245,12 @@ class UnsafeDivGradientTest(test.TestCase):
       self.assertLess(error, 1e-4)
 
   def testGradientWithDenominatorIsZero(self):
-    x = constant_op.constant(np.arange(-3, 3), dtype=dtypes.float32)
-    y = array_ops.zeros_like(x, dtype=dtypes.float32)
-    outputs = math_ops.unsafe_div(x, y)
-    with self.test_session():
+    x = constant_op.constant(np.arange(-3, 3),
+                             dtype=dtypes.float32)
+    y = array_ops.zeros_like(x,
+                             dtype=dtypes.float32)
+    outputs = math_ops.div_no_nan(x, y)
+    with self.cached_session():
       dx, dy = gradients.gradients(outputs, [x, y])
       self.assertAllClose(dx.eval(), np.zeros(x.shape.as_list()))
       self.assertAllClose(dy.eval(), np.zeros(y.shape.as_list()))
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 81499bee56..9b0ab00c7a 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Basic arithmetic operators.
 
-See the @{$python/math_ops} guide.
+See the [python/math_ops](python/math_ops) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -618,7 +618,7 @@ def cast(x, dtype, name=None):
   """Casts a tensor to a new type.
 
   The operation casts `x` (in case of `Tensor`) or `x.values`
-  (in case of `SparseTensor`) to `dtype`.
+  (in case of `SparseTensor` or `IndexedSlices`) to `dtype`.
 
   For example:
 
@@ -637,15 +637,16 @@ def cast(x, dtype, name=None):
   behavior of numpy.
 
   Args:
-    x: A `Tensor` or `SparseTensor` of numeric type. It could be
-      `uint8`, `uint16`, `uint32`, `uint64`, `int8`, `int16`, `int32`, `int64`,
-      `float16`, `float32`, `float64`, `complex64`, `complex128`, `bfloat16`.
-    dtype: The destination type. The list of supported dtypes is the same
-      as `x`.
+    x: A `Tensor` or `SparseTensor` or `IndexedSlices` of numeric type. It could
+      be `uint8`, `uint16`, `uint32`, `uint64`, `int8`, `int16`, `int32`,
+      `int64`, `float16`, `float32`, `float64`, `complex64`, `complex128`,
+      `bfloat16`.
+    dtype: The destination type. The list of supported dtypes is the same as
+      `x`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x` and
+    A `Tensor` or `SparseTensor` or `IndexedSlices` with same shape as `x` and
       same type as `dtype`.
 
   Raises:
@@ -659,6 +660,9 @@ def cast(x, dtype, name=None):
     if isinstance(x, sparse_tensor.SparseTensor):
       values_cast = cast(x.values, base_type, name=name)
       x = sparse_tensor.SparseTensor(x.indices, values_cast, x.dense_shape)
+    elif isinstance(x, ops.IndexedSlices):
+      values_cast = cast(x.values, base_type, name=name)
+      x = ops.IndexedSlices(values_cast, x.indices, x.dense_shape)
     else:
       # TODO(josh11b): If x is not already a Tensor, we could return
       # ops.convert_to_tensor(x, dtype=dtype, ...)  here, but that
@@ -711,11 +715,12 @@ def to_float(x, name="ToFloat"):
   """Casts a tensor to type `float32`.
 
   Args:
-    x: A `Tensor` or `SparseTensor`.
+    x: A `Tensor` or `SparseTensor` or `IndexedSlices`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x` with type `float32`.
+    A `Tensor` or `SparseTensor` or `IndexedSlices` with same shape as `x` with
+    type `float32`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `float32`.
@@ -728,11 +733,12 @@ def to_double(x, name="ToDouble"):
   """Casts a tensor to type `float64`.
 
   Args:
-    x: A `Tensor` or `SparseTensor`.
+    x: A `Tensor` or `SparseTensor` or `IndexedSlices`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x` with type `float64`.
+    A `Tensor` or `SparseTensor` or `IndexedSlices` with same shape as `x` with
+    type `float64`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `float64`.
@@ -745,11 +751,12 @@ def to_int32(x, name="ToInt32"):
   """Casts a tensor to type `int32`.
 
   Args:
-    x: A `Tensor` or `SparseTensor`.
+    x: A `Tensor` or `SparseTensor` or `IndexedSlices`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x` with type `int32`.
+    A `Tensor` or `SparseTensor` or `IndexedSlices` with same shape as `x` with
+    type `int32`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `int32`.
@@ -762,11 +769,12 @@ def to_int64(x, name="ToInt64"):
   """Casts a tensor to type `int64`.
 
   Args:
-    x: A `Tensor` or `SparseTensor`.
+    x: A `Tensor` or `SparseTensor` or `IndexedSlices`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x` with type `int64`.
+    A `Tensor` or `SparseTensor` or `IndexedSlices` with same shape as `x` with
+    type `int64`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `int64`.
@@ -779,11 +787,12 @@ def to_bfloat16(x, name="ToBFloat16"):
   """Casts a tensor to type `bfloat16`.
 
   Args:
-    x: A `Tensor` or `SparseTensor`.
+    x: A `Tensor` or `SparseTensor` or `IndexedSlices`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x` with type `bfloat16`.
+    A `Tensor` or `SparseTensor` or `IndexedSlices` with same shape as `x` with
+    type `bfloat16`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `bfloat16`.
@@ -796,11 +805,12 @@ def to_complex64(x, name="ToComplex64"):
   """Casts a tensor to type `complex64`.
 
   Args:
-    x: A `Tensor` or `SparseTensor`.
+    x: A `Tensor` or `SparseTensor` or `IndexedSlices`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x` with type `complex64`.
+    A `Tensor` or `SparseTensor` or `IndexedSlices` with same shape as `x` with
+    type `complex64`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `complex64`.
@@ -813,11 +823,12 @@ def to_complex128(x, name="ToComplex128"):
   """Casts a tensor to type `complex128`.
 
   Args:
-    x: A `Tensor` or `SparseTensor`.
+    x: A `Tensor` or `SparseTensor` or `IndexedSlices`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` or `SparseTensor` with same shape as `x` with type `complex128`.
+    A `Tensor` or `SparseTensor` or `IndexedSlices` with same shape as `x` with
+    type `complex128`.
 
   Raises:
     TypeError: If `x` cannot be cast to the `complex128`.
@@ -1038,29 +1049,27 @@ def div(x, y, name=None):
   return _div_python2(x, y, name)
 
 
-def unsafe_div(x, y, name=None):
+@tf_export("div_no_nan")
+def div_no_nan(x, y, name=None):
   """Computes an unsafe divide which returns 0 if the y is zero.
 
-  Note that the function uses Python 3 division operator semantics.
-
   Args:
-    x: A `Tensor`. Must be one of the following types:
-       `float32`, `float64`, `int16`, `int32`, `int64`.
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`.
     y: A `Tensor` whose dtype is compatible with `x`.
     name: A name for the operation (optional).
   Returns:
     The element-wise value of the x divided by y.
   """
 
-  with ops.name_scope(name, "unsafe_div", [x, y]) as name:
+  with ops.name_scope(name, "div_no_nan", [x, y]) as name:
     x = ops.convert_to_tensor(x, name="x")
     y = ops.convert_to_tensor(y, name="y", dtype=x.dtype.base_dtype)
     x_dtype = x.dtype.base_dtype
     y_dtype = y.dtype.base_dtype
     if x_dtype != y_dtype:
-      raise TypeError(
-          "x and y must have the same dtype, got %r != %r" % (x_dtype, y_dtype))
-    return gen_math_ops.unsafe_div(x, y, name=name)
+      raise TypeError("x and y must have the same dtype, got %r != %r" %
+                      (x_dtype, y_dtype))
+    return gen_math_ops.div_no_nan(x, y, name=name)
 
 
 # TODO(aselle): This should be removed
@@ -2130,7 +2139,8 @@ def add_n(inputs, name=None):
   """Adds all input tensors element-wise.
 
   Args:
-    inputs: A list of `Tensor` objects, each with same shape and type.
+    inputs: A list of `Tensor` or `IndexedSlices` objects, each with same shape
+      and type.
     name: A name for the operation (optional).
 
   Returns:
@@ -2141,17 +2151,21 @@ def add_n(inputs, name=None):
     cannot be inferred.
   """
   if not inputs or not isinstance(inputs, (list, tuple)):
-    raise ValueError("inputs must be a list of at least one Tensor with the "
-                     "same dtype and shape")
+    raise ValueError("inputs must be a list of at least one"
+                     "Tensor/IndexedSlices with the same dtype and shape")
   inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
-  if not all(isinstance(x, ops.Tensor) for x in inputs):
-    raise ValueError("inputs must be a list of at least one Tensor with the "
-                     "same dtype and shape")
+  if not all(isinstance(x, (ops.Tensor, ops.IndexedSlices)) for x in inputs):
+    raise ValueError("inputs must be a list of at least one"
+                     "Tensor/IndexedSlices with the same dtype and shape")
 
   if len(inputs) == 1:
+    if isinstance(inputs[0], ops.IndexedSlices):
+      values = inputs[0].values
+    else:
+      values = inputs[0]
     if name:
-      return array_ops.identity(inputs[0], name=name)
-    return inputs[0]
+      return array_ops.identity(values, name=name)
+    return values
   return gen_math_ops.add_n(inputs, name=name)
 
 
@@ -2559,8 +2573,9 @@ def _unsorted_segment_N(data, segment_ids, num_segments):
 def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r""" Computes the mean along segments of a tensor.
 
-  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
-  of segments.
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
   [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
@@ -2591,8 +2606,9 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
 def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
-  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
-  of segments.
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
   [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
@@ -2627,8 +2643,9 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
                        num_segments=None):
   r"""Computes the sum along sparse segments of a tensor.
 
-  Read @{$math_ops#Segmentation$the section on segmentation} for an explanation
-  of segments.
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  for an explanation of segments.
 
   Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
   dimension, selecting a subset of dimension 0, specified by `indices`.
@@ -2702,8 +2719,9 @@ def sparse_segment_mean(data,
                         num_segments=None):
   r"""Computes the mean along sparse segments of a tensor.
 
-  Read @{$math_ops#Segmentation$the section on segmentation} for an explanation
-  of segments.
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  for an explanation of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
   dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 5fe7bbca11..1b01d1d37f 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -373,7 +373,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
 
   def testFloorModInt(self):
     nums, divs = self.intTestData()
-    with self.test_session():
+    with self.cached_session():
       # TODO(aselle): Change test to use % after switch
       # tf_result = math_ops.floor_mod(nums, divs).eval()
       tf_result = math_ops.floormod(nums, divs).eval()
@@ -382,7 +382,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
 
   def testFloorModFloat(self):
     nums, divs = self.floatTestData()
-    with self.test_session():
+    with self.cached_session():
       tf_result = math_ops.floormod(nums, divs).eval()
       np_result = nums % divs
       self.assertAllEqual(tf_result, np_result)
@@ -393,21 +393,21 @@ class DivAndModTest(test_util.TensorFlowTestCase):
 
   def testTruncateModInt(self):
     nums, divs = self.intTestData()
-    with self.test_session():
+    with self.cached_session():
       tf_result = math_ops.truncatemod(nums, divs).eval()
       np_result = np.fmod(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
   def testTruncateModFloat(self):
     nums, divs = self.floatTestData()
-    with self.test_session():
+    with self.cached_session():
       tf_result = math_ops.truncatemod(nums, divs).eval()
       np_result = np.fmod(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
   def testDivideInt(self):
     nums, divs = self.intTestData()
-    with self.test_session():
+    with self.cached_session():
       tf_result = math_ops.floor_div(nums, divs).eval()
       np_result = nums // divs
       self.assertAllEqual(tf_result, np_result)
@@ -417,29 +417,29 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       # self.assertAllEqual(tf2_result, tf_result)
 
   def testDivideName(self):
-    with self.test_session():
+    with self.cached_session():
       op = math_ops.divide(
           array_ops.constant(3), array_ops.constant(4), name="my_cool_divide")
       self.assertEqual(op.name, "my_cool_divide:0")
 
   def testRealDiv(self):
     nums, divs = self.floatTestData()
-    with self.test_session():
+    with self.cached_session():
       tf_result = math_ops.realdiv(nums, divs).eval()
       np_result = np.divide(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
   def testComplexDiv(self):
     foo = array_ops.constant([1. + 3.j])
-    with self.test_session():
+    with self.cached_session():
       _ = math_ops.divide(foo, 1.).eval()
       _ = math_ops.div(foo, 2.).eval()
 
   def testFloorDivGrad(self):
-    with self.test_session():
+    with self.cached_session():
       a = variables.Variable(2.)
       b = variables.Variable(4.)
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         sess.run(variables.global_variables_initializer())
         c_grad = gradients.gradients(math_ops.divide(a, b), [a, b])
         self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
@@ -451,7 +451,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
 
   def testConsistent(self):
     nums, divs = self.intTestData()
-    with self.test_session():
+    with self.cached_session():
       tf_result = (math_ops.floor_div(nums, divs) * divs + math_ops.floormod(
           nums, divs)).eval()
       tf_nums = array_ops.constant(nums)
@@ -473,18 +473,19 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(tf_result, expanded_nums)
 
 
-class UnsafeDivTest(test_util.TensorFlowTestCase):
+class DivNoNanTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
-    nums = np.arange(-10, 10, .25).reshape(80, 1)
-    divs = np.arange(-3, 3, .25).reshape(1, 24)
+    for dtype in [np.float32, np.float64]:
+      nums = np.arange(-10, 10, .25, dtype=dtype).reshape(80, 1)
+      divs = np.arange(-3, 3, .25, dtype=dtype).reshape(1, 24)
 
-    np_result = np.true_divide(nums, divs)
-    np_result[:, divs[0] == 0] = 0
+      np_result = np.true_divide(nums, divs)
+      np_result[:, divs[0] == 0] = 0
 
-    with self.test_session():
-      tf_result = math_ops.unsafe_div(nums, divs).eval()
-      self.assertAllEqual(tf_result, np_result)
+      with self.cached_session(use_gpu=True):
+        tf_result = math_ops.div_no_nan(nums, divs).eval()
+        self.assertAllEqual(tf_result, np_result)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 3aedeb6acd..763877c2d2 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -34,7 +34,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -57,7 +57,8 @@ def metric_variable(shape, dtype, validate_shape=True, name=None):
       Furthermore, the final answer should be computed once instead of
       in every replica/tower. Both of these are accomplished by
       running the computation of the final result value inside
-      `tf.contrib.distribute.get_tower_context().merge_call(fn)`.
+      `tf.contrib.distribution_strategy_context.get_tower_context(
+      ).merge_call(fn)`.
       Inside the `merge_call()`, ops are only added to the graph once
       and access to a tower-local variable in a computation returns
       the sum across all replicas/towers.
@@ -300,6 +301,40 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
   return total_cm, update_op
 
 
+def _aggregate_across_towers(metrics_collections, metric_value_fn, *args):
+  """Aggregate metric value across towers."""
+  def fn(distribution, *a):
+    """Call `metric_value_fn` in the correct control flow context."""
+    if hasattr(distribution, '_outer_control_flow_context'):
+      # If there was an outer context captured before this method was called,
+      # then we enter that context to create the metric value op. If the
+      # caputred context is `None`, ops.control_dependencies(None) gives the
+      # desired behavior. Else we use `Enter` and `Exit` to enter and exit the
+      # captured context.
+      # This special handling is needed because sometimes the metric is created
+      # inside a while_loop (and perhaps a TPU rewrite context). But we don't
+      # want the value op to be evaluated every step or on the TPU. So we
+      # create it outside so that it can be evaluated at the end on the host,
+      # once the update ops have been evaluted.
+
+      # pylint: disable=protected-access
+      if distribution._outer_control_flow_context is None:
+        with ops.control_dependencies(None):
+          metric_value = metric_value_fn(distribution, *a)
+      else:
+        distribution._outer_control_flow_context.Enter()
+        metric_value = metric_value_fn(distribution, *a)
+        distribution._outer_control_flow_context.Exit()
+        # pylint: enable=protected-access
+    else:
+      metric_value = metric_value_fn(distribution, *a)
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, metric_value)
+    return metric_value
+
+  return distribution_strategy_context.get_tower_context().merge_call(fn, *args)
+
+
 @tf_export('metrics.mean')
 def mean(values,
          weights=None,
@@ -367,14 +402,10 @@ def mean(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    def aggregate_across_towers(_, t, c):
-      mean_t = _safe_div(t, c, 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_t)
-      return mean_t
+    compute_mean = lambda _, t, c: _safe_div(t, c, 'value')
 
-    mean_t = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, total, count)
+    mean_t = _aggregate_across_towers(
+        metrics_collections, compute_mean, total, count)
     update_op = _safe_div(update_total_op, update_count_op, 'update_op')
 
     if updates_collections:
@@ -611,14 +642,8 @@ def _confusion_matrix_at_thresholds(labels,
 
 
 def _aggregate_variable(v, collections):
-
-  def f(distribution, value):
-    value = distribution.read_var(value)
-    if collections:
-      ops.add_to_collections(collections, value)
-    return value
-
-  return distribute_lib.get_tower_context().merge_call(f, v)
+  f = lambda distribution, value: distribution.read_var(value)
+  return _aggregate_across_towers(collections, f, v)
 
 
 @tf_export('metrics.auc')
@@ -806,15 +831,12 @@ def auc(labels,
         raise ValueError('Invalid summation_method: %s' % summation_method)
 
     # sum up the areas of all the trapeziums
-    def aggregate_auc(_, values):
-      auc_value = compute_auc(values['tp'], values['fn'], values['tn'],
-                              values['fp'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, auc_value)
-      return auc_value
-
-    auc_value = distribute_lib.get_tower_context().merge_call(
-        aggregate_auc, values)
+    def compute_auc_value(_, values):
+      return compute_auc(values['tp'], values['fn'], values['tn'], values['fp'],
+                         'value')
+
+    auc_value = _aggregate_across_towers(
+        metrics_collections, compute_auc_value, values)
     update_op = compute_auc(update_ops['tp'], update_ops['fn'],
                             update_ops['tn'], update_ops['fp'], 'update_op')
 
@@ -1045,16 +1067,14 @@ def mean_per_class_accuracy(labels,
     update_total_op = state_ops.scatter_add(total, labels, ones)
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
 
-    def aggregate_mean_accuracy(_, count, total):
+    def compute_mean_accuracy(_, count, total):
       per_class_accuracy = _safe_div(count, total, None)
       mean_accuracy_v = math_ops.reduce_mean(
           per_class_accuracy, name='mean_accuracy')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_accuracy_v)
       return mean_accuracy_v
 
-    mean_accuracy_v = distribute_lib.get_tower_context().merge_call(
-        aggregate_mean_accuracy, count, total)
+    mean_accuracy_v = _aggregate_across_towers(
+        metrics_collections, compute_mean_accuracy, count, total)
 
     update_op = _safe_div(update_count_op, update_total_op, name='update_op')
     if updates_collections:
@@ -1127,7 +1147,7 @@ def mean_iou(labels,
     total_cm, update_op = _streaming_confusion_matrix(labels, predictions,
                                                       num_classes, weights)
 
-    def compute_mean_iou(total_cm, name):
+    def compute_mean_iou(_, total_cm):
       """Compute the mean intersection-over-union via the confusion matrix."""
       sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0))
       sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1))
@@ -1151,17 +1171,12 @@ def mean_iou(labels,
       # If the number of valid entries is 0 (no classes) we return 0.
       result = array_ops.where(
           math_ops.greater(num_valid_entries, 0),
-          math_ops.reduce_sum(iou, name=name) / num_valid_entries, 0)
+          math_ops.reduce_sum(iou, name='mean_iou') / num_valid_entries, 0)
       return result
 
-    def mean_iou_across_towers(_, v):
-      mean_iou_v = compute_mean_iou(v, 'mean_iou')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_iou_v)
-      return mean_iou_v
-
-    mean_iou_v = distribute_lib.get_tower_context().merge_call(
-        mean_iou_across_towers, total_cm)
+    # TODO(priyag): Use outside_compilation if in TPU context.
+    mean_iou_v = _aggregate_across_towers(
+        metrics_collections, compute_mean_iou, total_cm)
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
@@ -1370,14 +1385,10 @@ def mean_tensor(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    def aggregate_across_towers(_, t, c):
-      mean_t = _safe_div(t, c, 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_t)
-      return mean_t
+    compute_mean = lambda _, t, c: _safe_div(t, c, 'value')
 
-    mean_t = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, total, count)
+    mean_t = _aggregate_across_towers(
+        metrics_collections, compute_mean, total, count)
 
     update_op = _safe_div(update_total_op, update_count_op, 'update_op')
     if updates_collections:
@@ -2003,13 +2014,10 @@ def precision(labels,
           math_ops.greater(tp + fp, 0), math_ops.div(tp, tp + fp), 0, name)
 
     def once_across_towers(_, true_p, false_p):
-      p = compute_precision(true_p, false_p, 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, p)
-      return p
+      return compute_precision(true_p, false_p, 'value')
 
-    p = distribute_lib.get_tower_context().merge_call(
-        once_across_towers, true_p, false_p)
+    p = _aggregate_across_towers(metrics_collections, once_across_towers,
+                                 true_p, false_p)
 
     update_op = compute_precision(true_positives_update_op,
                                   false_positives_update_op, 'update_op')
@@ -2087,13 +2095,10 @@ def precision_at_thresholds(labels,
       return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
 
     def precision_across_towers(_, values):
-      prec = compute_precision(values['tp'], values['fp'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, prec)
-      return prec
+      return compute_precision(values['tp'], values['fp'], 'value')
 
-    prec = distribute_lib.get_tower_context().merge_call(
-        precision_across_towers, values)
+    prec = _aggregate_across_towers(
+        metrics_collections, precision_across_towers, values)
 
     update_op = compute_precision(update_ops['tp'], update_ops['fp'],
                                   'update_op')
@@ -2183,13 +2188,10 @@ def recall(labels,
           math_ops.div(true_p, true_p + false_n), 0, name)
 
     def once_across_towers(_, true_p, false_n):
-      rec = compute_recall(true_p, false_n, 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, rec)
-      return rec
+      return compute_recall(true_p, false_n, 'value')
 
-    rec = distribute_lib.get_tower_context().merge_call(
-        once_across_towers, true_p, false_n)
+    rec = _aggregate_across_towers(
+        metrics_collections, once_across_towers, true_p, false_n)
 
     update_op = compute_recall(true_positives_update_op,
                                false_negatives_update_op, 'update_op')
@@ -2621,14 +2623,11 @@ def recall_at_top_k(labels,
         class_id=class_id,
         weights=weights)
 
-    def aggregate_across_towers(_, tp, fn):
-      metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope)
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, metric)
-      return metric
+    def compute_recall(_, tp, fn):
+      return math_ops.div(tp, math_ops.add(tp, fn), name=scope)
 
-    metric = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, tp, fn)
+    metric = _aggregate_across_towers(
+        metrics_collections, compute_recall, tp, fn)
 
     update = math_ops.div(
         tp_update, math_ops.add(tp_update, fn_update), name='update')
@@ -2703,13 +2702,10 @@ def recall_at_thresholds(labels,
       return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
 
     def recall_across_towers(_, values):
-      rec = compute_recall(values['tp'], values['fn'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, rec)
-      return rec
+      return compute_recall(values['tp'], values['fn'], 'value')
 
-    rec = distribute_lib.get_tower_context().merge_call(
-        recall_across_towers, values)
+    rec = _aggregate_across_towers(
+        metrics_collections, recall_across_towers, values)
 
     update_op = compute_recall(update_ops['tp'], update_ops['fn'], 'update_op')
     if updates_collections:
@@ -2777,14 +2773,9 @@ def root_mean_squared_error(labels,
   mse, update_mse_op = mean_squared_error(labels, predictions, weights, None,
                                           None, name or
                                           'root_mean_squared_error')
-  def once_across_towers(_, mse):
-    rmse = math_ops.sqrt(mse)
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, rmse)
-    return rmse
 
-  rmse = distribute_lib.get_tower_context().merge_call(
-      once_across_towers, mse)
+  once_across_towers = lambda _, mse: math_ops.sqrt(mse)
+  rmse = _aggregate_across_towers(metrics_collections, once_across_towers, mse)
 
   update_rmse_op = math_ops.sqrt(update_mse_op)
   if updates_collections:
@@ -2879,15 +2870,12 @@ def sensitivity_at_specificity(labels,
       return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + kepsilon,
                           name)
 
-    def aggregate_across_towers(_, values):
-      sensitivity = compute_sensitivity_at_specificity(
+    def sensitivity_across_towers(_, values):
+      return compute_sensitivity_at_specificity(
           values['tp'], values['tn'], values['fp'], values['fn'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, sensitivity)
-      return sensitivity
 
-    sensitivity = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, values)
+    sensitivity = _aggregate_across_towers(
+        metrics_collections, sensitivity_across_towers, values)
 
     update_op = compute_sensitivity_at_specificity(
         update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
@@ -3156,14 +3144,11 @@ def _streaming_sparse_average_precision_at_top_k(labels,
       total_update = state_ops.assign_add(total_var, batch_total, name='update')
 
     # Divide total by max to get mean, for both vars and the update ops.
-    def aggregate_across_towers(_, total_var, max_var):
-      mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, mean_average_precision)
-      return mean_average_precision
+    def precision_across_towers(_, total_var, max_var):
+      return _safe_scalar_div(total_var, max_var, name='mean')
 
-    mean_average_precision = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, total_var, max_var)
+    mean_average_precision = _aggregate_across_towers(
+        metrics_collections, precision_across_towers, total_var, max_var)
 
     update = _safe_scalar_div(total_update, max_update, name=scope)
     if updates_collections:
@@ -3442,14 +3427,11 @@ def precision_at_top_k(labels,
         class_id=class_id,
         weights=weights)
 
-    def aggregate_across_towers(_, tp, fp):
-      metric = math_ops.div(tp, math_ops.add(tp, fp), name=scope)
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, metric)
-      return metric
+    def precision_across_towers(_, tp, fp):
+      return math_ops.div(tp, math_ops.add(tp, fp), name=scope)
 
-    metric = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, tp, fp)
+    metric = _aggregate_across_towers(
+        metrics_collections, precision_across_towers, tp, fp)
 
     update = math_ops.div(
         tp_update, math_ops.add(tp_update, fp_update), name='update')
@@ -3680,15 +3662,12 @@ def specificity_at_sensitivity(labels,
       return math_ops.div(tn[tf_index], tn[tf_index] + fp[tf_index] + kepsilon,
                           name)
 
-    def aggregate_across_towers(_, values):
-      specificity = compute_specificity_at_sensitivity(
+    def specificity_across_towers(_, values):
+      return compute_specificity_at_sensitivity(
           values['tp'], values['tn'], values['fp'], values['fn'], 'value')
-      if metrics_collections:
-        ops.add_to_collections(metrics_collections, specificity)
-      return specificity
 
-    specificity = distribute_lib.get_tower_context().merge_call(
-        aggregate_across_towers, values)
+    specificity = _aggregate_across_towers(
+        metrics_collections, specificity_across_towers, values)
 
     update_op = compute_specificity_at_sensitivity(
         update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index 339684122e..4b73fc830e 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -16,7 +16,7 @@
 # pylint: disable=unused-import,g-bad-import-order
 """Neural network support.
 
-See the @{$python/nn} guide.
+See the [Neural network](https://tensorflow.org/api_guides/python/nn) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index 7d6dd3fb02..a7467aa943 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -129,7 +129,7 @@ class BatchNormalizationTest(test.TestCase):
     v_val = np.random.random_sample(param_shape).astype(np.float64)
     beta_val = np.random.random_sample(param_shape).astype(np.float64)
     gamma_val = np.random.random_sample(param_shape).astype(np.float64)
-    with self.test_session():
+    with self.cached_session():
       x = constant_op.constant(x_val, name="x")
       m = constant_op.constant(m_val, name="m")
       v = constant_op.constant(v_val, name="v")
@@ -455,7 +455,7 @@ class MomentsTest(test.TestCase):
     return nn_impl.moments(x, axes, keep_dims=keep_dims)
 
   def RunMomentTestWithDynamicShape(self, shape, axes, keep_dims, dtype):
-    with self.test_session():
+    with self.cached_session():
       # shape = [batch, width, height, depth]
       assert len(shape) == 4
 
@@ -482,7 +482,7 @@ class MomentsTest(test.TestCase):
           expected_variance, var.eval(feed_dict={x: x_numpy}))
 
   def RunMomentTest(self, shape, axes, keep_dims, dtype):
-    with self.test_session():
+    with self.cached_session():
       # shape = [batch, width, height, depth]
       assert len(shape) == 4
 
@@ -547,7 +547,7 @@ class MomentsTest(test.TestCase):
             dtype=dtype)
 
   def _testGlobalGradient(self, from_y="mean"):
-    with self.test_session():
+    with self.cached_session():
       x_shape = [3, 5, 4, 2]
       x_val = np.random.random_sample(x_shape).astype(np.float64)
       x = constant_op.constant(x_val)
@@ -644,7 +644,7 @@ class WeightedMomentsTest(MomentsTest):
                             keep_dims,
                             dtype,
                             dynshapes=False):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       x_numpy = np.random.normal(size=shape).astype(np.float32)
       weights_numpy = np.absolute(  # weights must be positive
           np.random.normal(
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index df23ac55ce..a648653909 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -471,7 +471,9 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
     softmax = nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
-        math_ops.matmul(grad_grad[:, None, :], softmax[:, :, None]), axis=1)) *
+        math_ops.matmul(array_ops.expand_dims(grad_grad, 1),
+                        array_ops.expand_dims(softmax, 2)),
+        axis=1)) *
              softmax)
 
   return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index 49d54beb20..8065df4b16 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -37,7 +37,7 @@ class Relu6OpTest(test.TestCase):
     x_init_value = np.array([[-3.5, -1.5, 2, 4], [4.5, 7.5, 8.5, 11]])
     r = nn_ops.relu6(inputs)
     r_g = gradients_impl.gradients(r, inputs)[0]
-    with self.test_session():
+    with self.cached_session():
       error = gradient_checker.compute_gradient_error(
           inputs,
           inputs.get_shape().as_list(),
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 51f812b395..2a1919e66f 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -1210,7 +1210,9 @@ def nce_loss(weights,
         num_true]`. The target classes.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of negative classes to randomly sample
+        per batch. This single sample of negative classes is evaluated for each
+        element in the batch.
     num_classes: An `int`. The number of possible classes.
     num_true: An `int`.  The number of target classes per training example.
     sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 6fd1273687..edc6e04b48 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -698,7 +698,7 @@ def convolution(
   `padded_input` is obtained by zero padding the input using an effective
   spatial filter shape of `(spatial_filter_shape-1) * dilation_rate + 1` and
   output striding `strides` as described in the
-  @{$python/nn#Convolution$comment here}.
+  [comment here](https://tensorflow.org/api_guides/python/nn#Convolution).
 
   In the case that `data_format` does start with `"NC"`, the `input` and output
   (but not the `filter`) are simply transposed as follows:
@@ -1836,8 +1836,9 @@ def softmax_cross_entropy_with_logits_v2(
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` of the same shape as `labels` and of the same type as `logits`
-    with the softmax cross entropy loss.
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
@@ -1962,8 +1963,9 @@ def softmax_cross_entropy_with_logits(
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` of the same shape as `labels` and of the same type as `logits`
-    with the softmax cross entropy loss.
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
   """
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index ce0db6b264..2fabb2e966 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -53,7 +53,7 @@ class ZeroFractionTest(test_lib.TestCase):
     x_shape = [5, 17]
     x_np = np.random.randint(0, 2, size=x_shape).astype(np.float32)
     y_np = self._ZeroFraction(x_np)
-    with self.test_session():
+    with self.cached_session():
       x_tf = constant_op.constant(x_np)
       x_tf.set_shape(x_shape)
       y_tf = nn_impl.zero_fraction(x_tf)
@@ -62,7 +62,7 @@ class ZeroFractionTest(test_lib.TestCase):
     self.assertAllClose(y_tf_np, y_np, eps)
 
   def testZeroFractionEmpty(self):
-    with self.test_session():
+    with self.cached_session():
       x = np.zeros(0)
       y = nn_impl.zero_fraction(x).eval()
       self.assertTrue(np.isnan(y))
@@ -106,7 +106,7 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
   @parameterized.parameters(((5, 10),), ((2, 3, 4),))
   def testGradient(self, x_shape):
     x_np = np.random.randn(*x_shape).astype(np.float64)
-    with self.test_session():
+    with self.cached_session():
       x_tf = constant_op.constant(x_np)
       y_tf = nn_ops.softmax(x_tf)
       err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
@@ -143,7 +143,7 @@ class LogPoissonLossTest(test_lib.TestCase):
     x_shape = [5, 10]
     x_np = np.random.randn(*x_shape).astype(np.float64)
     z_np = np.random.randint(0, 5, size=x_shape).astype(np.float64)
-    with self.test_session():
+    with self.cached_session():
       x_tf = constant_op.constant(x_np)
       y_tf = nn_impl.log_poisson_loss(z_np, x_tf, compute_full_loss=False)
       y_tf_stirling = nn_impl.log_poisson_loss(
@@ -191,7 +191,7 @@ class LogSoftmaxTest(test_lib.TestCase, parameterized.TestCase):
   @parameterized.parameters(((5, 10),), ((2, 3, 4),))
   def testGradient(self, x_shape):
     x_np = np.random.randn(*x_shape).astype(np.float64)
-    with self.test_session():
+    with self.cached_session():
       x_tf = constant_op.constant(x_np)
       y_tf = nn_ops.log_softmax(x_tf)
       err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
@@ -215,7 +215,7 @@ class L2LossTest(test_lib.TestCase):
     x_shape = [20, 7, 3]
     np.random.seed(1)  # Make it reproducible.
     x_val = np.random.random_sample(x_shape).astype(np.float64)
-    with self.test_session():
+    with self.cached_session():
       x = constant_op.constant(x_val, name="x")
       output = nn_ops.l2_loss(x)
       err = gradient_checker.compute_gradient_error(x, x_shape, output, [1])
@@ -263,7 +263,7 @@ class L2NormalizeTest(test_lib.TestCase):
     np.random.seed(1)
     x_np = np.random.random_sample(x_shape).astype(np.float64)
     for dim in range(len(x_shape)):
-      with self.test_session():
+      with self.cached_session():
         x_tf = constant_op.constant(x_np, name="x")
         y_tf = nn_impl.l2_normalize(x_tf, dim)
         err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
@@ -282,7 +282,7 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.test_session():
+      with self.cached_session():
         t = constant_op.constant(
             1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
         dropout = nn_ops.dropout(t, keep_prob)
@@ -310,7 +310,7 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 3
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.test_session():
+      with self.cached_session():
         t = constant_op.constant(
             1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
         dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
@@ -335,7 +335,7 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.test_session():
+      with self.cached_session():
         t = constant_op.constant(
             1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
         dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
@@ -355,7 +355,7 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.test_session():
+      with self.cached_session():
         t = constant_op.constant(
             1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
         keep_prob_placeholder = array_ops.placeholder(dtypes.float32)
@@ -389,7 +389,7 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 3
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.test_session():
+      with self.cached_session():
         t = constant_op.constant(
             1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
         # Set noise_shape=[None, 1] which means [x_dim, 1].
@@ -541,7 +541,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
           "b",
           partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
           initializer=constant_op.constant(biases))
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         variables.global_variables_initializer().run()
         return sess.run([list(sharded_weights), list(sharded_biases)])
 
@@ -549,7 +549,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for num_true in range(1, 5):
         labels = np.random.randint(
             low=0, high=num_classes, size=batch_size * num_true)
@@ -585,7 +585,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for num_true in range(1, 5):
         labels = np.random.randint(
             low=0, high=num_classes, size=batch_size * num_true)
@@ -622,7 +622,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     num_classes = 5
     batch_size = 3
     sampled = [1, 0, 2, 3]
-    with self.test_session():
+    with self.cached_session():
       for num_true in range(1, 5):
         labels = np.random.randint(
             low=0, high=num_classes, size=batch_size * num_true)
@@ -666,7 +666,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for num_true in range(1, 5):
         labels = np.random.randint(
             low=0, high=num_classes, size=batch_size * num_true)
@@ -702,7 +702,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       for num_true in range(1, 5):
         labels = np.random.randint(
             low=0, high=num_classes, size=batch_size * num_true)
@@ -762,7 +762,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_nce_loss = np.sum(
         _SigmoidCrossEntropyWithLogits(exp_logits, exp_labels), 1)
 
-    with self.test_session():
+    with self.cached_session():
       got_nce_loss = nn_impl.nce_loss(
           weights=constant_op.constant(weights),
           biases=constant_op.constant(biases),
@@ -819,7 +819,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    with self.test_session():
+    with self.cached_session():
       got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
           weights=constant_op.constant(weights),
           biases=constant_op.constant(biases),
@@ -880,7 +880,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    with self.test_session():
+    with self.cached_session():
       true_exp_bf16 = np.full(
           [batch_size, 1], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
       sampled_exp_bf16 = np.full(
@@ -911,7 +911,7 @@ class CReluTest(test_lib.TestCase):
     np.random.seed(1)  # Make it reproducible.
     x = np.random.randn(3, 4).astype(np.float32)
     y = np.concatenate([x * (x > 0), -x * (x < 0)], axis=1)
-    with self.test_session():
+    with self.cached_session():
       z = nn_ops.crelu(constant_op.constant(x)).eval()
       self.assertAllClose(y, z, 1e-4)
 
@@ -922,7 +922,7 @@ class ReluTest(test_lib.TestCase):
     np.random.seed(1)  # Make it reproducible.
     x = np.random.randn(3, 4).astype(np.float32)
     y = np.maximum(x, 0.0)
-    with self.test_session():
+    with self.cached_session():
       z = nn_ops.relu(constant_op.constant(x)).eval()
       self.assertAllEqual(y, z)
 
@@ -930,7 +930,7 @@ class ReluTest(test_lib.TestCase):
     # Test that relu(nan) = nan for various sizes.
     for i in range(18):
       x = np.zeros(i) + np.nan
-      with self.test_session():
+      with self.cached_session():
         z = nn_ops.relu(constant_op.constant(x)).eval()
         self.assertTrue(np.isnan(z).all())
 
@@ -947,7 +947,7 @@ class LeakyReluTest(test_lib.TestCase):
 
     outputs = nn_ops.leaky_relu(inputs)
     self.assertEquals(inputs.shape, outputs.shape)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       inputs, outputs = sess.run([inputs, outputs])
     self.assertGreaterEqual(outputs.min(), 0.0)
     self.assertLessEqual(outputs.max(), 1.0)
@@ -957,7 +957,7 @@ class LeakyReluTest(test_lib.TestCase):
     for dtype in [np.int32, np.int64, np.float16, np.float32, np.float64]:
       np_values = np.array([-2, -1, 0, 1, 2], dtype=dtype)
       outputs = nn_ops.leaky_relu(constant_op.constant(np_values))
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         outputs = sess.run(outputs)
       tol = 2e-3 if dtype == np.float16 else 1e-6
       self.assertAllClose(
@@ -984,7 +984,7 @@ class SwishTest(test_lib.TestCase):
     tf_values = constant_op.constant(np_values)
     actual_tf_outputs = nn_impl.swish(tf_values)
     expected_tf_outputs = tf_values * math_ops.sigmoid(tf_values)
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_outputs, expected_outputs = sess.run(
           [actual_tf_outputs, expected_tf_outputs])
     self.assertAllClose(actual_outputs, expected_outputs)
@@ -995,7 +995,7 @@ class SwishTest(test_lib.TestCase):
     input_values = np.random.randn(*shape) * sigma
     x_tf = constant_op.constant(input_values)
     y_tf = nn_impl.swish(x_tf)
-    with self.test_session():
+    with self.cached_session():
       err = gradient_checker.compute_gradient_error(x_tf, shape, y_tf, shape)
     self.assertLess(err, 1e-4)
 
@@ -1016,7 +1016,7 @@ class MomentsTest(test_lib.TestCase):
           expected_var = np.var(
               input_values, axis=moments_axes, keepdims=keep_dims)
           with ops.Graph().as_default() as g:
-            with self.test_session(graph=g) as sess:
+            with self.session(graph=g) as sess:
               inputs = constant_op.constant(
                   input_values, shape=input_shape, dtype=dtypes.float32)
               mean, variance = nn_impl.moments(
diff --git a/tensorflow/python/ops/nn_xent_test.py b/tensorflow/python/ops/nn_xent_test.py
index 90f4b40770..54a0e26bfb 100644
--- a/tensorflow/python/ops/nn_xent_test.py
+++ b/tensorflow/python/ops/nn_xent_test.py
@@ -54,7 +54,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
     return logits, targets, losses
 
   def testConstructionNamed(self):
-    with self.test_session():
+    with self.cached_session():
       logits, targets, _ = self._Inputs()
       loss = nn_impl.sigmoid_cross_entropy_with_logits(
           labels=targets, logits=logits, name="mylogistic")
@@ -84,7 +84,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
 
   def testGradient(self):
     sizes = [4, 2]
-    with self.test_session():
+    with self.cached_session():
       logits, targets, _ = self._Inputs(sizes=sizes)
       loss = nn_impl.sigmoid_cross_entropy_with_logits(
           labels=targets, logits=logits)
@@ -93,7 +93,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
     self.assertLess(err, 1e-7)
 
   def testGradientAtZero(self):
-    with self.test_session():
+    with self.cached_session():
       logits = constant_op.constant([0.0, 0.0], dtype=dtypes.float64)
       targets = constant_op.constant([0.0, 1.0], dtype=dtypes.float64)
       loss = nn_impl.sigmoid_cross_entropy_with_logits(
@@ -130,7 +130,7 @@ class WeightedCrossEntropyTest(test.TestCase):
     return logits, targets, q, losses
 
   def testConstructionNamed(self):
-    with self.test_session():
+    with self.cached_session():
       logits, targets, pos_weight, _ = self._Inputs()
       loss = nn_impl.weighted_cross_entropy_with_logits(
           targets=targets, logits=logits, pos_weight=pos_weight, name="mybce")
@@ -159,7 +159,7 @@ class WeightedCrossEntropyTest(test.TestCase):
 
   def testGradient(self):
     sizes = [4, 2]
-    with self.test_session():
+    with self.cached_session():
       logits, targets, pos_weight, _ = self._Inputs(sizes=sizes)
       loss = nn_impl.weighted_cross_entropy_with_logits(
           targets=targets, logits=logits, pos_weight=pos_weight)
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops.py b/tensorflow/python/ops/parallel_for/control_flow_ops.py
index ccf2eb8214..ead7ae5478 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops.py
@@ -46,6 +46,7 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
   """
 
   flat_loop_fn_dtypes = nest.flatten(loop_fn_dtypes)
+  is_none_list = []
 
   def while_body(i, *ta_list):
     """Body of while loop."""
@@ -56,10 +57,13 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
           "actual outputs, %d, from loop_fn" % (len(flat_loop_fn_dtypes),
                                                 len(fn_output)))
     outputs = []
+    del is_none_list[:]
+    is_none_list.extend([x is None for x in fn_output])
     for out, ta in zip(fn_output, ta_list):
       # TODO(agarwal): support returning Operation objects from loop_fn.
-      assert isinstance(out, ops.Tensor)
-      outputs.append(ta.write(i, array_ops.expand_dims(out, 0)))
+      if out is not None:
+        ta = ta.write(i, array_ops.expand_dims(out, 0))
+      outputs.append(ta)
     return tuple([i + 1] + outputs)
 
   ta_list = control_flow_ops.while_loop(
@@ -69,7 +73,10 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
       ])[1:]
 
   # TODO(rachelim): enable this for sparse tensors
-  return nest.pack_sequence_as(loop_fn_dtypes, [ta.concat() for ta in ta_list])
+
+  output = [None if is_none else ta.concat()
+            for ta, is_none in zip(ta_list, is_none_list)]
+  return nest.pack_sequence_as(loop_fn_dtypes, output)
 
 
 def pfor(loop_fn, iters):
diff --git a/tensorflow/python/ops/parallel_for/gradients.py b/tensorflow/python/ops/parallel_for/gradients.py
index ee3d5c9b86..460de0a97f 100644
--- a/tensorflow/python/ops/parallel_for/gradients.py
+++ b/tensorflow/python/ops/parallel_for/gradients.py
@@ -61,9 +61,10 @@ def jacobian(output, inputs, use_pfor=True):
         loop_fn, [output.dtype] * len(flat_inputs), output_size)
 
   for i, out in enumerate(pfor_outputs):
-    new_shape = array_ops.concat(
-        [output_shape, array_ops.shape(out)[1:]], axis=0)
-    out = array_ops.reshape(out, new_shape)
+    if out is not None:
+      new_shape = array_ops.concat(
+          [output_shape, array_ops.shape(out)[1:]], axis=0)
+      out = array_ops.reshape(out, new_shape)
     pfor_outputs[i] = out
 
   return nest.pack_sequence_as(inputs, pfor_outputs)
@@ -119,6 +120,8 @@ def batch_jacobian(output, inp, use_pfor=True):
   else:
     pfor_output = control_flow_ops.for_loop(loop_fn, output.dtype,
                                             output_row_size)
+  if pfor_output is None:
+    return None
   pfor_output = array_ops.reshape(pfor_output,
                                   [output_row_size, batch_size, -1])
   output = array_ops.transpose(pfor_output, [1, 0, 2])
diff --git a/tensorflow/python/ops/parallel_for/gradients_test.py b/tensorflow/python/ops/parallel_for/gradients_test.py
index 3a6d9149ad..f9cf16f6a4 100644
--- a/tensorflow/python/ops/parallel_for/gradients_test.py
+++ b/tensorflow/python/ops/parallel_for/gradients_test.py
@@ -333,6 +333,13 @@ class GradientsTest(test.TestCase):
     for i in range(n):
       self.assertAllClose(outputs[i], outputs[i + n], rtol=rtol, atol=atol)
 
+  def test_no_path(self):
+    for grad_func in [gradients.jacobian, gradients.batch_jacobian]:
+      for use_pfor in [True, False]:
+        x = constant_op.constant([[1.0]])
+        y = constant_op.constant([[2.0]])
+        self.assertIsNone(grad_func(y, x, use_pfor=use_pfor))
+
   def test_jacobian_fixed_shape(self):
     x = random_ops.random_uniform([2, 2])
     y = math_ops.matmul(x, x, transpose_a=True)
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index 2e4b2fd64e..3c914f6ff6 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -1070,6 +1070,8 @@ class PFor(object):
       If y does not need to be converted, it returns y as is. Else it returns
       the "converted value" corresponding to y.
     """
+    if y is None:
+      return None
     if isinstance(y, sparse_tensor.SparseTensor):
       return self._convert_sparse(y)
     output = self._convert_helper(y)
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index d8d9af545f..6041e2a0c5 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -629,76 +629,12 @@ def _parse_example_raw(serialized,
   Returns:
     A `dict` mapping keys to `Tensor`s and `SparseTensor`s.
 
-  Raises:
-    ValueError: If sparse and dense key sets intersect, or input lengths do not
-      match up.
   """
   with ops.name_scope(name, "ParseExample", [serialized, names]):
-    names = [] if names is None else names
-    dense_defaults = collections.OrderedDict(
-    ) if dense_defaults is None else dense_defaults
-    sparse_keys = [] if sparse_keys is None else sparse_keys
-    sparse_types = [] if sparse_types is None else sparse_types
-    dense_keys = [] if dense_keys is None else dense_keys
-    dense_types = [] if dense_types is None else dense_types
-    dense_shapes = (
-        [[]] * len(dense_keys) if dense_shapes is None else dense_shapes)
-
-    num_dense = len(dense_keys)
-    num_sparse = len(sparse_keys)
-
-    if len(dense_shapes) != num_dense:
-      raise ValueError("len(dense_shapes) != len(dense_keys): %d vs. %d"
-                       % (len(dense_shapes), num_dense))
-    if len(dense_types) != num_dense:
-      raise ValueError("len(dense_types) != len(num_dense): %d vs. %d"
-                       % (len(dense_types), num_dense))
-    if len(sparse_types) != num_sparse:
-      raise ValueError("len(sparse_types) != len(sparse_keys): %d vs. %d"
-                       % (len(sparse_types), num_sparse))
-    if num_dense + num_sparse == 0:
-      raise ValueError("Must provide at least one sparse key or dense key")
-    if not set(dense_keys).isdisjoint(set(sparse_keys)):
-      raise ValueError(
-          "Dense and sparse keys must not intersect; intersection: %s" %
-          set(dense_keys).intersection(set(sparse_keys)))
-
-    # Convert dense_shapes to TensorShape object.
-    dense_shapes = [tensor_shape.as_shape(shape) for shape in dense_shapes]
-
-    dense_defaults_vec = []
-    for i, key in enumerate(dense_keys):
-      default_value = dense_defaults.get(key)
-      dense_shape = dense_shapes[i]
-      if (dense_shape.ndims is not None and dense_shape.ndims > 0 and
-          dense_shape[0].value is None):
-        # Variable stride dense shape, the default value should be a
-        # scalar padding value
-        if default_value is None:
-          default_value = ops.convert_to_tensor(
-              "" if dense_types[i] == dtypes.string else 0,
-              dtype=dense_types[i])
-        else:
-          # Reshape to a scalar to ensure user gets an error if they
-          # provide a tensor that's not intended to be a padding value
-          # (0 or 2+ elements).
-          key_name = "padding_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
-          default_value = ops.convert_to_tensor(
-              default_value, dtype=dense_types[i], name=key_name)
-          default_value = array_ops.reshape(default_value, [])
-      else:
-        if default_value is None:
-          default_value = constant_op.constant([], dtype=dense_types[i])
-        elif not isinstance(default_value, ops.Tensor):
-          key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
-          default_value = ops.convert_to_tensor(
-              default_value, dtype=dense_types[i], name=key_name)
-          default_value = array_ops.reshape(default_value, dense_shape)
-
-      dense_defaults_vec.append(default_value)
-
-    # Finally, convert dense_shapes to TensorShapeProto
-    dense_shapes = [shape.as_proto() for shape in dense_shapes]
+    (names, dense_defaults_vec, sparse_keys, sparse_types,
+     dense_keys, dense_shapes, _) = _process_raw_parameters(
+         names, dense_defaults, sparse_keys, sparse_types, dense_keys,
+         dense_types, dense_shapes)
 
     outputs = gen_parsing_ops.parse_example(
         serialized=serialized,
@@ -719,6 +655,112 @@ def _parse_example_raw(serialized,
     return dict(zip(sparse_keys + dense_keys, sparse_tensors + dense_values))
 
 
+def _process_raw_parameters(names, dense_defaults, sparse_keys, sparse_types,
+                            dense_keys, dense_types, dense_shapes):
+  """Process raw parameters to params used by `gen_parsing_ops`.
+
+  Args:
+    names: A vector (1-D Tensor) of strings (optional), the names of
+      the serialized protos.
+    dense_defaults: A dict mapping string keys to `Tensor`s.
+      The keys of the dict must match the dense_keys of the feature.
+    sparse_keys: A list of string keys in the examples' features.
+      The results for these keys will be returned as `SparseTensor` objects.
+    sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
+      Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+      and `tf.string` (`BytesList`) are supported.
+    dense_keys: A list of string keys in the examples' features.
+      The results for these keys will be returned as `Tensor`s
+    dense_types: A list of DTypes of the same length as `dense_keys`.
+      Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+      and `tf.string` (`BytesList`) are supported.
+    dense_shapes: A list of tuples with the same length as `dense_keys`.
+      The shape of the data for each dense feature referenced by `dense_keys`.
+      Required for any input tensors identified by `dense_keys`.  Must be
+      either fully defined, or may contain an unknown first dimension.
+      An unknown first dimension means the feature is treated as having
+      a variable number of blocks, and the output shape along this dimension
+      is considered unknown at graph build time.  Padding is applied for
+      minibatch elements smaller than the maximum number of blocks for the
+      given feature along this dimension.
+
+  Returns:
+    Tuple of `names`, `dense_defaults_vec`, `sparse_keys`, `sparse_types`,
+    `dense_keys`, `dense_shapes`.
+
+  Raises:
+    ValueError: If sparse and dense key sets intersect, or input lengths do not
+      match up.
+  """
+  names = [] if names is None else names
+  dense_defaults = collections.OrderedDict(
+  ) if dense_defaults is None else dense_defaults
+  sparse_keys = [] if sparse_keys is None else sparse_keys
+  sparse_types = [] if sparse_types is None else sparse_types
+  dense_keys = [] if dense_keys is None else dense_keys
+  dense_types = [] if dense_types is None else dense_types
+  dense_shapes = ([[]] * len(dense_keys)
+                  if dense_shapes is None else dense_shapes)
+
+  num_dense = len(dense_keys)
+  num_sparse = len(sparse_keys)
+
+  if len(dense_shapes) != num_dense:
+    raise ValueError("len(dense_shapes) != len(dense_keys): %d vs. %d" %
+                     (len(dense_shapes), num_dense))
+  if len(dense_types) != num_dense:
+    raise ValueError("len(dense_types) != len(num_dense): %d vs. %d" %
+                     (len(dense_types), num_dense))
+  if len(sparse_types) != num_sparse:
+    raise ValueError("len(sparse_types) != len(sparse_keys): %d vs. %d" %
+                     (len(sparse_types), num_sparse))
+  if num_dense + num_sparse == 0:
+    raise ValueError("Must provide at least one sparse key or dense key")
+  if not set(dense_keys).isdisjoint(set(sparse_keys)):
+    raise ValueError(
+        "Dense and sparse keys must not intersect; intersection: %s" %
+        set(dense_keys).intersection(set(sparse_keys)))
+
+  # Convert dense_shapes to TensorShape object.
+  dense_shapes = [tensor_shape.as_shape(shape) for shape in dense_shapes]
+
+  dense_defaults_vec = []
+  for i, key in enumerate(dense_keys):
+    default_value = dense_defaults.get(key)
+    dense_shape = dense_shapes[i]
+    if (dense_shape.ndims is not None and dense_shape.ndims > 0 and
+        dense_shape[0].value is None):
+      # Variable stride dense shape, the default value should be a
+      # scalar padding value
+      if default_value is None:
+        default_value = ops.convert_to_tensor(
+            "" if dense_types[i] == dtypes.string else 0, dtype=dense_types[i])
+      else:
+        # Reshape to a scalar to ensure user gets an error if they
+        # provide a tensor that's not intended to be a padding value
+        # (0 or 2+ elements).
+        key_name = "padding_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
+        default_value = ops.convert_to_tensor(
+            default_value, dtype=dense_types[i], name=key_name)
+        default_value = array_ops.reshape(default_value, [])
+    else:
+      if default_value is None:
+        default_value = constant_op.constant([], dtype=dense_types[i])
+      elif not isinstance(default_value, ops.Tensor):
+        key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
+        default_value = ops.convert_to_tensor(
+            default_value, dtype=dense_types[i], name=key_name)
+        default_value = array_ops.reshape(default_value, dense_shape)
+
+    dense_defaults_vec.append(default_value)
+
+  # Finally, convert dense_shapes to TensorShapeProto
+  dense_shapes_as_proto = [shape.as_proto() for shape in dense_shapes]
+
+  return (names, dense_defaults_vec, sparse_keys, sparse_types, dense_keys,
+          dense_shapes_as_proto, dense_shapes)
+
+
 @tf_export("parse_single_example")
 def parse_single_example(serialized, features, name=None, example_names=None):
   """Parses a single `Example` proto.
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index d533731c07..4800352ac2 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -94,26 +94,8 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
         ops.set_shape_and_handle_data_for_outputs(h.op)
       handle._handle_data = h._handle_data
     # pylint: enable=protected-access
-
-  # Clean up our reference cycles to avoid making the garbage collector run.
-  # pylint: disable=protected-access
-  # OrderedDict, constructed on Graph creation, makes a simple reference loop
-  # and hides it in an __attribute in some Python versions. We don't need to
-  # throw an error if we can't find it, but if we do find it we can break the
-  # loop to avoid creating work for the garbage collector.
-  problematic_cycle = graph._functions.__dict__.get("_OrderedDict__root", None)
-  # pylint: enable=protected-access
-  if problematic_cycle:
-    try:
-      del problematic_cycle[0][:]
-    except TypeError:
-      # This is probably not one of the problematic Python versions. Continue
-      # with the rest of our cleanup.
-      pass
-  # Now clean up our own reference cycles by clearing all of the attributes for
-  # the Graph and op we created.
-  h.__dict__ = {}
-  graph.__dict__ = {}
+  # Clean up op->graph->op reference cycles.
+  ops.dismantle_graph(graph)
   return handle
 
 
@@ -185,7 +167,8 @@ def shape_safe_assign_variable_handle(handle, shape, value, name=None):
 class ResourceVariable(variables.RefVariable):
   """Variable based on resource handles.
 
-  See the @{$variables$Variables How To} for a high level overview.
+  See the [Variables How To](https://tensorflow.org/guide/variables)
+  for a high level overview.
 
   A `ResourceVariable` allows you to maintain state across subsequent calls to
   session.run.
@@ -372,6 +355,15 @@ class ResourceVariable(variables.RefVariable):
       raise ValueError("initial_value must be specified.")
     init_from_fn = callable(initial_value)
 
+    if isinstance(initial_value, ops.Tensor) and hasattr(
+        initial_value, "graph") and initial_value.graph.building_function:
+      raise ValueError("Tensor-typed variable initializers must either be "
+                       "wrapped in an init_scope or callable "
+                       "(e.g., `tf.Variable(lambda : "
+                       "tf.truncated_normal([10, 40]))`) when building "
+                       "functions. Please file a feature request if this "
+                       "restriction inconveniences you.")
+
     if collections is None:
       collections = [ops.GraphKeys.GLOBAL_VARIABLES]
     if not isinstance(collections, (list, tuple, set)):
@@ -603,6 +595,22 @@ class ResourceVariable(variables.RefVariable):
   def __bool__(self):
     return bool(self.read_value())
 
+  def __copy__(self):
+    return self
+
+  def __deepcopy__(self, memo):
+    if not context.executing_eagerly():
+      raise NotImplementedError(
+          "__deepcopy__() is only available when eager execution is enabled.")
+    copied_variable = ResourceVariable(
+        initial_value=self.read_value(),
+        trainable=self._trainable,
+        constraint=self._constraint,
+        dtype=self._dtype,
+        name=self._shared_name + "_copy")
+    memo[self._unique_id] = copied_variable
+    return copied_variable
+
   @property
   def dtype(self):
     """The dtype of this variable."""
@@ -975,6 +983,231 @@ class ResourceVariable(variables.RefVariable):
         return self._lazy_read(assign_op)
     return assign_op
 
+  def __reduce__(self):
+    return (ResourceVariable, (self.numpy(),))
+
+  def scatter_sub(self, sparse_delta, use_locking=False, name=None):
+    """Subtracts `IndexedSlices` from this variable.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be subtracted from this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    if not isinstance(sparse_delta, ops.IndexedSlices):
+      raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
+    return self._lazy_read(gen_resource_variable_ops.resource_scatter_sub(
+        self.handle, sparse_delta.indices,
+        ops.convert_to_tensor(sparse_delta.values, self.dtype), name=name))
+
+  def scatter_add(self, sparse_delta, use_locking=False, name=None):
+    """Adds `IndexedSlices` from this variable.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be added to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    if not isinstance(sparse_delta, ops.IndexedSlices):
+      raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
+    return self._lazy_read(gen_resource_variable_ops.resource_scatter_add(
+        self.handle, sparse_delta.indices,
+        ops.convert_to_tensor(sparse_delta.values, self.dtype), name=name))
+
+  def scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    if not isinstance(sparse_delta, ops.IndexedSlices):
+      raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
+    return self._lazy_read(gen_resource_variable_ops.resource_scatter_update(
+        self.handle, sparse_delta.indices,
+        ops.convert_to_tensor(sparse_delta.values, self.dtype), name=name))
+
+  def scatter_nd_sub(self, indices, updates, name=None):
+    """Applies sparse subtraction to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        op = ref.scatter_nd_sub(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(op)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, -9, 3, -6, -6, 6, 7, -4]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return self._lazy_read(gen_state_ops.resource_scatter_nd_sub(
+        self.handle, indices, ops.convert_to_tensor(updates, self.dtype),
+        name=name))
+
+  def scatter_nd_add(self, indices, updates, name=None):
+    """Applies sparse addition to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        add = ref.scatter_nd_add(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(add)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, 13, 3, 14, 14, 6, 7, 20]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return self._lazy_read(gen_state_ops.resource_scatter_nd_add(
+        self.handle, indices, ops.convert_to_tensor(updates, self.dtype),
+        name=name))
+
+  def scatter_nd_update(self, indices, updates, name=None):
+    """Applies sparse assignment to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        op = ref.scatter_nd_update(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(op)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, 11, 3, 10, 9, 6, 7, 12]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return self._lazy_read(gen_state_ops.resource_scatter_nd_update(
+        self.handle, indices, ops.convert_to_tensor(updates, self.dtype),
+        name=name))
+
   def _strided_slice_assign(self, begin, end, strides, value, name, begin_mask,
                             end_mask, ellipsis_mask, new_axis_mask,
                             shrink_axis_mask):
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 7b6ab20975..5c00d929bf 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -144,6 +145,28 @@ def _should_cache():
   return control_flow_util.GetContainingWhileContext(ctxt) is None
 
 
+def _is_keras_rnn_cell(rnn_cell):
+  """Check whether the cell is a Keras RNN cell.
+
+  The Keras RNN cell accept the state as a list even the state is a single
+  tensor, whereas the TF RNN cell does not wrap single state tensor in list.
+  This behavior difference should be unified in future version.
+
+  Args:
+    rnn_cell: An RNN cell instance that either follow the Keras interface or TF
+      RNN interface.
+  Returns:
+    Boolean, whether the cell is an Keras RNN cell.
+  """
+  # Cell type check is not strict enough since there are cells created by other
+  # library like Deepmind that didn't inherit tf.nn.rnn_cell.RNNCell.
+  # Keras cells never had zero_state method, which was from the original
+  # interface from TF RNN cell.
+  return (not isinstance(rnn_cell, rnn_cell_impl.RNNCell)
+          and isinstance(rnn_cell, base_layer.Layer)
+          and getattr(rnn_cell, "zero_state", None) is None)
+
+
 # pylint: disable=unused-argument
 def _rnn_step(
     time, sequence_length, min_sequence_length, max_sequence_length,
@@ -608,7 +631,11 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
     else:
       if not dtype:
         raise ValueError("If there is no initial_state, you must give a dtype.")
-      state = cell.zero_state(batch_size, dtype)
+      if getattr(cell, "get_initial_state", None) is not None:
+        state = cell.get_initial_state(
+            inputs=None, batch_size=batch_size, dtype=dtype)
+      else:
+        state = cell.zero_state(batch_size, dtype)
 
     def _assert_has_shape(x, shape):
       x_shape = array_ops.shape(x)
@@ -788,6 +815,10 @@ def _dynamic_rnn_loop(cell,
       input_t = tuple(ta[time.numpy()] for ta in input_ta)
 
     input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t)
+    # Keras RNN cells only accept state as list, even if it's a single tensor.
+    is_keras_rnn_cell = _is_keras_rnn_cell(cell)
+    if is_keras_rnn_cell and not nest.is_sequence(state):
+      state = [state]
     call_cell = lambda: cell(input_t, state)
 
     if sequence_length is not None:
@@ -804,6 +835,9 @@ def _dynamic_rnn_loop(cell,
     else:
       (output, new_state) = call_cell()
 
+    # Keras cells always wrap state as list, even if it's a single tensor.
+    if is_keras_rnn_cell and len(new_state) == 1:
+      new_state = new_state[0]
     # Pack state if using state tuples
     output = nest.flatten(output)
 
@@ -1286,7 +1320,11 @@ def static_rnn(cell,
       if not dtype:
         raise ValueError("If no initial_state is provided, "
                          "dtype must be specified")
-      state = cell.zero_state(batch_size, dtype)
+      if getattr(cell, "get_initial_state", None) is not None:
+        state = cell.get_initial_state(
+            inputs=None, batch_size=batch_size, dtype=dtype)
+      else:
+        state = cell.zero_state(batch_size, dtype)
 
     if sequence_length is not None:  # Prepare variables
       sequence_length = ops.convert_to_tensor(
@@ -1315,6 +1353,10 @@ def static_rnn(cell,
       min_sequence_length = math_ops.reduce_min(sequence_length)
       max_sequence_length = math_ops.reduce_max(sequence_length)
 
+    # Keras RNN cells only accept state as list, even if it's a single tensor.
+    is_keras_rnn_cell = _is_keras_rnn_cell(cell)
+    if is_keras_rnn_cell and not nest.is_sequence(state):
+      state = [state]
     for time, input_ in enumerate(inputs):
       if time > 0:
         varscope.reuse_variables()
@@ -1333,8 +1375,10 @@ def static_rnn(cell,
             state_size=cell.state_size)
       else:
         (output, state) = call_cell()
-
       outputs.append(output)
+    # Keras RNN cells only return state as list, even if it's a single tensor.
+    if is_keras_rnn_cell and len(state) == 1:
+      state = state[0]
 
     return (outputs, state)
 
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index f481726d54..c128a1039a 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -80,13 +80,13 @@ def assert_like_rnncell(cell_name, cell):
   conditions = [
       hasattr(cell, "output_size"),
       hasattr(cell, "state_size"),
-      hasattr(cell, "zero_state"),
+      hasattr(cell, "get_initial_state") or hasattr(cell, "zero_state"),
       callable(cell),
   ]
   errors = [
       "'output_size' property is missing",
       "'state_size' property is missing",
-      "'zero_state' method is missing",
+      "either 'zero_state' or 'get_initial_state' method is required",
       "is not callable"
   ]
 
@@ -193,6 +193,13 @@ class RNNCell(base_layer.Layer):
   for each `s` in `self.batch_size`.
   """
 
+  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
+    super(RNNCell, self).__init__(
+        trainable=trainable, name=name, dtype=dtype, **kwargs)
+    # Attribute that indicates whether the cell is a TF RNN cell, due the slight
+    # difference between TF and Keras RNN cell.
+    self._is_tf_rnn_cell = True
+
   def __call__(self, inputs, state, scope=None):
     """Run this RNN cell on inputs, starting from the given state.
 
@@ -259,6 +266,36 @@ class RNNCell(base_layer.Layer):
     # self.add_variable() inside the call() method.
     pass
 
+  def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
+    if inputs is not None:
+      # Validate the given batch_size and dtype against inputs if provided.
+      inputs = ops.convert_to_tensor(inputs, name="inputs")
+      if batch_size is not None:
+        if tensor_util.is_tensor(batch_size):
+          static_batch_size = tensor_util.constant_value(
+              batch_size, partial=True)
+        else:
+          static_batch_size = batch_size
+        if inputs.shape[0].value != static_batch_size:
+          raise ValueError(
+              "batch size from input tensor is different from the "
+              "input param. Input tensor batch: {}, batch_size: {}".format(
+                  inputs.shape[0].value, batch_size))
+
+      if dtype is not None and inputs.dtype != dtype:
+        raise ValueError(
+            "dtype from input tensor is different from the "
+            "input param. Input tensor dtype: {}, dtype: {}".format(
+                inputs.dtype, dtype))
+
+      batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
+      dtype = inputs.dtype
+    if None in [batch_size, dtype]:
+      raise ValueError(
+          "batch_size and dtype cannot be None while constructing initial "
+          "state: batch_size={}, dtype={}".format(batch_size, dtype))
+    return self.zero_state(batch_size, dtype)
+
   def zero_state(self, batch_size, dtype):
     """Return zero-filled state tensor(s).
 
@@ -524,8 +561,8 @@ class GRUCell(LayerRNNCell):
   def get_config(self):
     config = {
         "num_units": self._num_units,
-        "initializer": initializers.serialize(self._initializer),
         "kernel_initializer": initializers.serialize(self._kernel_initializer),
+        "bias_initializer": initializers.serialize(self._bias_initializer),
         "activation": activations.serialize(self._activation),
         "reuse": self._reuse,
     }
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index d11e446dbf..8d66de6b20 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Script Language Operators. See the @{$python/script_ops} guide."""
+"""Script Language Operators."""
 
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -343,7 +343,8 @@ def eager_py_func(func, inp, Tout, name=None):
   or print statements as desired, and wrap those functions in
   `tf.contrib.eager.py_func`.
 
-  For more information on eager execution, see @{$guide/eager}.
+  For more information on eager execution, see the
+  [Eager guide](https://tensorflow.org/guide/eager).
 
   `tf.contrib.eager.py_func` is similar in spirit to `tf.py_func`, but unlike
   the latter, the former lets you use TensorFlow operations in the wrapped
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index dee84bab0c..e229501c10 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -13,7 +13,11 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tensor Handle Operations. See the @{$python/session_ops} guide."""
+"""Tensor Handle Operations.
+
+See the [Session Ops](https://tensorflow.org/api_guides/python/session_ops)
+guide.
+"""
 
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index fd547dcb19..38ce5236e3 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -14,7 +14,10 @@
 # ==============================================================================
 
 # pylint: disable=g-short-docstring-punctuation
-"""Sparse Tensor Representation. See the @{$python/sparse_ops} guide."""
+"""Sparse Tensor Representation.
+
+See the [Sparse Ops](https://tensorflow.org/api_guides/python/sparse_ops) guide.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -38,6 +41,7 @@ from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_sparse_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
@@ -82,6 +86,104 @@ def _convert_to_sparse_tensors(sp_inputs):
   raise TypeError("Inputs must be a list or tuple.")
 
 
+def _make_int64_tensor(value, name):
+  if isinstance(value, compat.integral_types):
+    return ops.convert_to_tensor(value, name=name, dtype=dtypes.int64)
+  if not isinstance(value, ops.Tensor):
+    raise TypeError("{} must be an integer value".format(name))
+  if value.dtype == dtypes.int64:
+    return value
+  return math_ops.cast(value, dtypes.int64)
+
+
+@tf_export("sparse.expand_dims")
+def sparse_expand_dims(sp_input, axis=None, name=None):
+  """Inserts a dimension of 1 into a tensor's shape.
+
+  Given a tensor `sp_input`, this operation inserts a dimension of 1 at the
+  dimension index `axis` of `sp_input`'s shape. The dimension index `axis`
+  starts at zero; if you specify a negative number for `axis` it is counted
+  backwards from the end.
+
+  Args:
+    sp_input: A `SparseTensor`.
+    axis: 0-D (scalar). Specifies the dimension index at which to expand the
+      shape of `input`. Must be in the range `[-rank(sp_input) - 1,
+      rank(sp_input)]`.
+    name: The name of the output `SparseTensor`.
+
+  Returns:
+    A `SparseTensor` with the same data as `sp_input`, but its shape has an
+    additional dimension of size 1 added.
+  """
+  rank = sp_input.dense_shape.get_shape()[0]
+  axis = -1 if axis is None else axis
+
+  with ops.name_scope(name, default_name="expand_dims", values=[sp_input]):
+    if isinstance(axis, compat.integral_types):
+      axis = ops.convert_to_tensor(axis, name="axis", dtype=dtypes.int32)
+    elif not isinstance(axis, ops.Tensor):
+      raise TypeError("axis must be an integer value in range [-rank(sp_input)"
+                      " - 1, rank(sp_input)]")
+
+    # Convert axis to a positive value if it is negative.
+    axis = array_ops.where(axis >= 0, axis, axis + rank + 1)
+
+    # Create the new column of indices for the sparse tensor by slicing
+    # the indices and inserting a new column of indices for the new dimension.
+    column_size = array_ops.shape(sp_input.indices)[0]
+    new_index = array_ops.zeros([column_size, 1], dtype=dtypes.int64)
+    indices_before = array_ops.slice(sp_input.indices, [0, 0], [-1, axis])
+    indices_after = array_ops.slice(sp_input.indices, [0, axis], [-1, -1])
+    indices = array_ops.concat(
+        [indices_before, new_index, indices_after], axis=1)
+
+    # Create the new dense shape by splicing the tensor [1] in the correct
+    # dimension of the existing shape.
+    shape_before = array_ops.slice(sp_input.dense_shape, [0], [axis])
+    shape_after = array_ops.slice(sp_input.dense_shape, [axis], [-1])
+    new_shape = ops.convert_to_tensor([1], name="new_shape", dtype=dtypes.int64)
+    shape = array_ops.concat([shape_before, new_shape, shape_after], axis=0)
+
+    # Create the output sparse tensor.
+    return sparse_tensor.SparseTensor(
+        indices=indices, values=sp_input.values, dense_shape=shape)
+
+
+@tf_export("sparse.eye")
+def sparse_eye(num_rows,
+               num_columns=None,
+               dtype=dtypes.float32,
+               name=None):
+  """Creates a two-dimensional sparse tensor with ones along the diagonal.
+
+  Args:
+    num_rows: Non-negative integer or `int32` scalar `tensor` giving the number
+      of rows in the resulting matrix.
+    num_columns: Optional non-negative integer or `int32` scalar `tensor` giving
+      the number of columns in the resulting matrix. Defaults to `num_rows`.
+    dtype: The type of element in the resulting `Tensor`.
+    name: A name for this `Op`. Defaults to "eye".
+
+  Returns:
+    A `SparseTensor` of shape [num_rows, num_columns] with ones along the
+    diagonal.
+  """
+  with ops.name_scope(name, default_name="eye", values=[num_rows, num_columns]):
+    num_rows = _make_int64_tensor(num_rows, "num_rows")
+    num_columns = num_rows if num_columns is None else _make_int64_tensor(
+        num_columns, "num_columns")
+
+    # Create the sparse tensor.
+    diag_size = math_ops.minimum(num_rows, num_columns)
+    diag_range = math_ops.range(diag_size, dtype=dtypes.int64)
+
+    return sparse_tensor.SparseTensor(
+        indices=array_ops.stack([diag_range, diag_range], axis=1),
+        values=array_ops.ones(diag_size, dtype=dtype),
+        dense_shape=[num_rows, num_columns])
+
+
 # pylint: disable=protected-access
 @tf_export("sparse_concat")
 @deprecation.deprecated_args(
@@ -796,6 +898,11 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
   with a single element is returned.  Additionally, the axes can be negative,
   similar to the indexing rules in Python.
 
+  The values not defined in `sp_input` don't participate in the reduce max,
+  as opposed to be implicitly assumed 0 -- hence it can return negative values
+  for sparse `reduction_axes`. But, in case there are no values in
+  `reduction_axes`, it will reduce to 0. See second example below.
+
   For example:
 
   ```python
@@ -807,6 +914,11 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
   tf.sparse_reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
   tf.sparse_reduce_max(x, 1, keepdims=True) ==> [[2], [3]]
   tf.sparse_reduce_max(x, [0, 1]) ==> 3
+
+  # 'y' represents [[-7, ?]
+  #                 [ 4, 3]
+  #                 [ ?, ?]
+  tf.sparse_reduce_max(x, 1) ==> [-7, 4, 0]
   ```
 
   Args:
diff --git a/tensorflow/python/ops/sparse_ops_test.py b/tensorflow/python/ops/sparse_ops_test.py
new file mode 100644
index 0000000000..4ee1569249
--- /dev/null
+++ b/tensorflow/python/ops/sparse_ops_test.py
@@ -0,0 +1,81 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sparse ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SparseOpsTest(test_util.TensorFlowTestCase):
+
+  def testSparseEye(self):
+    def test_one(n, m, as_tensors):
+      expected = np.eye(n, m)
+      if as_tensors:
+        m = constant_op.constant(m)
+        n = constant_op.constant(n)
+      s = sparse_ops.sparse_eye(n, m)
+      d = sparse_ops.sparse_to_dense(s.indices, s.dense_shape, s.values)
+      self.assertAllEqual(self.evaluate(d), expected)
+
+    for n in range(2, 10, 2):
+      for m in range(2, 10, 2):
+        # Test with n and m as both constants and tensors.
+        test_one(n, m, True)
+        test_one(n, m, False)
+
+  def testSparseExpandDims(self):
+    for rank in range(1, 4):
+      # Create a dummy input. When rank=3, shape=[2, 4, 6].
+      shape = np.arange(1, rank + 1) * 2
+      before = np.arange(np.prod(shape)).reshape(shape)
+
+      # Make entries sparse.
+      before *= np.random.binomial(1, .2, before.shape)
+      dense_shape = before.shape
+      indices = np.array(np.where(before)).T
+      values = before[before != 0]
+
+      # Try every possible valid value of axis.
+      for axis in range(-rank - 1, rank):
+        expected_after = np.expand_dims(before, axis)
+
+        for axis_as_tensor in [False, True]:
+          dense_shape_t = constant_op.constant(dense_shape, dtype=dtypes.int64)
+          indices_t = constant_op.constant(indices)
+          values_t = constant_op.constant(values)
+          before_t = sparse_tensor.SparseTensor(
+              indices=indices_t, values=values_t, dense_shape=dense_shape_t)
+
+          if axis_as_tensor:
+            axis = constant_op.constant(axis)
+
+          s = sparse_ops.sparse_expand_dims(before_t, axis)
+          d = sparse_ops.sparse_to_dense(s.indices, s.dense_shape, s.values)
+          self.assertAllEqual(self.evaluate(d), expected_after)
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index d556d11a1b..920047f38b 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -13,7 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Variables. See the @{$python/state_ops} guide."""
+"""Variables.
+
+See the [Variables](https://tensorflow.org/api_guides/python/state_ops) guide.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,13 +24,15 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gen_state_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_state_ops import *
-from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access,g-doc-return-or-yield,g-doc-args
@@ -126,7 +131,7 @@ def is_variable_initialized(ref, name=None):
   return ref.is_initialized(name=name)
 
 
-@tf_export("assign_sub")
+@tf_export(v1=["assign_sub"])
 def assign_sub(ref, value, use_locking=None, name=None):
   """Update 'ref' by subtracting 'value' from it.
 
@@ -155,7 +160,7 @@ def assign_sub(ref, value, use_locking=None, name=None):
   return ref.assign_sub(value)
 
 
-@tf_export("assign_add")
+@tf_export(v1=["assign_add"])
 def assign_add(ref, value, use_locking=None, name=None):
   """Update 'ref' by adding 'value' to it.
 
@@ -184,7 +189,7 @@ def assign_add(ref, value, use_locking=None, name=None):
   return ref.assign_add(value)
 
 
-@tf_export("assign")
+@tf_export(v1=["assign"])
 def assign(ref, value, validate_shape=None, use_locking=None, name=None):
   """Update 'ref' by assigning 'value' to it.
 
@@ -217,7 +222,7 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None):
   return ref.assign(value, name=name)
 
 
-@tf_export("count_up_to")
+@tf_export(v1=["count_up_to"])
 def count_up_to(ref, limit, name=None):
   r"""Increments 'ref' until it reaches 'limit'.
 
@@ -240,7 +245,7 @@ def count_up_to(ref, limit, name=None):
       ref.handle, limit, T=ref.dtype, name=name)
 
 
-@tf_export("scatter_update")
+@tf_export(v1=["scatter_update"])
 def scatter_update(ref, indices, updates, use_locking=True, name=None):
   # pylint: disable=line-too-long
   r"""Applies sparse updates to a variable reference.
@@ -294,7 +299,7 @@ def scatter_update(ref, indices, updates, use_locking=True, name=None):
       name=name))
 
 
-@tf_export("scatter_nd_update")
+@tf_export(v1=["scatter_nd_update"])
 def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
   r"""Applies sparse `updates` to individual values or slices in a Variable.
 
@@ -356,7 +361,7 @@ def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
       name=name))
 
 
-@tf_export("scatter_add")
+@tf_export(v1=["scatter_add"])
 def scatter_add(ref, indices, updates, use_locking=False, name=None):
   # pylint: disable=line-too-long
   r"""Adds sparse updates to the variable referenced by `resource`.
@@ -408,7 +413,7 @@ def scatter_add(ref, indices, updates, use_locking=False, name=None):
       name=name))
 
 
-@tf_export("scatter_nd_add")
+@tf_export(v1=["scatter_nd_add"])
 def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
   r"""Applies sparse addition to individual values or slices in a Variable.
 
@@ -472,7 +477,7 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
       name=name))
 
 
-@tf_export("scatter_sub")
+@tf_export(v1=["scatter_sub"])
 def scatter_sub(ref, indices, updates, use_locking=False, name=None):
   r"""Subtracts sparse updates to a variable reference.
 
@@ -524,3 +529,164 @@ def scatter_sub(ref, indices, updates, use_locking=False, name=None):
   return ref._lazy_read(gen_resource_variable_ops.resource_scatter_sub(  # pylint: disable=protected-access
       ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
       name=name))
+
+
+@tf_export(v1=["scatter_nd_sub"])
+def scatter_nd_sub(ref, indices, updates, use_locking=False, name=None):
+  r"""Applies sparse subtraction to individual values or slices in a Variable.
+
+  `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+  `indices` must be integer tensor, containing indices into `ref`.
+  It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+  The innermost dimension of `indices` (with length `K`) corresponds to
+  indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+  dimension of `ref`.
+
+  `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+  ```
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  ```
+
+  For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+  to 8 elements. In Python, that update would look like this:
+
+  ```python
+      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+      indices = tf.constant([[4], [3], [1] ,[7]])
+      updates = tf.constant([9, 10, 11, 12])
+      op = tf.scatter_nd_sub(ref, indices, updates)
+      with tf.Session() as sess:
+        print sess.run(op)
+  ```
+
+  The resulting update to ref would look like this:
+
+      [1, -9, 3, -6, -6, 6, 7, -4]
+
+  See `tf.scatter_nd` for more details about how to make updates to
+  slices.
+
+  Args:
+    ref: A mutable `Tensor`. Must be one of the following types: `float32`,
+      `float64`, `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`,
+      `qint8`, `quint8`, `qint32`, `bfloat16`, `uint16`, `complex128`, `half`,
+      `uint32`, `uint64`. A mutable Tensor. Should be from a Variable node.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A tensor of indices into ref.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A tensor of updated values to add to ref.
+    use_locking: An optional `bool`. Defaults to `False`.
+      An optional bool. Defaults to True. If True, the assignment will
+      be protected by a lock; otherwise the behavior is undefined,
+      but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    A mutable `Tensor`. Has the same type as `ref`.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_nd_sub(
+        ref, indices, updates, use_locking, name)
+  return ref._lazy_read(gen_state_ops.resource_scatter_nd_sub(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
+
+
+@tf_export("batch_scatter_update")
+def batch_scatter_update(ref, indices, updates, use_locking=True, name=None):
+  """Generalization of `tf.scatter_update` to axis different than 0.
+
+  Analogous to `batch_gather`. This assumes that `ref`, `indices` and `updates`
+  have a series of leading dimensions that are the same for all of them, and the
+  updates are performed on the last dimension of indices. In other words, the
+  dimensions should be the following:
+
+  `num_prefix_dims = indices.ndims - 1`
+  `batch_dim = num_prefix_dims + 1`
+  `updates.shape = indices.shape + var.shape[batch_dim:]`
+
+  where
+
+  `updates.shape[:num_prefix_dims]`
+  `== indices.shape[:num_prefix_dims]`
+  `== var.shape[:num_prefix_dims]`
+
+  And the operation performed can be expressed as:
+
+  `var[i_1, ..., i_n, indices[i_1, ..., i_n, j]] = updates[i_1, ..., i_n, j]`
+
+  When indices is a 1D tensor, this operation is equivalent to
+  `tf.scatter_update`.
+
+  To avoid this operation there would be 2 alternatives:
+  1) Reshaping the variable by merging the first `ndims` dimensions. However,
+     this is not possible because `tf.reshape` returns a Tensor, which we
+     cannot use `tf.scatter_update` on.
+  2) Looping over the first `ndims` of the variable and using
+     `tf.scatter_update` on the subtensors that result of slicing the first
+     dimension. This is a valid option for `ndims = 1`, but less efficient than
+     this implementation.
+
+  See also `tf.scatter_update` and `tf.scatter_nd_update`.
+
+  Args:
+    ref: `Variable` to scatter onto.
+    indices: Tensor containing indices as described above.
+    updates: Tensor of updates to apply to `ref`.
+    use_locking: Boolean indicating whether to lock the writing operation.
+    name: Optional scope name string.
+
+  Returns:
+    Ref to `variable` after it has been modified.
+
+  Raises:
+    ValueError: If the initial `ndims` of `ref`, `indices`, and `updates` are
+        not the same.
+  """
+  with ops.name_scope(name):
+    indices = ops.convert_to_tensor(indices, name="indices")
+    indices_shape = array_ops.shape(indices)
+    indices_dimensions = indices.get_shape().ndims
+
+    if indices_dimensions is None:
+      raise ValueError("batch_gather does not allow indices with unknown "
+                       "shape.")
+
+    nd_indices = array_ops.expand_dims(indices, axis=-1)
+    nd_indices_list = []
+
+    # Scatter ND requires indices to have an additional dimension, in which the
+    # coordinates of the updated things are specified. For this to be adapted to
+    # the scatter_update with several leading dimensions, we simply make use of
+    # a tf.range for all the leading dimensions followed by concat of all the
+    # coordinates we created with the original indices.
+
+    # For example if indices.shape = [2, 3, 4], we should generate the following
+    # indices for tf.scatter_nd_update:
+    # nd_indices[:, :, 0] = [[0, 0, 0], [1, 1, 1]]
+    # nd_indices[:, :, 1] = [[0, 1, 2], [0, 1, 2]]
+    # nd_indices[:, :, 2] = indices
+    for dimension in range(indices_dimensions - 1):
+      # In this loop we generate the following for the example (one for each
+      # iteration).
+      # nd_indices[:, :, 0] = [[0, 0, 0], [1, 1, 1]]
+      # nd_indices[:, :, 1] = [[0, 1, 2], [0, 1, 2]]
+      # This is done at every iteration with a tf.range over the size of the
+      # i-th dimension and using broadcasting over the desired shape.
+      dimension_size = indices_shape[dimension]
+      shape_to_broadcast = [1] * (indices_dimensions + 1)
+      shape_to_broadcast[dimension] = dimension_size
+      dimension_range = array_ops.reshape(
+          gen_math_ops._range(0, dimension_size, 1), shape_to_broadcast)
+      if dimension_range.dtype.base_dtype != nd_indices.dtype:
+        dimension_range = gen_math_ops.cast(dimension_range, nd_indices.dtype)
+      nd_indices_list.append(
+          dimension_range * array_ops.ones_like(nd_indices))
+    # Add the original indices at the end, as described above, and concat.
+    nd_indices_list.append(nd_indices)
+    final_indices = array_ops.concat(nd_indices_list, axis=-1)
+    return scatter_nd_update(
+        ref, final_indices, updates, use_locking=use_locking)
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 0280c89c10..c832ba4e2a 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -15,7 +15,7 @@
 
 """Operations for working with string Tensors.
 
-See the @{$python/string_ops} guide.
+See the [Strings](https://tensorflow.org/api_guides/python/string_ops) guide.
 """
 
 from __future__ import absolute_import
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -31,6 +32,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import compat as util_compat
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
@@ -42,6 +44,41 @@ from tensorflow.python.util.tf_export import tf_export
 # Expose regex_full_match in strings namespace
 tf_export("strings.regex_full_match")(regex_full_match)
 
+
+def regex_replace(source, pattern, rewrite, replace_global=True):
+  r"""Replace elements of `source` matching regex `pattern with `rewrite`.
+
+  Args:
+    source: string `Tensor`, the source strings to process.
+    pattern: string or scalar string `Tensor`, regular expression to use,
+      see more details at https://github.com/google/re2/wiki/Syntax
+    rewrite: string or scalar string `Tensor`, value to use in match
+      replacement, supports backslash-escaped digits (\1 to \9) can be to insert
+      text matching corresponding parenthesized group.
+    replace_global: `bool`, if `True` replace all non-overlapping matches,
+      else replace only the first match.
+
+  Returns:
+    string `Tensor` of the same shape as `source` with specified replacements.
+  """
+  # TODO(b/112455102): Remove compat.forward_compatible once past the horizon.
+  if not compat.forward_compatible(2018, 10, 10):
+    return gen_string_ops.regex_replace(
+        input=source, pattern=pattern,
+        rewrite=rewrite, replace_global=replace_global)
+  if (isinstance(pattern, util_compat.bytes_or_text_types) and
+      isinstance(rewrite, util_compat.bytes_or_text_types)):
+    # When `pattern` and `rewrite` are static through the life of the op we can
+    # use a version which performs the expensive regex compilation once at
+    # creation time.
+    return gen_string_ops.static_regex_replace(
+        input=source, pattern=pattern,
+        rewrite=rewrite, replace_global=replace_global)
+  return gen_string_ops.regex_replace(
+      input=source, pattern=pattern,
+      rewrite=rewrite, replace_global=replace_global)
+
+
 @tf_export("string_split")
 def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=invalid-name
   """Split elements of `source` based on `delimiter` into a `SparseTensor`.
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index a793f634bd..b382c3b7ce 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -23,7 +23,7 @@ import re
 
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import distribute
+from tensorflow.python.training import distribution_strategy_context
 
 
 def collect(val, collections, default_collections):
@@ -49,7 +49,7 @@ def skip_summary():
   # TODO(priyag): Add a new optional argument that will provide multiple
   # alternatives to override default behavior. (e.g. run on last tower,
   # compute sum or mean across towers).
-  tower_context = distribute.get_tower_context()
+  tower_context = distribution_strategy_context.get_tower_context()
   return tower_context and tower_context.tower_id > 0
 
 
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index c248dd9172..f53e06fdf9 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -40,8 +40,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -204,6 +206,42 @@ it does exist, simply return it.
 """
 
 
+_DEFAULT_USE_RESOURCE = False
+
+
+@tf_export(v1=["enable_resource_variables"])
+def enable_resource_variables():
+  """Creates resource variables by default.
+
+  Resource variables are improved versions of TensorFlow variables with a
+  well-defined memory model. Accessing a resource variable reads its value, and
+  all ops which access a specific read value of the variable are guaranteed to
+  see the same value for that tensor. Writes which happen after a read (by
+  having a control or data dependency on the read) are guaranteed not to affect
+  the value of the read tensor, and similarly writes which happen before a read
+  are guaranteed to affect the value. No guarantees are made about unordered
+  read/write pairs.
+
+  Calling tf.enable_resource_variables() lets you opt-in to this TensorFlow 2.0
+  feature.
+  """
+  global _DEFAULT_USE_RESOURCE
+  _DEFAULT_USE_RESOURCE = True
+
+
+@deprecation.deprecated(
+    None, "non-resource variables are not supported in the long term")
+@tf_export(v1=["disable_resource_variables"])
+def disable_resource_variables():
+  """Opts out of resource variables.
+
+  If your code needs tf.disable_resource_variables() to be called to work
+  properly please file a bug.
+  """
+  global _DEFAULT_USE_RESOURCE
+  _DEFAULT_USE_RESOURCE = False
+
+
 class _VariableStore(object):
   """Variable store that carries a number of named Variables.
 
@@ -837,9 +875,6 @@ class _VariableStore(object):
       raise ValueError("Variable %s does not exist, or was not created with "
                        "tf.get_variable(). Did you mean to set "
                        "reuse=tf.AUTO_REUSE in VarScope?" % name)
-    if not shape.is_fully_defined() and not initializing_from_value:
-      raise ValueError("Shape of a new variable (%s) must be fully defined, "
-                       "but instead was %s." % (name, shape))
 
     # Create the tensor to initialize the variable with default value.
     if initializer is None:
@@ -854,14 +889,23 @@ class _VariableStore(object):
         # Instantiate initializer if provided initializer is a type object.
         if isinstance(initializer, type(init_ops.Initializer)):
           initializer = initializer(dtype=dtype)
-        init_val = lambda: initializer(  # pylint: disable=g-long-lambda
-            shape.as_list(), dtype=dtype, partition_info=partition_info)
+        if shape and shape.is_fully_defined():
+          init_val = lambda: initializer(  # pylint: disable=g-long-lambda
+              shape.as_list(), dtype=dtype, partition_info=partition_info)
+        elif not tf_inspect.getargspec(initializer).args:
+          init_val = initializer
+        else:
+          raise ValueError("You can only pass an initializer function that"
+                           "expects no arguments to its callable when the "
+                           "shape is not fully defined. The given initializer "
+                           "function expects the following args %s" %
+                           tf_inspect.getargspec(initializer).args)
         variable_dtype = dtype.base_dtype
 
     # Create the variable.
     if use_resource is None:
       # Set the default value if unspecified.
-      use_resource = False
+      use_resource = _DEFAULT_USE_RESOURCE
     v = variable(
         initial_value=init_val,
         name=name,
@@ -1440,12 +1484,11 @@ def get_variable(name,
       aggregation=aggregation)
 
 
-get_variable_or_local_docstring = (
-    """%s
+get_variable_or_local_docstring = ("""%s
 
 %sThis function prefixes the name with the current variable scope
 and performs reuse checks. See the
-@{$variables$Variable Scope How To}
+[Variable Scope How To](https://tensorflow.org/guide/variables)
 for an extensive description of how reusing works. Here is a basic example:
 
 ```python
@@ -1895,8 +1938,8 @@ class variable_scope(object):
 
   Variable scope allows you to create new variables and to share already created
   ones while providing checks to not create or share by accident. For details,
-  see the @{$variables$Variable Scope How To}, here we present only a few basic
-  examples.
+  see the [Variable Scope How To](https://tensorflow.org/guide/variables), here
+  we present only a few basic examples.
 
   Simple example of how to create a new variable:
 
@@ -2363,6 +2406,8 @@ def default_variable_creator(next_creator=None, **kwargs):
 
   if use_resource is None:
     use_resource = get_variable_scope().use_resource
+  if use_resource is None:
+    use_resource = _DEFAULT_USE_RESOURCE
   use_resource = use_resource or context.executing_eagerly()
   if use_resource:
     return resource_variable_ops.ResourceVariable(
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 464c1167d9..f7da3f7d64 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
@@ -135,7 +136,7 @@ class VariableMetaclass(type):
 @tf_export("Variable")
 class Variable(six.with_metaclass(VariableMetaclass,
                                   checkpointable.CheckpointableBase)):
-  """See the @{$variables$Variables How To} for a high level overview.
+  """See the [Variables Guide](https://tensorflow.org/guide/variables).
 
   A variable maintains state in the graph across calls to `run()`. You add a
   variable to the graph by constructing an instance of the class `Variable`.
@@ -458,7 +459,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
-  def assign(self, value, use_locking=False):
+  def assign(self, value, use_locking=False, name=None, read_value=True):
     """Assigns a new value to the variable.
 
     This is essentially a shortcut for `assign(self, value)`.
@@ -466,6 +467,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
     Args:
       value: A `Tensor`. The new value for this variable.
       use_locking: If `True`, use locking during the assignment.
+      name: The name of the operation to be created
+      read_value: if True, will return something which evaluates to the
+        new value of the variable; if False will return the assign op.
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
@@ -473,7 +477,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
-  def assign_add(self, delta, use_locking=False):
+  def assign_add(self, delta, use_locking=False, name=None, read_value=True):
     """Adds a value to this variable.
 
      This is essentially a shortcut for `assign_add(self, delta)`.
@@ -481,6 +485,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
     Args:
       delta: A `Tensor`. The value to add to this variable.
       use_locking: If `True`, use locking during the operation.
+      name: The name of the operation to be created
+      read_value: if True, will return something which evaluates to the
+        new value of the variable; if False will return the assign op.
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
@@ -488,7 +495,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
-  def assign_sub(self, delta, use_locking=False):
+  def assign_sub(self, delta, use_locking=False, name=None, read_value=True):
     """Subtracts a value from this variable.
 
     This is essentially a shortcut for `assign_sub(self, delta)`.
@@ -496,6 +503,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
     Args:
       delta: A `Tensor`. The value to subtract from this variable.
       use_locking: If `True`, use locking during the operation.
+      name: The name of the operation to be created
+      read_value: if True, will return something which evaluates to the
+        new value of the variable; if False will return the assign op.
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
@@ -503,15 +513,200 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
-  def scatter_sub(self, sparse_delta, use_locking=False):
+  def scatter_sub(self, sparse_delta, use_locking=False, name=None):
     """Subtracts `IndexedSlices` from this variable.
 
-    This is essentially a shortcut for `scatter_sub(self, sparse_delta.indices,
-    sparse_delta.values)`.
-
     Args:
       sparse_delta: `IndexedSlices` to be subtracted from this variable.
       use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    raise NotImplementedError
+
+  def scatter_add(self, sparse_delta, use_locking=False, name=None):
+    """Adds `IndexedSlices` to this variable.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    raise NotImplementedError
+
+  def scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    raise NotImplementedError
+
+  def scatter_nd_sub(self, indices, updates, name=None):
+    """Applies sparse subtraction to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        op = ref.scatter_nd_sub(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(op)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, -9, 3, -6, -6, 6, 7, -4]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    raise NotImplementedError
+
+  def scatter_nd_add(self, indices, updates, name=None):
+    """Applies sparse addition to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        add = ref.scatter_nd_add(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(add)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, 13, 3, 14, 14, 6, 7, 20]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    raise NotImplementedError
+
+  def scatter_nd_update(self, indices, updates, name=None):
+    """Applies sparse assignment to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        op = ref.scatter_nd_assign(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(op)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, 11, 3, 10, 9, 6, 7, 12]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
@@ -1264,7 +1459,7 @@ class RefVariable(Variable):
     """
     return self._constraint
 
-  def assign(self, value, use_locking=False):
+  def assign(self, value, use_locking=False, name=None, read_value=True):
     """Assigns a new value to the variable.
 
     This is essentially a shortcut for `assign(self, value)`.
@@ -1272,14 +1467,21 @@ class RefVariable(Variable):
     Args:
       value: A `Tensor`. The new value for this variable.
       use_locking: If `True`, use locking during the assignment.
+      name: The name of the operation to be created
+      read_value: if True, will return something which evaluates to the
+        new value of the variable; if False will return the assign op.
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
       the assignment has completed.
     """
-    return state_ops.assign(self._variable, value, use_locking=use_locking)
+    assign = state_ops.assign(self._variable, value, use_locking=use_locking,
+                              name=name)
+    if read_value:
+      return assign
+    return assign.op
 
-  def assign_add(self, delta, use_locking=False):
+  def assign_add(self, delta, use_locking=False, name=None, read_value=True):
     """Adds a value to this variable.
 
      This is essentially a shortcut for `assign_add(self, delta)`.
@@ -1287,14 +1489,21 @@ class RefVariable(Variable):
     Args:
       delta: A `Tensor`. The value to add to this variable.
       use_locking: If `True`, use locking during the operation.
+      name: The name of the operation to be created
+      read_value: if True, will return something which evaluates to the
+        new value of the variable; if False will return the assign op.
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
       the addition has completed.
     """
-    return state_ops.assign_add(self._variable, delta, use_locking=use_locking)
+    assign = state_ops.assign_add(
+        self._variable, delta, use_locking=use_locking, name=name)
+    if read_value:
+      return assign
+    return assign.op
 
-  def assign_sub(self, delta, use_locking=False):
+  def assign_sub(self, delta, use_locking=False, name=None, read_value=True):
     """Subtracts a value from this variable.
 
     This is essentially a shortcut for `assign_sub(self, delta)`.
@@ -1302,22 +1511,27 @@ class RefVariable(Variable):
     Args:
       delta: A `Tensor`. The value to subtract from this variable.
       use_locking: If `True`, use locking during the operation.
+      name: The name of the operation to be created
+      read_value: if True, will return something which evaluates to the
+        new value of the variable; if False will return the assign op.
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
       the subtraction has completed.
     """
-    return state_ops.assign_sub(self._variable, delta, use_locking=use_locking)
+    assign = state_ops.assign_sub(
+        self._variable, delta, use_locking=use_locking, name=name)
+    if read_value:
+      return assign
+    return assign.op
 
-  def scatter_sub(self, sparse_delta, use_locking=False):
+  def scatter_sub(self, sparse_delta, use_locking=False, name=None):
     """Subtracts `IndexedSlices` from this variable.
 
-    This is essentially a shortcut for `scatter_sub(self, sparse_delta.indices,
-    sparse_delta.values)`.
-
     Args:
       sparse_delta: `IndexedSlices` to be subtracted from this variable.
       use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
 
     Returns:
       A `Tensor` that will hold the new value of this variable after
@@ -1328,11 +1542,216 @@ class RefVariable(Variable):
     """
     if not isinstance(sparse_delta, ops.IndexedSlices):
       raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
-    return state_ops.scatter_sub(
+    return gen_state_ops.scatter_sub(
         self._variable,
         sparse_delta.indices,
         sparse_delta.values,
-        use_locking=use_locking)
+        use_locking=use_locking,
+        name=name)
+
+  def scatter_add(self, sparse_delta, use_locking=False, name=None):
+    """Adds `IndexedSlices` from this variable.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be added to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    if not isinstance(sparse_delta, ops.IndexedSlices):
+      raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
+    return gen_state_ops.scatter_add(
+        self._variable,
+        sparse_delta.indices,
+        sparse_delta.values,
+        use_locking=use_locking,
+        name=name)
+
+  def scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    if not isinstance(sparse_delta, ops.IndexedSlices):
+      raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
+    return gen_state_ops.scatter_update(
+        self._variable,
+        sparse_delta.indices,
+        sparse_delta.values,
+        use_locking=use_locking,
+        name=name)
+
+  def scatter_nd_sub(self, indices, updates, name=None):
+    """Applies sparse subtraction to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        op = ref.scatter_nd_sub(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(op)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, -9, 3, -6, -6, 6, 7, -4]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return gen_state_ops.scatter_nd_sub(
+        self._variable, indices, updates, use_locking=True, name=name)
+
+  def scatter_nd_add(self, indices, updates, name=None):
+    """Applies sparse addition to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        add = ref.scatter_nd_add(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(add)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, 13, 3, 14, 14, 6, 7, 20]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return gen_state_ops.scatter_nd_add(
+        self._variable, indices, updates, use_locking=True, name=name)
+
+  def scatter_nd_update(self, indices, updates, name=None):
+    """Applies sparse assignment to individual values or slices in a Variable.
+
+    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+    `indices` must be integer tensor, containing indices into `ref`.
+    It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+    The innermost dimension of `indices` (with length `K`) corresponds to
+    indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+    dimension of `ref`.
+
+    `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+    ```
+    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    ```
+
+    For example, say we want to add 4 scattered elements to a rank-1 tensor to
+    8 elements. In Python, that update would look like this:
+
+    ```python
+        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        indices = tf.constant([[4], [3], [1] ,[7]])
+        updates = tf.constant([9, 10, 11, 12])
+        op = ref.scatter_nd_update(indices, updates)
+        with tf.Session() as sess:
+          print sess.run(op)
+    ```
+
+    The resulting update to ref would look like this:
+
+        [1, 11, 3, 10, 9, 6, 7, 12]
+
+    See `tf.scatter_nd` for more details about how to make updates to
+    slices.
+
+    Args:
+      indices: The indices to be used in the operation.
+      updates: The values to be used in the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return gen_state_ops.scatter_nd_update(
+        self._variable, indices, updates, use_locking=True, name=name)
 
   def _strided_slice_assign(self,
                             begin,
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 9ffb48c4a5..5dc4037d62 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -15,7 +15,7 @@
 
 """Testing.
 
-See the @{$python/test} guide.
+See the [Testing](https://tensorflow.org/api_guides/python/test) guide.
 
 Note: `tf.test.mock` is an alias to the python `mock` or `unittest.mock`
 depending on the python version.
diff --git a/tensorflow/python/saved_model/loader_test.py b/tensorflow/python/saved_model/loader_test.py
index 9a0b276a4b..b7e217a35b 100644
--- a/tensorflow/python/saved_model/loader_test.py
+++ b/tensorflow/python/saved_model/loader_test.py
@@ -79,13 +79,13 @@ class SavedModelLoaderTest(test.TestCase):
 
   def test_load_function(self):
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo_graph"])
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
 
     loader2 = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader2.load(sess, ["foo_graph"])
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("y:0").eval())
@@ -101,7 +101,7 @@ class SavedModelLoaderTest(test.TestCase):
     with self.assertRaises(KeyError):
       graph.get_tensor_by_name("z:0")
 
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       # Check that x and y are not initialized
       with self.assertRaises(errors.FailedPreconditionError):
         sess.run(x)
@@ -110,7 +110,7 @@ class SavedModelLoaderTest(test.TestCase):
 
   def test_load_with_import_scope(self):
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       saver, _ = loader.load_graph(
           sess.graph, ["foo_graph"], import_scope="baz")
 
@@ -126,14 +126,14 @@ class SavedModelLoaderTest(test.TestCase):
 
     # Test combined load function.
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo_graph"], import_scope="baa")
       self.assertEqual(5, sess.graph.get_tensor_by_name("baa/x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("baa/y:0").eval())
 
   def test_restore_variables(self):
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       x = variables.Variable(0, name="x")
       y = variables.Variable(0, name="y")
       z = x * y
@@ -151,7 +151,7 @@ class SavedModelLoaderTest(test.TestCase):
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     graph = ops.Graph()
     saver, _ = loader.load_graph(graph, ["foo_graph"])
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       loader.restore_variables(sess, saver)
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
@@ -203,12 +203,12 @@ class SavedModelLoaderTest(test.TestCase):
       builder.save()
 
     loader = loader_impl.SavedModelLoader(path)
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       saver, _ = loader.load_graph(sess.graph, ["foo_graph"])
       self.assertFalse(variables._all_saveable_objects())
       self.assertIsNotNone(saver)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo_graph"])
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 00b669fc97..49d52d3bee 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -97,7 +97,7 @@ class SavedModelTest(test.TestCase):
     self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
 
   def _validate_inputs_tensor_info_fail(self, builder, tensor_info):
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       foo_signature = signature_def_utils.build_signature_def({
@@ -110,7 +110,7 @@ class SavedModelTest(test.TestCase):
           signature_def_map={"foo_key": foo_signature})
 
   def _validate_inputs_tensor_info_accept(self, builder, tensor_info):
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       foo_signature = signature_def_utils.build_signature_def({
@@ -121,7 +121,7 @@ class SavedModelTest(test.TestCase):
           signature_def_map={"foo_key": foo_signature})
 
   def _validate_outputs_tensor_info_fail(self, builder, tensor_info):
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       foo_signature = signature_def_utils.build_signature_def(
@@ -133,7 +133,7 @@ class SavedModelTest(test.TestCase):
           signature_def_map={"foo_key": foo_signature})
 
   def _validate_outputs_tensor_info_accept(self, builder, tensor_info):
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       foo_signature = signature_def_utils.build_signature_def(
@@ -153,7 +153,7 @@ class SavedModelTest(test.TestCase):
   def testBadSavedModelFileFormat(self):
     export_dir = self._get_export_dir("test_bad_saved_model_file_format")
     # Attempt to load a SavedModel from an export directory that does not exist.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       with self.assertRaisesRegexp(IOError,
                                    "SavedModel file does not exist at: %s" %
                                    export_dir):
@@ -164,7 +164,7 @@ class SavedModelTest(test.TestCase):
     path_to_pb = os.path.join(export_dir, constants.SAVED_MODEL_FILENAME_PB)
     with open(path_to_pb, "w") as f:
       f.write("invalid content")
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       with self.assertRaisesRegexp(IOError, "Cannot parse file.*%s" %
                                    constants.SAVED_MODEL_FILENAME_PB):
         loader.load(sess, ["foo"], export_dir)
@@ -178,7 +178,7 @@ class SavedModelTest(test.TestCase):
                                  constants.SAVED_MODEL_FILENAME_PBTXT)
     with open(path_to_pbtxt, "w") as f:
       f.write("invalid content")
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       with self.assertRaisesRegexp(IOError, "Cannot parse file.*%s" %
                                    constants.SAVED_MODEL_FILENAME_PBTXT):
         loader.load(sess, ["foo"], export_dir)
@@ -187,7 +187,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_verify_session_graph_usage")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
 
@@ -209,12 +209,12 @@ class SavedModelTest(test.TestCase):
 
     # Expect an assertion error since add_meta_graph_and_variables() should be
     # invoked before any add_meta_graph() calls.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self.assertRaises(AssertionError, builder.add_meta_graph, ["foo"])
 
     # Expect an assertion error for multiple calls of
     # add_meta_graph_and_variables() since weights should be saved exactly once.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       builder.add_meta_graph_and_variables(sess, ["bar"])
       self.assertRaises(AssertionError, builder.add_meta_graph_and_variables,
@@ -227,35 +227,35 @@ class SavedModelTest(test.TestCase):
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
     # - a single tag (from predefined constants).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
 
     # Graph that updates the single variable. SavedModel invoked to:
     # - simply add the model (weights are not updated).
     # - a single tag (from predefined constants).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 43)
       builder.add_meta_graph([tag_constants.SERVING])
 
     # Graph that updates the single variable. SavedModel invoked to:
     # - simply add the model (weights are not updated).
     # - multiple tags (from predefined constants).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 45)
       builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
 
     # Graph that updates the single variable. SavedModel invoked to:
     # - simply add the model (weights are not updated).
     # - multiple tags (from predefined constants for serving on TPU).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 45)
       builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
 
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple custom tags.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 44)
       builder.add_meta_graph(["foo", "bar"])
 
@@ -263,49 +263,49 @@ class SavedModelTest(test.TestCase):
     builder.save()
 
     # Restore the graph with a single predefined tag whose variables were saved.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, [tag_constants.TRAINING], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
     # Restore the graph with a single predefined tag whose variables were not
     # saved.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, [tag_constants.SERVING], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
     # Restore the graph with multiple predefined tags whose variables were not
     # saved.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, [tag_constants.SERVING, tag_constants.GPU], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
     # Restore the graph with multiple predefined tags (for serving on TPU)
     # whose variables were not saved.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, [tag_constants.SERVING, tag_constants.TPU], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
     # Restore the graph with multiple tags. Provide duplicate tags to test set
     # semantics.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo", "bar", "foo"], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
     # Try restoring a graph with a non-existent tag. This should yield a runtime
     # error.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self.assertRaises(RuntimeError, loader.load, sess, ["INVALID"],
                         export_dir)
 
     # Try restoring a graph where a subset of the tags match. Since tag matching
     # for meta graph defs follows "all" semantics, this should yield a runtime
     # error.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self.assertRaises(RuntimeError, loader.load, sess, ["foo", "baz"],
                         export_dir)
 
@@ -315,7 +315,7 @@ class SavedModelTest(test.TestCase):
 
     # Graph with two variables. SavedModel invoked to:
     # - add with weights.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v1", 1)
       self._init_and_validate_variable(sess, "v2", 2)
       builder.add_meta_graph_and_variables(sess, ["foo"])
@@ -323,14 +323,14 @@ class SavedModelTest(test.TestCase):
     # Graph with a single variable (subset of the variables from the previous
     # graph whose weights were saved). SavedModel invoked to:
     # - simply add the model (weights are not updated).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v2", 3)
       builder.add_meta_graph(["bar"])
 
     # Graph with a single variable (disjoint set of variables from the previous
     # graph whose weights were saved). SavedModel invoked to:
     # - simply add the model (weights are not updated).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v3", 4)
       builder.add_meta_graph(["baz"])
 
@@ -338,7 +338,7 @@ class SavedModelTest(test.TestCase):
     builder.save()
 
     # Restore the graph with tag "foo", whose variables were saved.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
       self.assertEqual(len(collection_vars), 2)
@@ -348,7 +348,7 @@ class SavedModelTest(test.TestCase):
     # Restore the graph with tag "bar", whose variables were not saved. Only the
     # subset of the variables added to the graph will be restored with the
     # checkpointed value.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["bar"], export_dir)
       collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
       self.assertEqual(len(collection_vars), 1)
@@ -357,7 +357,7 @@ class SavedModelTest(test.TestCase):
     # Try restoring the graph with tag "baz", whose variables were not saved.
     # Since this graph has a disjoint set of variables from the set that was
     # saved, this should raise an error.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self.assertRaises(errors.NotFoundError, loader.load, sess, ["baz"],
                         export_dir)
 
@@ -366,12 +366,12 @@ class SavedModelTest(test.TestCase):
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with no variables.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       constant_5_name = constant_op.constant(5.0).name
       builder.add_meta_graph_and_variables(sess, ["foo"])
 
     # Second graph with no variables
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       constant_6_name = constant_op.constant(6.0).name
       builder.add_meta_graph(["bar"])
 
@@ -379,7 +379,7 @@ class SavedModelTest(test.TestCase):
     builder.save()
 
     # Restore the graph with tag "foo".
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       # Read the constant a from the graph.
       a = ops.get_default_graph().get_tensor_by_name(constant_5_name)
@@ -388,7 +388,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(30.0, sess.run(c))
 
     # Restore the graph with tag "bar".
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["bar"], export_dir)
       # Read the constant a from the graph.
       a = ops.get_default_graph().get_tensor_by_name(constant_6_name)
@@ -402,7 +402,7 @@ class SavedModelTest(test.TestCase):
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       builder.add_meta_graph_and_variables(sess, ["foo"])
 
@@ -410,7 +410,7 @@ class SavedModelTest(test.TestCase):
     builder.save(as_text=True)
 
     # Restore the graph with tag "foo", whose variables were saved.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
@@ -426,13 +426,13 @@ class SavedModelTest(test.TestCase):
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       builder.add_meta_graph_and_variables(sess, ["foo"])
 
     # Graph with the same single variable. SavedModel invoked to:
     # - simply add the model (weights are not updated).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 43)
       builder.add_meta_graph(["bar"])
 
@@ -440,13 +440,13 @@ class SavedModelTest(test.TestCase):
     builder.save(as_text=True)
 
     # Restore the graph with tag "foo", whose variables were saved.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
     # Restore the graph with tag "bar", whose variables were not saved.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["bar"], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
@@ -457,7 +457,7 @@ class SavedModelTest(test.TestCase):
 
     # Graph with a single variable added to a collection. SavedModel invoked to:
     # - add with weights.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v = variables.Variable(42, name="v")
       ops.add_to_collection("foo_vars", v)
       sess.run(variables.global_variables_initializer())
@@ -467,7 +467,7 @@ class SavedModelTest(test.TestCase):
     # Graph with the same single variable added to a different collection.
     # SavedModel invoked to:
     # - simply add the model (weights are not updated).
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       v = variables.Variable(43, name="v")
       ops.add_to_collection("bar_vars", v)
       sess.run(variables.global_variables_initializer())
@@ -480,7 +480,7 @@ class SavedModelTest(test.TestCase):
     # Restore the graph with tag "foo", whose variables were saved. The
     # collection 'foo_vars' should contain a single element. The collection
     # 'bar_vars' should not be found.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       collection_foo_vars = ops.get_collection("foo_vars")
       self.assertEqual(len(collection_foo_vars), 1)
@@ -493,7 +493,7 @@ class SavedModelTest(test.TestCase):
     # reflect the new collection. The value of the variable in the
     # collection-def corresponds to the saved value (from the previous graph
     # with tag "foo").
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["bar"], export_dir)
       collection_bar_vars = ops.get_collection("bar_vars")
       self.assertEqual(len(collection_bar_vars), 1)
@@ -507,7 +507,7 @@ class SavedModelTest(test.TestCase):
 
     # Graph with a single variable and a single entry in the signature def map.
     # SavedModel is invoked to add with weights.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       # Build and populate an empty SignatureDef for testing.
       foo_signature = signature_def_utils.build_signature_def(dict(),
@@ -517,7 +517,7 @@ class SavedModelTest(test.TestCase):
 
     # Graph with the same single variable and multiple entries in the signature
     # def map. No weights are saved by SavedModel.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 43)
       # Build and populate a different SignatureDef for testing.
       bar_signature = signature_def_utils.build_signature_def(dict(),
@@ -539,7 +539,7 @@ class SavedModelTest(test.TestCase):
 
     # Restore the graph with tag "foo". The single entry in the SignatureDef map
     # corresponding to "foo_key" should exist.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
@@ -551,7 +551,7 @@ class SavedModelTest(test.TestCase):
     # Restore the graph with tag "bar". The SignatureDef map should have two
     # entries. One corresponding to "bar_key" and another corresponding to the
     # new value of "foo_key".
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       bar_graph = loader.load(sess, ["bar"], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
@@ -610,7 +610,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection.
@@ -628,7 +628,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
       self._validate_asset_collection(export_dir, foo_graph.collection_def,
                                       "hello42.txt", "foo bar baz",
@@ -643,7 +643,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_assets_name_collision_diff_file")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       asset_collection = self._build_asset_collection(
@@ -660,7 +660,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
       self._validate_asset_collection(export_dir, foo_graph.collection_def,
                                       "hello42.txt", "foo bar bak",
@@ -674,7 +674,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_assets_name_collision_same_path")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       asset_collection = self._build_asset_collection(
@@ -689,7 +689,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
       self._validate_asset_collection(export_dir, foo_graph.collection_def,
                                       "hello42.txt", "foo bar baz",
@@ -709,7 +709,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_assets_name_collision_same_file")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       asset_collection = self._build_asset_collection(
@@ -726,7 +726,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
       self._validate_asset_collection(export_dir, foo_graph.collection_def,
                                       "hello42.txt", "foo bar baz",
@@ -746,7 +746,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_assets_name_collision_many_files")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       for i in range(5):
@@ -761,7 +761,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
       for i in range(1, 5):
         idx = str(i)
@@ -778,7 +778,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_main_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
       v1 = variables.Variable(1, name="v1")
       ops.add_to_collection("v", v1)
@@ -801,7 +801,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       self.assertEqual(1, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
@@ -813,7 +813,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_legacy_init_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
       v1 = variables.Variable(1, name="v1")
       ops.add_to_collection("v", v1)
@@ -835,7 +835,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       self.assertEqual(1, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
@@ -858,7 +858,7 @@ class SavedModelTest(test.TestCase):
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     g = ops.Graph()
-    with self.test_session(graph=g) as sess:
+    with self.session(graph=g) as sess:
       # Initialize variable `v1` to 1.
       v1 = variables.Variable(1, name="v1")
       ops.add_to_collection("v", v1)
@@ -887,7 +887,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_train_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
       v1 = variables.Variable(1, name="v1")
       ops.add_to_collection("v", v1)
@@ -905,7 +905,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       self.assertEqual(3, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
@@ -916,7 +916,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_train_op_group")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
       v1 = variables.Variable(1, name="v1")
       ops.add_to_collection("v", v1)
@@ -934,7 +934,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       self.assertEqual(1, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
@@ -945,7 +945,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_train_op_after_variables")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
       v1 = variables.Variable(1, name="v1")
       ops.add_to_collection("v", v1)
@@ -964,12 +964,12 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
       self.assertIsInstance(
           ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Tensor)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["pre_foo"], export_dir)
       self.assertFalse(ops.get_collection(constants.TRAIN_OP_KEY))
 
@@ -977,7 +977,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_multiple_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection specific to `foo` graph.
@@ -988,7 +988,7 @@ class SavedModelTest(test.TestCase):
       builder.add_meta_graph_and_variables(
           sess, ["foo"], assets_collection=asset_collection)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection specific to `bar` graph.
@@ -1002,14 +1002,14 @@ class SavedModelTest(test.TestCase):
     builder.save()
 
     # Check assets restored for graph with tag "foo".
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
       self._validate_asset_collection(export_dir, foo_graph.collection_def,
                                       "foo.txt", "content_foo",
                                       "asset_file_tensor:0")
 
     # Check assets restored for graph with tag "bar".
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       bar_graph = loader.load(sess, ["bar"], export_dir)
       self._validate_asset_collection(export_dir, bar_graph.collection_def,
                                       "bar.txt", "content_bar",
@@ -1019,7 +1019,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_duplicate_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection with `foo.txt` that has `foo` specific
@@ -1031,7 +1031,7 @@ class SavedModelTest(test.TestCase):
       builder.add_meta_graph_and_variables(
           sess, ["foo"], assets_collection=asset_collection)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection with `foo.txt` that has `bar` specific
@@ -1046,14 +1046,14 @@ class SavedModelTest(test.TestCase):
     builder.save()
 
     # Check assets restored for graph with tag "foo".
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
       self._validate_asset_collection(export_dir, foo_graph.collection_def,
                                       "foo.txt", "content_foo",
                                       "asset_file_tensor:0")
 
     # Check assets restored for graph with tag "bar".
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       bar_graph = loader.load(sess, ["bar"], export_dir)
 
       # Validate the assets for `bar` graph. `foo.txt` should contain the
@@ -1139,7 +1139,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_custom_saver")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       variables.Variable(1, name="v1")
       sess.run(variables.global_variables_initializer())
       custom_saver = training.Saver(name="my_saver")
@@ -1149,7 +1149,7 @@ class SavedModelTest(test.TestCase):
     builder.save()
 
     with ops.Graph().as_default() as graph:
-      with self.test_session(graph=graph) as sess:
+      with self.session(graph=graph) as sess:
         saved_graph = loader.load(sess, ["tag"], export_dir)
         graph_ops = [x.name for x in graph.get_operations()]
         self.assertTrue("my_saver/restore_all" in graph_ops)
@@ -1161,7 +1161,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_no_custom_saver")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       variables.Variable(1, name="v1")
       sess.run(variables.global_variables_initializer())
       training.Saver(name="my_saver")
@@ -1171,7 +1171,7 @@ class SavedModelTest(test.TestCase):
     builder.save()
 
     with ops.Graph().as_default() as graph:
-      with self.test_session(graph=graph) as sess:
+      with self.session(graph=graph) as sess:
         saved_graph = loader.load(sess, ["tag"], export_dir)
         graph_ops = [x.name for x in graph.get_operations()]
         self.assertTrue("my_saver/restore_all" in graph_ops)
@@ -1183,7 +1183,7 @@ class SavedModelTest(test.TestCase):
     export_dir = self._get_export_dir("test_multiple_custom_savers")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       variables.Variable(1, name="v1")
       sess.run(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["tag_0"])
@@ -1199,7 +1199,7 @@ class SavedModelTest(test.TestCase):
 
     def _validate_custom_saver(tag_name, saver_name):
       with ops.Graph().as_default() as graph:
-        with self.test_session(graph=graph) as sess:
+        with self.session(graph=graph) as sess:
           saved_graph = loader.load(sess, [tag_name], export_dir)
           self.assertEqual(
               saved_graph.saver_def.restore_op_name,
@@ -1214,7 +1214,7 @@ class SavedModelTest(test.TestCase):
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Build a SavedModel with a variable, an asset, and a constant tensor.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
       asset_collection = self._build_asset_collection("foo.txt", "content_foo",
                                                       "asset_file_tensor")
@@ -1228,7 +1228,7 @@ class SavedModelTest(test.TestCase):
     # Save the SavedModel to disk.
     builder.save()
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       # Restore the SavedModel under an import_scope in a new graph/session.
       graph_proto = loader.load(
           sess, ["tag_name"], export_dir, import_scope="scope_name")
@@ -1281,7 +1281,7 @@ class SavedModelTest(test.TestCase):
 
     # Restore the graph with a single predefined tag whose variables were saved
     # without any device information.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, [tag_constants.TRAINING], export_dir)
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
diff --git a/tensorflow/python/saved_model/simple_save_test.py b/tensorflow/python/saved_model/simple_save_test.py
index b2fa40d4f1..18f82daada 100644
--- a/tensorflow/python/saved_model/simple_save_test.py
+++ b/tensorflow/python/saved_model/simple_save_test.py
@@ -60,7 +60,7 @@ class SimpleSaveTest(test.TestCase):
 
     # Initialize input and output variables and save a prediction graph using
     # the default parameters.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       var_x = self._init_and_validate_variable(sess, "var_x", 1)
       var_y = self._init_and_validate_variable(sess, "var_y", 2)
       inputs = {"x": var_x}
@@ -69,7 +69,7 @@ class SimpleSaveTest(test.TestCase):
 
     # Restore the graph with a valid tag and check the global variables and
     # signature def map.
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       graph = loader.load(sess, [tag_constants.SERVING], export_dir)
       collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
 
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index f3a6d47500..fbae2b77fa 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -15,7 +15,7 @@
 
 """Tensor summaries for exporting information about a model.
 
-See the @{$python/summary} guide.
+See the [Summary](https://tensorflow.org/api_guides/python/summary) guide.
 """
 
 from __future__ import absolute_import
@@ -268,7 +268,7 @@ def merge(inputs, collections=None, name=None):
   @compatibility(eager)
   Not compatible with eager execution. To write TensorBoard
   summaries under eager execution, use `tf.contrib.summary` instead.
-  @end_compatbility
+  @end_compatibility
   """
   # pylint: enable=line-too-long
   if _context.executing_eagerly():
@@ -304,7 +304,7 @@ def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None, name=None):
   @compatibility(eager)
   Not compatible with eager execution. To write TensorBoard
   summaries under eager execution, use `tf.contrib.summary` instead.
-  @end_compatbility
+  @end_compatibility
   """
   if _context.executing_eagerly():
     raise RuntimeError(
@@ -336,7 +336,7 @@ def get_summary_description(node_def):
   @compatibility(eager)
   Not compatible with eager execution. To write TensorBoard
   summaries under eager execution, use `tf.contrib.summary` instead.
-  @end_compatbility
+  @end_compatibility
   """
 
   if node_def.op != 'TensorSummary':
diff --git a/tensorflow/python/summary/summary_test.py b/tensorflow/python/summary/summary_test.py
index eb9dbf9645..ac5eb4dbbe 100644
--- a/tensorflow/python/summary/summary_test.py
+++ b/tensorflow/python/summary/summary_test.py
@@ -32,7 +32,7 @@ from tensorflow.python.summary import summary as summary_lib
 class ScalarSummaryTest(test.TestCase):
 
   def testScalarSummary(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       i = constant_op.constant(3)
       with ops.name_scope('outer'):
         im = summary_lib.scalar('inner', i)
@@ -45,7 +45,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(values[0].simple_value, 3.0)
 
   def testScalarSummaryWithFamily(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       i = constant_op.constant(7)
       with ops.name_scope('outer'):
         im1 = summary_lib.scalar('inner', i, family='family')
@@ -68,7 +68,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(values[0].simple_value, 7.0)
 
   def testSummarizingVariable(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       c = constant_op.constant(42.0)
       v = variables.Variable(c)
       ss = summary_lib.scalar('summary', v)
@@ -83,7 +83,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(value.simple_value, 42.0)
 
   def testImageSummary(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
       with ops.name_scope('outer'):
         im = summary_lib.image('inner', i, max_outputs=3)
@@ -97,7 +97,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(tags, expected)
 
   def testImageSummaryWithFamily(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       i = array_ops.ones((5, 2, 3, 1))
       with ops.name_scope('outer'):
         im = summary_lib.image('inner', i, max_outputs=3, family='family')
@@ -113,7 +113,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(tags, expected)
 
   def testHistogramSummary(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
       with ops.name_scope('outer'):
         summ_op = summary_lib.histogram('inner', i)
@@ -124,7 +124,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(summary.value[0].tag, 'outer/inner')
 
   def testHistogramSummaryWithFamily(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
       with ops.name_scope('outer'):
         summ_op = summary_lib.histogram('inner', i, family='family')
@@ -136,7 +136,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(summary.value[0].tag, 'family/outer/family/inner')
 
   def testAudioSummary(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       i = array_ops.ones((5, 3, 4))
       with ops.name_scope('outer'):
         aud = summary_lib.audio('inner', i, 0.2, max_outputs=3)
@@ -150,7 +150,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(tags, expected)
 
   def testAudioSummaryWithFamily(self):
-    with self.test_session() as s:
+    with self.cached_session() as s:
       i = array_ops.ones((5, 3, 4))
       with ops.name_scope('outer'):
         aud = summary_lib.audio('inner', i, 0.2, max_outputs=3, family='family')
@@ -194,7 +194,7 @@ class ScalarSummaryTest(test.TestCase):
       new_summ_f = g.get_tensor_by_name('new_outer/family/inner:0')
 
       # However, the tags are unaffected.
-      with self.test_session() as s:
+      with self.cached_session() as s:
         new_summ_str, new_summ_f_str = s.run([new_summ, new_summ_f])
         new_summ_pb = summary_pb2.Summary()
         new_summ_pb.ParseFromString(new_summ_str)
diff --git a/tensorflow/python/summary/text_summary_test.py b/tensorflow/python/summary/text_summary_test.py
index 4d357918f6..5b0db43cc1 100644
--- a/tensorflow/python/summary/text_summary_test.py
+++ b/tensorflow/python/summary/text_summary_test.py
@@ -33,7 +33,7 @@ class TextPluginTest(test_util.TensorFlowTestCase):
   """
 
   def testTextSummaryAPI(self):
-    with self.test_session():
+    with self.cached_session():
 
       with self.assertRaises(ValueError):
         num = array_ops.constant(1)
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 861a3e920d..16b8626476 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -352,7 +352,7 @@ class FileWriter(SummaryToEventTransformer):
     @compatibility(eager)
     `FileWriter` is not compatible with eager execution. To write TensorBoard
     summaries under eager execution, use `tf.contrib.summary` instead.
-    @end_compatbility
+    @end_compatibility
     """
     if context.executing_eagerly():
       raise RuntimeError(
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 222f856511..01d43e09d1 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -114,6 +114,12 @@ py_library(
     ],
 )
 
+py_library(
+    name = "component_api_helper",
+    srcs = ["component_api_helper.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_binary(
     name = "strip_unused",
     srcs = ["strip_unused.py"],
diff --git a/tensorflow/python/tools/api/generator/BUILD b/tensorflow/python/tools/api/generator/BUILD
index f87fdb2d88..36af091163 100644
--- a/tensorflow/python/tools/api/generator/BUILD
+++ b/tensorflow/python/tools/api/generator/BUILD
@@ -14,14 +14,13 @@ exports_files(
     ],
 )
 
-py_binary(
+py_library(
     name = "create_python_api",
     srcs = ["//tensorflow/python/tools/api/generator:create_python_api.py"],
-    main = "//tensorflow/python/tools/api/generator:create_python_api.py",
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/python:no_contrib",
+        "//tensorflow/python:util",
         "//tensorflow/python/tools/api/generator:doc_srcs",
     ],
 )
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 7001e566ce..64f0469482 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -25,6 +25,7 @@ TENSORFLOW_API_INIT_FILES = [
     "keras/applications/inception_resnet_v2/__init__.py",
     "keras/applications/inception_v3/__init__.py",
     "keras/applications/mobilenet/__init__.py",
+    "keras/applications/mobilenet_v2/__init__.py",
     "keras/applications/nasnet/__init__.py",
     "keras/applications/resnet50/__init__.py",
     "keras/applications/vgg16/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index 73d11199d9..bc2f3516d1 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -25,6 +25,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "keras/applications/inception_resnet_v2/__init__.py",
     "keras/applications/inception_v3/__init__.py",
     "keras/applications/mobilenet/__init__.py",
+    "keras/applications/mobilenet_v2/__init__.py",
     "keras/applications/nasnet/__init__.py",
     "keras/applications/resnet50/__init__.py",
     "keras/applications/vgg16/__init__.py",
diff --git a/tensorflow/python/tools/component_api_helper.py b/tensorflow/python/tools/component_api_helper.py
new file mode 100644
index 0000000000..988ecc61f0
--- /dev/null
+++ b/tensorflow/python/tools/component_api_helper.py
@@ -0,0 +1,85 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helper functions to help integrate TensorFlow components into TF API.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import os
+
+
+def package_hook(parent_package_str, child_package_str, error_msg=None):
+  """Used to hook in an external package into the TensorFlow namespace.
+
+  Example usage:
+  ### tensorflow/__init__.py
+  from tensorflow.python.tools import component_api_helper
+  component_api_helper.package_hook(
+      'tensorflow', 'tensorflow_estimator.python')
+  component_api_helper(
+      'tensorflow.contrib', 'tensorflow_estimator.contrib.python')
+  del component_api_helper
+
+  TODO(mikecase): This function has a minor issue, where if the child package
+  does not exist alone in its directory, sibling packages to it will also be
+  accessible from the parent. This is because we just add
+  `child_pkg.__file__/..` to the subpackage search path. This should not be
+  a big issue because of how our API generation scripts work (the child package
+  we are hooking up should always be alone). But there might be a better way
+  of doing this.
+
+  Args:
+    parent_package_str: Parent package name as a string such as 'tensorflow' or
+      'tensorflow.contrib'. This will become the parent package for the
+      component package being hooked in.
+    child_package_str: Child package name as a string such as
+      'tensorflow_estimator.python'. This package will be added as a subpackage
+      of the parent.
+    error_msg: Message to print if child package cannot be found.
+  """
+  parent_pkg = importlib.import_module(parent_package_str)
+  try:
+    child_pkg = importlib.import_module(child_package_str)
+  except ImportError:
+    if error_msg:
+      print(error_msg)
+    return
+
+  def set_child_as_subpackage():
+    """Sets child package as a subpackage of parent package.
+
+    Will allow the following import statement to work.
+    >>> import parent.child
+    """
+    child_pkg_path = [os.path.join(os.path.dirname(child_pkg.__file__), "..")]
+    try:
+      parent_pkg.__path__ += child_pkg_path
+    except AttributeError:
+      parent_pkg.__path__ = child_pkg_path
+
+  def set_child_as_attr():
+    """Sets child package as a attr of the parent package.
+
+    Will allow for the following.
+    >>> import parent
+    >>> parent.child
+    """
+    child_pkg_attr_name = child_pkg.__name__.split(".")[-1]
+    setattr(parent_pkg, child_pkg_attr_name, child_pkg)
+
+  set_child_as_subpackage()
+  set_child_as_attr()
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index acf070075e..c7f414c5dc 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -59,7 +59,7 @@ from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 
 
-def _has_variables(sess):
+def _has_no_variables(sess):
   """Determines if the graph has any variables.
 
   Args:
@@ -168,7 +168,7 @@ def freeze_graph_with_def_protos(input_graph_def,
                 "the flag --input_saved_model_dir.")
           return -1
         # Models that have been frozen previously do not contain Variables.
-        elif _has_variables(sess):
+        elif _has_no_variables(sess):
           print("No variables were found in this model. It is likely the model "
                 "was frozen previously. You cannot freeze a graph twice.")
           return 0
diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py
index 6778f3c735..3508b98475 100644
--- a/tensorflow/python/training/adagrad.py
+++ b/tensorflow/python/training/adagrad.py
@@ -70,20 +70,24 @@ class AdagradOptimizer(optimizer.Optimizer):
 
   def _create_slots(self, var_list):
     for v in var_list:
-      with ops.colocate_with(v):
-        dtype = v.dtype.base_dtype
-        if v.get_shape().is_fully_defined():
-          init = init_ops.constant_initializer(self._initial_accumulator_value,
-                                               dtype=dtype)
-        else:
-          # Use a Tensor instead of initializer if variable does not have static
-          # shape.
-          init_constant = gen_array_ops.fill(array_ops.shape(v),
-                                             self._initial_accumulator_value)
-          init = math_ops.cast(init_constant, dtype)
+      dtype = v.dtype.base_dtype
+      if v.get_shape().is_fully_defined():
+        init = init_ops.constant_initializer(self._initial_accumulator_value,
+                                             dtype=dtype)
+      else:
+        init = self._init_constant_op(v, dtype)
       self._get_or_make_slot_with_initializer(v, init, v.get_shape(), dtype,
                                               "accumulator", self._name)
 
+  def _init_constant_op(self, v, dtype):
+    def init():
+      # Use a Tensor instead of initializer if variable does not have
+      # static shape.
+      init_constant = gen_array_ops.fill(array_ops.shape(v),
+                                         self._initial_accumulator_value)
+      return math_ops.cast(init_constant, dtype)
+    return init
+
   def _prepare(self):
     learning_rate = self._call_if_callable(self._learning_rate)
     self._learning_rate_tensor = ops.convert_to_tensor(
diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py
index c9aec33d09..4e634fff84 100644
--- a/tensorflow/python/training/adagrad_test.py
+++ b/tensorflow/python/training/adagrad_test.py
@@ -302,6 +302,39 @@ class AdagradOptimizerTest(test.TestCase):
       # Creating optimizer should cause no exception.
       adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
 
+  def testDynamicShapeVariableWithCallableInit(self):
+    var0 = variable_scope.get_variable("var0",
+                                       initializer=constant_op.constant(1.),
+                                       validate_shape=False)
+    self.assertFalse(var0.shape.is_fully_defined())
+
+    grads0 = constant_op.constant(0.1, dtype=dtypes.float32)
+    learning_rate = lambda: 3.0
+
+    ada_opt = adagrad.AdagradOptimizer(
+        learning_rate, initial_accumulator_value=0.1, use_locking=True)
+
+    if not context.executing_eagerly():
+      ada_update = ada_opt.apply_gradients(
+          zip([grads0], [var0]))
+      self.evaluate(variables.global_variables_initializer())
+
+    # Fetch params to validate initial values
+    v0_val = self.evaluate([var0])
+    self.assertAllClose([1.0], v0_val)
+
+    # Run 3 steps of adagrad
+    for _ in range(3):
+      if not context.executing_eagerly():
+        self.evaluate(ada_update)
+      else:
+        ada_opt.apply_gradients(zip([grads0], [var0]))
+
+    # Validate updated params
+    v0_val = self.evaluate([var0])
+    self.assertAllCloseAccordingToType(
+        np.array([-1.6026098728179932]), v0_val)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index 8f84427654..778c672077 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -152,7 +152,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def doTestBasic(self, use_resource=False, use_callable_params=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      with self.test_session(graph=ops.Graph()):
+      with self.session(graph=ops.Graph()):
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index 9a90f91a7c..38910fb246 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -33,7 +33,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
+from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -508,7 +510,10 @@ class CheckpointManager(object):
       max_to_keep: An integer, the number of checkpoints to keep. Unless
         preserved by `keep_checkpoint_every_n_hours`, checkpoints will be
         deleted from the active set, oldest first, until only `max_to_keep`
-        checkpoints remain.
+        checkpoints remain. If `None`, no checkpoints are deleted and everything
+        stays in the active set. Note that `max_to_keep=None` will keep all
+        checkpoint paths in memory and in the checkpoint state protocol buffer
+        on disk.
       keep_checkpoint_every_n_hours: Upon removal from the active set, a
         checkpoint will be preserved if it has been at least
         `keep_checkpoint_every_n_hours` since the last preserved checkpoint. The
@@ -519,9 +524,10 @@ class CheckpointManager(object):
     """
     self._checkpoint = checkpoint
     self._save_counter_assign = None
-    if not max_to_keep or max_to_keep < 0:
+    if max_to_keep is not None and max_to_keep <= 0:
       raise ValueError(
-          "Expected a positive integer for `max_to_max_to_keep`, got %d."
+          ("Expected a positive integer or `None` for `max_to_max_to_keep`, "
+           "got %d.")
           % (max_to_keep,))
     self._max_to_keep = max_to_keep
     self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
@@ -532,7 +538,9 @@ class CheckpointManager(object):
     self._maybe_delete = collections.OrderedDict()
     if recovered_state is None:
       self._latest_checkpoint = None
-      self._last_preserved_timestamp = current_clock
+      # Set the clock back slightly to avoid race conditions when quckly
+      # re-creating a CheckpointManager.
+      self._last_preserved_timestamp = current_clock - 1.
     else:
       self._latest_checkpoint = recovered_state.model_checkpoint_path
       self._last_preserved_timestamp = recovered_state.last_preserved_timestamp
@@ -584,6 +592,10 @@ class CheckpointManager(object):
 
   def _sweep(self):
     """Deletes or preserves managed checkpoints."""
+    if not self._max_to_keep:
+      # Does not update self._last_preserved_timestamp, since everything is kept
+      # in the active set.
+      return
     while len(self._maybe_delete) > self._max_to_keep:
       filename, timestamp = self._maybe_delete.popitem(last=False)
       # Even if we're keeping this checkpoint due to
@@ -622,13 +634,19 @@ class CheckpointManager(object):
     """
     return self._checkpoint_prefix
 
-  def save(self, session=None):
+  def save(self, session=None, checkpoint_number=None):
     """Creates a new checkpoint and manages it.
 
     Args:
       session: The session to evaluate variables in. Ignored when executing
         eagerly. If not provided when graph building, the default session is
         used.
+      checkpoint_number: An optional integer, or an integer-dtype `Variable` or
+        `Tensor`, used to number the checkpoint. If `None` (default),
+        checkpoints are numbered using `checkpoint.save_counter`. Even if
+        `checkpoint_number` is provided, `save_counter` is still incremented. A
+        user-provided `checkpoint_number` is not incremented even if it is a
+        `Variable`.
 
     Returns:
       The path to the new checkpoint. It is also recorded in the `checkpoints`
@@ -639,7 +657,6 @@ class CheckpointManager(object):
     if context.executing_eagerly():
       save_counter = self._checkpoint.save_counter
       save_counter.assign_add(1)
-      checkpoint_number = save_counter.numpy()
     else:
       if session is None:
         session = ops.get_default_session()
@@ -653,8 +670,13 @@ class CheckpointManager(object):
       with variable_scope.variable_creator_scope(_initializing_creator):
         save_counter = self._checkpoint.save_counter
       if self._save_counter_assign is None:
-        self._save_counter_assign = save_counter.assign_add(1, read_value=True)
-      checkpoint_number = session.run(self._save_counter_assign)
+        self._save_counter_assign = save_counter.assign_add(1, read_value=False)
+      session.run(self._save_counter_assign)
+    if checkpoint_number is None:
+      checkpoint_number = save_counter
+    if not isinstance(checkpoint_number, compat.integral_types):
+      checkpoint_number = training_util.global_step(
+          sess=session, global_step_tensor=checkpoint_number)
     prefix = "%s-%d" % (self._prefix, checkpoint_number)
     save_path = self._checkpoint.write(prefix)
     timestamp = time.time()
diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py
index 95e688d3c7..8ef5048299 100644
--- a/tensorflow/python/training/checkpoint_management_test.py
+++ b/tensorflow/python/training/checkpoint_management_test.py
@@ -26,6 +26,8 @@ import tempfile
 from google.protobuf import text_format
 
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import file_io
@@ -271,7 +273,7 @@ class SaverUtilsTest(test.TestCase):
   def testCheckpointExists(self):
     for sharded in (False, True):
       for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
-        with self.test_session(graph=ops_lib.Graph()) as sess:
+        with self.session(graph=ops_lib.Graph()) as sess:
           unused_v = variables.Variable(1.0, name="v")
           variables.global_variables_initializer().run()
           saver = saver_module.Saver(sharded=sharded, write_version=version)
@@ -289,7 +291,7 @@ class SaverUtilsTest(test.TestCase):
   def testGetCheckpointMtimes(self):
     prefixes = []
     for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
-      with self.test_session(graph=ops_lib.Graph()) as sess:
+      with self.session(graph=ops_lib.Graph()) as sess:
         unused_v = variables.Variable(1.0, name="v")
         variables.global_variables_initializer().run()
         saver = saver_module.Saver(write_version=version)
@@ -303,7 +305,7 @@ class SaverUtilsTest(test.TestCase):
   def testRemoveCheckpoint(self):
     for sharded in (False, True):
       for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
-        with self.test_session(graph=ops_lib.Graph()) as sess:
+        with self.session(graph=ops_lib.Graph()) as sess:
           unused_v = variables.Variable(1.0, name="v")
           variables.global_variables_initializer().run()
           saver = saver_module.Saver(sharded=sharded, write_version=version)
@@ -332,6 +334,49 @@ class CheckpointManagerTest(test.TestCase):
     self.assertFalse(checkpoint_management.checkpoint_exists(first_path))
 
   @test_util.run_in_graph_and_eager_modes
+  def testKeepAll(self):
+    checkpoint = util.Checkpoint()
+    directory = os.path.join(
+        self.get_temp_dir(),
+        # Avoid sharing directories between eager and graph
+        # TODO(allenl): stop run_in_graph_and_eager_modes reusing directories
+        str(context.executing_eagerly()))
+    manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory, max_to_keep=None)
+    first_path = manager.save()
+    second_path = manager.save()
+    third_path = manager.save()
+    self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(first_path))
+    self.assertEqual(third_path, manager.latest_checkpoint)
+    self.assertEqual([first_path, second_path, third_path],
+                     manager.checkpoints)
+    del manager
+    manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory, max_to_keep=None)
+    fourth_path = manager.save()
+    self.assertEqual([first_path, second_path, third_path, fourth_path],
+                     manager.checkpoints)
+    del manager
+    manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory, max_to_keep=3)
+    self.assertEqual([first_path, second_path, third_path, fourth_path],
+                     manager.checkpoints)
+    self.assertTrue(checkpoint_management.checkpoint_exists(fourth_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(second_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(first_path))
+    fifth_path = manager.save()
+    self.assertEqual([third_path, fourth_path, fifth_path],
+                     manager.checkpoints)
+    self.assertTrue(checkpoint_management.checkpoint_exists(fifth_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(fourth_path))
+    self.assertTrue(checkpoint_management.checkpoint_exists(third_path))
+    self.assertFalse(checkpoint_management.checkpoint_exists(second_path))
+    self.assertFalse(checkpoint_management.checkpoint_exists(first_path))
+
+  @test_util.run_in_graph_and_eager_modes
   @test.mock.patch.object(checkpoint_management, "time")
   def testSaveRestoreState(self, mock_time):
     directory = self.get_temp_dir()
@@ -344,8 +389,6 @@ class CheckpointManagerTest(test.TestCase):
     mock_time.time.return_value = first_time
     first_manager.save()
     state = checkpoint_management.get_checkpoint_state(directory)
-    self.assertEqual([first_time], state.all_model_checkpoint_timestamps)
-    self.assertEqual(3., state.last_preserved_timestamp)
     second_time = first_time + 3610.
     second_name = os.path.join(directory, "ckpt-2")
     mock_time.time.return_value = second_time
@@ -353,7 +396,6 @@ class CheckpointManagerTest(test.TestCase):
     state = checkpoint_management.get_checkpoint_state(directory)
     self.assertEqual([first_time, second_time],
                      state.all_model_checkpoint_timestamps)
-    self.assertEqual(3., state.last_preserved_timestamp)
     self.assertEqual([first_name, second_name], first_manager.checkpoints)
     self.assertEqual(second_name, first_manager.latest_checkpoint)
     del first_manager
@@ -486,6 +528,31 @@ class CheckpointManagerTest(test.TestCase):
     self.assertEqual([5020.],
                      state.all_model_checkpoint_timestamps)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testCustomNumbering(self):
+    directory = self.get_temp_dir()
+    step = variables.Variable(0, dtype=dtypes.int64)
+    checkpoint = util.Checkpoint(step=step)
+    manager = checkpoint_management.CheckpointManager(
+        checkpoint, directory, max_to_keep=2)
+    self.evaluate(step.initializer)
+    for i in range(5):
+      path = manager.save(checkpoint_number=step)
+      expected_suffix = "-%d" % (2 * i,)
+      if not path.endswith(expected_suffix):
+        self.fail("%s should have suffix %s" % (path, expected_suffix))
+      self.evaluate(step.assign_add(2))
+    self.assertEqual(5, self.evaluate(checkpoint.save_counter))
+    # Test regular integers
+    last_path = manager.save(checkpoint_number=32)
+    self.assertIn("-32", last_path)
+    self.assertEqual(last_path, manager.latest_checkpoint)
+    self.assertEqual(
+        last_path, checkpoint_management.latest_checkpoint(directory))
+    state = checkpoint_management.get_checkpoint_state(directory)
+    # Only the most recent two checkpoints are saved
+    self.assertEqual([path, last_path], state.all_model_checkpoint_paths)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 9b72b09f08..e6118177fd 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -29,7 +29,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import saver
 from tensorflow.python.util.tf_export import tf_export
 
@@ -180,10 +180,10 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
     tf.errors.OpError: If missing checkpoints or tensors in checkpoints.
     ValueError: If missing variables in current graph.
   """
-  if distribute_lib.get_cross_tower_context():
+  if distribution_strategy_context.get_cross_tower_context():
     _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
   else:
-    distribute_lib.get_tower_context().merge_call(
+    distribution_strategy_context.get_tower_context().merge_call(
         _init_from_checkpoint, ckpt_dir_or_file, assignment_map)
 
 
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
index 1c1f126ce9..1aab16338a 100644
--- a/tensorflow/python/training/checkpoint_utils_test.py
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -119,7 +119,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           my1 = variable_scope.get_variable("my1", [1, 10])
           with variable_scope.variable_scope("some_other_scope"):
@@ -153,7 +153,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope(
             "some_scope", initializer=init_ops.zeros_initializer()):
           my1 = variable_scope.get_variable("my1", [1, 10])
@@ -190,7 +190,7 @@ class CheckpointsTest(test.TestCase):
 
       checkpoint_utils.init_from_checkpoint(checkpoint_dir,
                                             {"useful_scope/": "useful_scope/"})
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         session.run(variables.global_variables_initializer())
         self.assertAllEqual(my4.eval(session), v4)
         self.assertAllEqual(my5.eval(session), my5_init)
@@ -218,7 +218,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           my1 = variable_scope.get_variable("var1", [1, 10])
           my2 = variable_scope.get_variable("var2", [10, 10])
@@ -242,7 +242,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         my1 = variable_scope.get_variable("var1", [1, 10])
         my2 = variable_scope.get_variable("var2", [10, 10])
         my3 = variable_scope.get_variable("var3", [100, 100])
@@ -265,7 +265,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           my1 = variable_scope.get_variable(
               name="my1",
@@ -303,7 +303,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           my1 = variable_scope.get_variable(
               name="my1",
@@ -327,7 +327,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         with variable_scope.variable_scope("some_scope"):
           _ = variable_scope.get_variable("my1", [10, 10])
           _ = variable_scope.get_variable(
@@ -372,7 +372,7 @@ class CheckpointsTest(test.TestCase):
 
     # New graph and session.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as session:
+      with self.session(graph=g) as session:
         my1 = resource_variable_ops.ResourceVariable([[0.0] * 10], name="my1")
 
         with ops.name_scope("init_from_checkpoint"):
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/checkpointable/BUILD
index 8a289b31b5..d26932c1aa 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/checkpointable/BUILD
@@ -101,15 +101,26 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":base",
+        ":data_structures",
         ":tracking",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
-        "//tensorflow/python:ops",
+        "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:saveable_object",
+        "//tensorflow/python:saver",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -118,10 +129,7 @@ py_test(
     name = "util_test",
     srcs = ["util_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",  # TODO: needs investigation on Windows
-        "notsan",  # b/74395663
-    ],
+    tags = ["notsan"],  # b/74395663
     deps = [
         ":base",
         ":tracking",
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/checkpointable/base.py
index 66837ee52f..9189d8f3e8 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -22,6 +22,7 @@ import functools
 import json
 import weakref
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -79,10 +80,6 @@ class CheckpointInitialValue(ops.Tensor):
       self.wrapped_value.set_shape(shape)
     self._checkpoint_position = checkpoint_position
 
-  @property
-  def __class__(self):
-    return (self.wrapped_value.__class__, CheckpointInitialValue)
-
   def __getattr__(self, attr):
     try:
       return getattr(self.wrapped_value, attr)
@@ -97,14 +94,17 @@ class CheckpointInitialValue(ops.Tensor):
 class PythonStringStateSaveable(saveable_object.SaveableObject):
   """Saves Python state in a checkpoint."""
 
-  def __init__(self, name, state_callback):
+  def __init__(self, name, state_callback, restore_callback=None):
     """Configure saving.
 
     Args:
       name: The checkpoint key to write to.
       state_callback: A function taking no arguments which returns a
         string. This function is run every time a checkpoint is written.
+      restore_callback: A function taking a Python string, used to restore
+        state. Optional; defaults to doing nothing.
     """
+    self._restore_callback = restore_callback
     if context.executing_eagerly():
       self._save_string = (
           lambda: constant_op.constant(state_callback(), dtype=dtypes.string))
@@ -117,9 +117,14 @@ class PythonStringStateSaveable(saveable_object.SaveableObject):
     super(PythonStringStateSaveable, self).__init__(
         self._save_string, [spec], name)
 
+  def python_restore(self, restored_strings):
+    """Called to restore Python state."""
+    if self._restore_callback:
+      restored, = restored_strings
+      self._restore_callback(restored)
+
   def restore(self, restored_tensors, restored_shapes):
-    # TODO(allenl): Add a Python hook for state coming out of a checkpoint
-    # (currently PythonStringStateSaveable is write-only).
+    """Called to restore TensorFlow state (nothing to do)."""
     return control_flow_ops.no_op()
 
 
@@ -231,7 +236,7 @@ class _CheckpointPosition(object):
         with ops.device("/cpu:0"):
           # Run the restore itself on the CPU.
           value, = io_ops.restore_v2(
-              prefix=self._checkpoint.save_path,
+              prefix=self._checkpoint.save_path_tensor,
               tensor_names=[checkpoint_key],
               shape_and_slices=[""],
               dtypes=[base_type],
@@ -240,42 +245,99 @@ class _CheckpointPosition(object):
         value_tensors[serialized_tensor.name] = array_ops.identity(value)
       return value_tensors
 
-  def restore_ops(self):
-    """Create or fetch restore ops for this object's attributes.
-
-    Requires that the `Checkpointable` Python object has been bound to an object
-    ID in the checkpoint.
-
-    Returns:
-      A list of operations when graph building, or an empty list when executing
-      eagerly.
-    """
+  def _gather_ops_or_named_saveables(self):
+    """Looks up or creates SaveableObjects which don't have cached ops."""
     saveables = self.checkpointable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
     # Name saveables based on the name this object had when it was checkpointed.
     named_saveables = {}
-    restore_ops = []
-    building_graph = not context.executing_eagerly()
+    python_saveables = []
+    existing_restore_ops = []
     for serialized_tensor in self.object_proto.attributes:
-      saveable_factory = saveables.get(serialized_tensor.name, None)
-      if saveable_factory is None:
-        # Purposefully does not throw an exception if attributes have been added
-        # or deleted. Stores unused attributes so an exception can be raised if
-        # the user decides to check that everything in the checkpoint was
-        # loaded.
-        self._checkpoint.unused_attributes.setdefault(
-            self.checkpointable, []).append(serialized_tensor.name)
+      if context.executing_eagerly():
+        existing_op = None
+      else:
+        existing_op = self._checkpoint.restore_ops_by_name.get(
+            serialized_tensor.checkpoint_key, None)
+      if existing_op is not None:
+        existing_restore_ops.append(existing_op)
         continue
-      if building_graph:
-        existing_ops = self._checkpoint.restore_ops_by_name.get(
-            serialized_tensor.name, None)
+
+      # Only if we don't have cached ops for this SaveableObject, we'll see if
+      # the SaveableObject itself has been cached. If not, we'll make it, and
+      # either way we'll extract new ops from it (or if it has Python state to
+      # restore, we'll run that).
+      if self._checkpoint.saveable_object_cache is None:
+        # No SaveableObject caching when executing eagerly.
+        saveable = None
       else:
-        existing_ops = None
-      if existing_ops is None:
+        # If we've already created and cached a SaveableObject for this
+        # attribute, we can re-use it to avoid re-creating some ops when graph
+        # building.
+        saveable_list = self._checkpoint.saveable_object_cache.get(
+            self.checkpointable, {}).get(serialized_tensor.name, (None,))
+        if len(saveable_list) == 1:
+          # Almost every attribute will have exactly one SaveableObject.
+          saveable, = saveable_list
+        else:
+          # Don't use cached SaveableObjects for partitioned variables, which is
+          # the only case where we'd have a list of SaveableObjects. Op caching
+          # will catch them.
+          saveable = None
+      if saveable is not None:
+        # The name of this attribute has changed, so we need to re-generate
+        # the SaveableObject.
+        if serialized_tensor.checkpoint_key not in saveable.name:
+          saveable = None
+          del self._checkpoint.saveable_object_cache[self.checkpointable]
+          break
+      if saveable is None:
+        # If there was no cached SaveableObject, we should check if the Python
+        # object has the attribute.
+        saveable_factory = saveables.get(serialized_tensor.name, None)
+        if saveable_factory is None:
+          # Purposefully does not throw an exception if attributes have been
+          # added or deleted. Stores unused attributes so an exception can be
+          # raised if the user decides to check that everything in the
+          # checkpoint was loaded.
+          self._checkpoint.unused_attributes.setdefault(
+              self.checkpointable, []).append(serialized_tensor.name)
+          continue
         if callable(saveable_factory):
           saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
         else:
           saveable = saveable_factory
+        if self._checkpoint.saveable_object_cache is not None:
+          self._checkpoint.saveable_object_cache.setdefault(
+              self.checkpointable, {})[serialized_tensor.name] = [saveable]
+      if isinstance(saveable, PythonStringStateSaveable):
+        python_saveables.append(saveable)
+      else:
         named_saveables[serialized_tensor.checkpoint_key] = saveable
+    return existing_restore_ops, named_saveables, python_saveables
+
+  def restore_ops(self):
+    """Create or fetch restore ops for this object's attributes.
+
+    Requires that the `Checkpointable` Python object has been bound to an object
+    ID in the checkpoint.
+
+    Returns:
+      A list of operations when graph building, or an empty list when executing
+      eagerly.
+    """
+    (restore_ops,
+     named_saveables,
+     python_saveables) = self._gather_ops_or_named_saveables()
+
+    # Eagerly run restorations for Python state.
+    reader = pywrap_tensorflow.NewCheckpointReader(
+        self._checkpoint.save_path_string)
+    for saveable in python_saveables:
+      spec_names = [spec.name for spec in saveable.specs]
+      saveable.python_restore(
+          [reader.get_tensor(name) for name in spec_names])
+
+    # If we have new SaveableObjects, extract and cache restore ops.
     if named_saveables:
       validated_saveables = (
           self._checkpoint.builder._ValidateAndSliceInputs(named_saveables))  # pylint: disable=protected-access
@@ -285,7 +347,7 @@ class _CheckpointPosition(object):
             ("Saveable keys changed when validating. Got back %s, was "
              "expecting %s") % (named_saveables.keys(), validated_names))
       all_tensors = self._checkpoint.builder.bulk_restore(
-          filename_tensor=self._checkpoint.save_path,
+          filename_tensor=self._checkpoint.save_path_tensor,
           saveables=validated_saveables, preferred_shard=-1,
           restore_sequentially=False)
       saveable_index = 0
@@ -295,7 +357,7 @@ class _CheckpointPosition(object):
             saveable_index:saveable_index + num_specs]
         saveable_index += num_specs
         restore_op = saveable.restore(saveable_tensors, restored_shapes=None)
-        if building_graph:
+        if not context.executing_eagerly():
           assert saveable.name not in self._checkpoint.restore_ops_by_name
           self._checkpoint.restore_ops_by_name[saveable.name] = restore_op
           restore_ops.append(restore_op)
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/checkpointable/data_structures.py
index 507cda8734..f06cbbfa15 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/checkpointable/data_structures.py
@@ -128,7 +128,8 @@ class CheckpointableDataStructure(base.CheckpointableBase):
            "stored in a List object. Got %s, which does not inherit from "
            "CheckpointableBase.") % (value,))
     if (isinstance(value, CheckpointableDataStructure)
-        or layer_utils.is_layer(value)):
+        or layer_utils.is_layer(value)
+        or layer_utils.has_weights(value)):
       # Check for object-identity rather than with __eq__ to avoid
       # de-duplicating empty container types. Automatically generated list
       # wrappers keep things like "[] == []" true, which means "[] in [[]]" is
@@ -149,14 +150,14 @@ class CheckpointableDataStructure(base.CheckpointableBase):
   def trainable_weights(self):
     return layer_utils.gather_trainable_weights(
         trainable=self.trainable,
-        sub_layers=self.layers,
+        sub_layers=self._layers,
         extra_variables=self._extra_variables)
 
   @property
   def non_trainable_weights(self):
     return layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
-        sub_layers=self.layers,
+        sub_layers=self._layers,
         extra_variables=self._extra_variables)
 
   @property
@@ -183,7 +184,8 @@ class CheckpointableDataStructure(base.CheckpointableBase):
     # have any inputs.
     aggregated = []
     for layer in self.layers:
-      aggregated += layer.updates
+      if hasattr(layer, "updates"):
+        aggregated += layer.updates
     return aggregated
 
   @property
@@ -191,7 +193,8 @@ class CheckpointableDataStructure(base.CheckpointableBase):
     """Aggregate losses from any `Layer` instances."""
     aggregated = []
     for layer in self.layers:
-      aggregated += layer.losses
+      if hasattr(layer, "losses"):
+        aggregated += layer.losses
     return aggregated
 
   def __hash__(self):
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/checkpointable/data_structures_test.py
index 472b7c32b4..4638917b4c 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/checkpointable/data_structures_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.layers import core as non_keras_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util
@@ -96,6 +97,11 @@ class ListTests(test.TestCase):
     model.load_weights(save_path)
     self.assertAllEqual([[1., 2., 3.], [4., 5., 6.]],
                         self.evaluate(model.variables[0]))
+    v = variables.Variable(1.)
+    model.var_list = [v]
+    self.assertIn(v, model.variables)
+    self.assertIn(v, model.trainable_variables)
+    self.assertNotIn(v, model.non_trainable_variables)
 
   def testUpdatesForwarded(self):
     with context.graph_mode():
diff --git a/tensorflow/python/training/checkpointable/layer_utils.py b/tensorflow/python/training/checkpointable/layer_utils.py
index d65b631fe9..ec764bca89 100644
--- a/tensorflow/python/training/checkpointable/layer_utils.py
+++ b/tensorflow/python/training/checkpointable/layer_utils.py
@@ -30,13 +30,20 @@ def is_layer(obj):
           and hasattr(obj, "variables"))
 
 
+def has_weights(obj):
+  """Implicit check for Layer-like objects."""
+  # TODO(b/110718070): Replace with isinstance(obj, base_layer.Layer).
+  return (hasattr(obj, "trainable_weights")
+          and hasattr(obj, "non_trainable_weights"))
+
+
 def filter_empty_layer_containers(layer_list):
   """Filter out empty Layer-like containers."""
   filtered = []
   for obj in layer_list:
     if is_layer(obj):
       filtered.append(obj)
-    else:
+    elif hasattr(obj, "layers"):
       # Checkpointable data structures will not show up in ".layers" lists, but
       # the layers they contain will.
       filtered.extend(obj.layers)
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index e42f989469..d1b50d1362 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -68,16 +68,25 @@ _OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
 class _CheckpointRestoreCoordinator(object):
   """Holds the status of an object-based checkpoint load."""
 
-  def __init__(self, object_graph_proto, save_path, dtype_map=None):
+  def __init__(self, object_graph_proto, save_path, save_path_tensor,
+               restore_op_cache, saveable_object_cache):
     """Specify the checkpoint being loaded.
 
     Args:
       object_graph_proto: The CheckpointableObjectGraph protocol buffer
         associated with this checkpoint.
-      save_path: A string `Tensor`. The path to the checkpoint, as returned by
+      save_path: A string, the path to the checkpoint, as returned by
         `tf.train.latest_checkpoint`.
-      dtype_map: When executing eagerly, specifies dtypes for creating slot
-        variables. None when graph building.
+      save_path_tensor: A string `Tensor` which contains or will be fed the save
+        path.
+      restore_op_cache: A dictionary shared between
+        `_CheckpointRestoreCoordinator`s for the same Python objects, used to
+        look up restore ops by name to avoid re-creating them across multiple
+        `restore()` calls.
+      saveable_object_cache: A mapping of checkpointable objects -> attribute
+        names -> list(`SaveableObject`s), used when `SaveableObjects` must be
+        referenced every restore (e.g. for Python state); otherwise they would
+        create their own ops every restore.
     """
     self.builder = saver_lib.BulkSaverBuilder()
     self.object_graph_proto = object_graph_proto
@@ -97,12 +106,17 @@ class _CheckpointRestoreCoordinator(object):
     # loading). Used to make status assertions fail when loading checkpoints
     # that don't quite match.
     self.all_python_objects = _ObjectIdentityWeakSet()
-    self.save_path = save_path
-    self.dtype_map = dtype_map
+    self.save_path_tensor = save_path_tensor
+    self.save_path_string = save_path
+    self.dtype_map = pywrap_tensorflow.NewCheckpointReader(
+        save_path).get_variable_to_dtype_map()
+    # A NewCheckpointReader for the most recent checkpoint, for streaming Python
+    # state restoration.
     # When graph building, contains a list of ops to run to restore objects from
     # this checkpoint.
     self.restore_ops = []
-    self.restore_ops_by_name = {}
+    self.restore_ops_by_name = restore_op_cache
+    self.saveable_object_cache = saveable_object_cache
     self.new_restore_ops_callback = None
     # A mapping from optimizer proto ids to lists of slot variables to be
     # restored when the optimizer is tracked. Only includes slot variables whose
@@ -820,6 +834,11 @@ class _LoadStatus(object):
     pass
 
   @abc.abstractmethod
+  def assert_existing_objects_matched(self):
+    """Raises an exception unless existing Python objects have been matched."""
+    pass
+
+  @abc.abstractmethod
   def run_restore_ops(self, session=None):
     """Runs restore ops from the checkpoint. Requires a valid checkpoint."""
     pass
@@ -889,13 +908,11 @@ class CheckpointLoadStatus(_LoadStatus):
         or if there are any checkpointed values which have not been matched to
         Python objects.
     """
+    self.assert_existing_objects_matched()
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
       checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
       if checkpointable is None:
         raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
-      if checkpointable._update_uid < self._checkpoint.restore_uid:  # pylint: disable=protected-access
-        raise AssertionError(
-            "Object not assigned a value from checkpoint: %s" % (node,))
     if self._checkpoint.slot_restorations:
       # Sanity check; this collection should be clear if everything has been
       # restored.
@@ -906,6 +923,31 @@ class CheckpointLoadStatus(_LoadStatus):
           ("Unused attributes in these objects (the attributes exist in the "
            "checkpoint but not in the objects): %s") % (
                self._checkpoint.unused_attributes.items(),))
+    return self
+
+  def assert_existing_objects_matched(self):
+    """Asserts that checkpointable Python objects have been matched.
+
+    Note that this is a weaker assertion than `assert_consumed`. It will only
+    fail for existing Python objects which are (transitive) dependencies of the
+    root object and which do not have an entry in the checkpoint.
+
+    It will not fail, for example, if a `tf.keras.Layer` object has not yet been
+    built and so has not created any `tf.Variable` objects.
+
+    Returns:
+      `self` for chaining.
+
+    Raises:
+      AssertionError: If a Python object exists in the transitive dependencies
+        of the root object but does not have a value in the checkpoint.
+    """
+    for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
+      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if (checkpointable is not None
+          and checkpointable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
+        raise AssertionError(
+            "Object not assigned a value from checkpoint: %s" % (node,))
     for checkpointable_object in list_objects(self._root_checkpointable):
       self._checkpoint.all_python_objects.add(checkpointable_object)
     unused_python_objects = (
@@ -915,7 +957,7 @@ class CheckpointLoadStatus(_LoadStatus):
       raise AssertionError(
           ("Some Python objects were not bound to checkpointed values, likely "
            "due to changes in the Python program: %s")
-          % (unused_python_objects,))
+          % (list(unused_python_objects),))
     return self
 
   def run_restore_ops(self, session=None):
@@ -977,6 +1019,11 @@ class InitializationOnlyStatus(_LoadStatus):
     raise AssertionError(
         "No checkpoint specified (save_path=None); nothing is being restored.")
 
+  def assert_existing_objects_matched(self):
+    """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
+    raise AssertionError(
+        "No checkpoint specified (save_path=None); nothing is being restored.")
+
   def run_restore_ops(self, session=None):
     """For consistency with `CheckpointLoadStatus`.
 
@@ -1050,6 +1097,15 @@ class NameBasedSaverStatus(_LoadStatus):
       if checkpointable._update_uid < self._checkpoint.restore_uid:
         raise AssertionError("Object not restored: %s" % (checkpointable,))
       # pylint: enable=protected-access
+    return self
+
+  def assert_existing_objects_matched(self):
+    """Raises an exception if currently created objects are unmatched."""
+    # For name-based checkpoints there's no object information in the
+    # checkpoint, so there's no distinction between
+    # assert_existing_objects_matched and assert_consumed (and both are less
+    # useful since we don't touch Python objects or Python state).
+    return self.assert_consumed()
 
   def _gather_saveable_objects(self):
     """Walk the object graph, using global names for SaveableObjects."""
@@ -1153,16 +1209,15 @@ class CheckpointableSaver(object):
     self._last_save_object_graph = None
     self._last_save_saver = None
 
-    # Op caching for restore
-    self._last_restore_object_graph = None
-    self._last_restore_checkpoint = None
+    # Op caching for restore, shared between _CheckpointRestoreCoordinators
+    self._restore_op_cache = {}
 
     if context.executing_eagerly():
       # SaveableObjects are always recreated when executing eagerly.
       self._saveable_object_cache = None
     else:
-      # Maps Checkpointable objects -> attribute names -> SaveableObjects, to
-      # avoid re-creating SaveableObjects when graph building.
+      # Maps Checkpointable objects -> attribute names -> list(SaveableObjects),
+      # to avoid re-creating SaveableObjects when graph building.
       self._saveable_object_cache = _ObjectIdentityWeakKeyDictionary()
 
   @property
@@ -1340,22 +1395,12 @@ class CheckpointableSaver(object):
     object_graph_proto = (
         checkpointable_object_graph_pb2.CheckpointableObjectGraph())
     object_graph_proto.ParseFromString(object_graph_string)
-    if graph_building and object_graph_proto == self._last_restore_object_graph:
-      checkpoint = self._last_restore_checkpoint
-    else:
-      checkpoint = _CheckpointRestoreCoordinator(
-          object_graph_proto=object_graph_proto,
-          save_path=file_prefix_tensor,
-          dtype_map=dtype_map)
-      if graph_building:
-        if self._last_restore_object_graph is not None:
-          raise NotImplementedError(
-              "Using a single Saver to restore different object graphs is not "
-              "currently supported when graph building. Use a different Saver "
-              "for each object graph (restore ops will be duplicated), or "
-              "file a feature request if this limitation bothers you.")
-        self._last_restore_checkpoint = checkpoint
-        self._last_restore_object_graph = object_graph_proto
+    checkpoint = _CheckpointRestoreCoordinator(
+        object_graph_proto=object_graph_proto,
+        save_path=save_path,
+        save_path_tensor=file_prefix_tensor,
+        restore_op_cache=self._restore_op_cache,
+        saveable_object_cache=self._saveable_object_cache)
     base._CheckpointPosition(  # pylint: disable=protected-access
         checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
     load_status = CheckpointLoadStatus(
@@ -1644,6 +1689,17 @@ class Checkpoint(tracking.Checkpointable):
           Python objects in the dependency graph with no values in the
           checkpoint. This method returns the status object, and so may be
           chained with `initialize_or_restore` or `run_restore_ops`.
+      -  `assert_existing_objects_matched()`:
+          Raises an exception if any existing Python objects in the dependency
+          graph are unmatched. Unlike `assert_consumed`, this assertion will
+          pass if values in the checkpoint have no corresponding Python
+          objects. For example a `tf.keras.Layer` object which has not yet been
+          built, and so has not created any variables, will pass this assertion
+          but fail `assert_consumed`. Useful when loading part of a larger
+          checkpoint into a new Python program, e.g. a training checkpoint with
+          a `tf.train.Optimizer` was saved but only the state required for
+          inference is being loaded. This method returns the status object, and
+          so may be chained with `initialize_or_restore` or `run_restore_ops`.
       - `initialize_or_restore(session=None)`:
           When graph building, runs variable initializers if `save_path` is
           `None`, but otherwise runs restore operations. If no `session` is
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index a0a87b6b79..697b44c3ff 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -384,8 +384,8 @@ class CheckpointingTests(test.TestCase):
     saver = saver_lib.Saver(var_list=[v])
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
-    self.evaluate(v.non_dep_variable.assign(42.))
     with self.test_session() as sess:
+      self.evaluate(v.non_dep_variable.assign(42.))
       save_path = saver.save(sess, prefix)
       self.evaluate(v.non_dep_variable.assign(43.))
       self.evaluate(v.mirrored.assign(44.))
@@ -437,6 +437,9 @@ class CheckpointingTests(test.TestCase):
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
     status = on_create_root.restore(save_path=save_path)
+    status.assert_existing_objects_matched()
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
     on_create_model(constant_op.constant([[3.]]))  # create variables
     self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
     self.assertAllEqual([42.],
@@ -444,6 +447,9 @@ class CheckpointingTests(test.TestCase):
                             on_create_model._named_dense.variables[1]))
     on_create_m_bias_slot = on_create_optimizer.get_slot(
         on_create_model._named_dense.variables[1], "m")
+    status.assert_existing_objects_matched()
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
     # Optimizer slot variables are created when the original variable is
     # restored.
     self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
@@ -451,6 +457,7 @@ class CheckpointingTests(test.TestCase):
                         self.evaluate(on_create_optimizer.variables()))
     dummy_var = resource_variable_ops.ResourceVariable([1.])
     on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_existing_objects_matched()
     status.assert_consumed()
     beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
     self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
@@ -499,15 +506,18 @@ class CheckpointingTests(test.TestCase):
               global_step=root.global_step)
           checkpoint_path = checkpoint_management.latest_checkpoint(
               checkpoint_directory)
-          with self.test_session(graph=ops.get_default_graph()) as session:
+          with self.session(graph=ops.get_default_graph()) as session:
             status = root.restore(save_path=checkpoint_path)
             status.initialize_or_restore(session=session)
             if checkpoint_path is None:
               self.assertEqual(0, training_continuation)
               with self.assertRaises(AssertionError):
                 status.assert_consumed()
+              with self.assertRaises(AssertionError):
+                status.assert_existing_objects_matched()
             else:
               status.assert_consumed()
+              status.assert_existing_objects_matched()
             for _ in range(num_training_steps):
               session.run(train_op)
             root.save(file_prefix=checkpoint_prefix, session=session)
@@ -704,11 +714,12 @@ class CheckpointingTests(test.TestCase):
     load_into = LateDependencies()
     status = checkpointable_utils.CheckpointableSaver(
         load_into).restore(save_path)
+    status.assert_existing_objects_matched()
     with self.assertRaises(AssertionError):
       status.assert_consumed()
     load_into.add_dep()
     status.assert_consumed()
-    status.run_restore_ops()
+    status.assert_existing_objects_matched().run_restore_ops()
     self.assertEqual(123., self.evaluate(load_into.dep.var))
 
   @test_util.run_in_graph_and_eager_modes
@@ -785,6 +796,7 @@ class CheckpointingTests(test.TestCase):
     no_slot_status.run_restore_ops()
     self.assertEqual(12., self.evaluate(new_root.var))
     new_root.optimizer = adam.AdamOptimizer(0.1)
+    slot_status.assert_existing_objects_matched()
     with self.assertRaisesRegexp(AssertionError, "beta1_power"):
       slot_status.assert_consumed()
     self.assertEqual(12., self.evaluate(new_root.var))
@@ -884,6 +896,8 @@ class CheckpointingTests(test.TestCase):
         load_root.dep_one.dep_three, name="var", initializer=0.)
     with self.assertRaises(AssertionError):
       status.assert_consumed()
+    with self.assertRaises(AssertionError):
+      status.assert_existing_objects_matched()
 
   @test_util.run_in_graph_and_eager_modes
   def testObjectsCombined(self):
@@ -907,7 +921,7 @@ class CheckpointingTests(test.TestCase):
     v2 = checkpointable_utils.add_variable(
         load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
     status = checkpointable_utils.CheckpointableSaver(load_root).restore(
-        save_path).assert_consumed()
+        save_path).assert_consumed().assert_existing_objects_matched()
     status.run_restore_ops()
     self.assertEqual(32., self.evaluate(v1))
     self.assertEqual(64., self.evaluate(v2))
@@ -994,7 +1008,7 @@ class CheckpointingTests(test.TestCase):
     """Saves after the first should not modify the graph."""
     with context.graph_mode():
       graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
+      with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
         obj = tracking.Checkpointable()
@@ -1073,22 +1087,17 @@ class CheckpointingTests(test.TestCase):
     self.assertEqual(5, self.evaluate(checkpoint.var_5))
     self.assertEqual(1, self.evaluate(checkpoint.var_1))
     self.assertEqual(0, self.evaluate(checkpoint.var_0))
-    if context.executing_eagerly():
-      checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
-      self.assertEqual(9, self.evaluate(checkpoint.var_9))
-      self.assertEqual(8, self.evaluate(checkpoint.var_8))
-      self.assertEqual(1, self.evaluate(checkpoint.var_1))
-      self.assertEqual(0, self.evaluate(checkpoint.var_0))
-    else:
-      # Restoring into modified graphs is an error while graph building.
-      with self.assertRaises(NotImplementedError):
-        checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+    checkpoint.restore(checkpoint_prefix + "-10").run_restore_ops()
+    self.assertEqual(9, self.evaluate(checkpoint.var_9))
+    self.assertEqual(8, self.evaluate(checkpoint.var_8))
+    self.assertEqual(1, self.evaluate(checkpoint.var_1))
+    self.assertEqual(0, self.evaluate(checkpoint.var_0))
 
   def testManyRestoresGraph(self):
     """Restores after the first should not modify the graph."""
     with context.graph_mode():
       graph = ops.Graph()
-      with graph.as_default(), self.test_session(graph):
+      with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
         obj = tracking.Checkpointable()
@@ -1244,6 +1253,8 @@ class CheckpointingTests(test.TestCase):
       status.initialize_or_restore()
       train_fn()
       with self.assertRaises(AssertionError):
+        status.assert_existing_objects_matched()
+      with self.assertRaises(AssertionError):
         status.assert_consumed()
 
     # Make sure initialization doesn't clobber later restores
@@ -1456,11 +1467,15 @@ class CheckpointCompatibilityTests(test.TestCase):
       if context.executing_eagerly():
         with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
           status.assert_consumed()
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_existing_objects_matched()
       else:
         # When graph building, we haven't read any keys, so we don't know
         # whether the restore will be complete.
         with self.assertRaisesRegexp(AssertionError, "not restored"):
           status.assert_consumed()
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_existing_objects_matched()
       status.run_restore_ops()
       self._check_sentinels(root)
       self._set_sentinels(root)
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 581db45e80..1ac7c39872 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import threading
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
+from tensorflow.python.eager import context as eager_context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -31,71 +31,11 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import device_util
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import nest
 
 
 # ------------------------------------------------------------------------------
-# Internal API for setting the current thread mode as being either in a
-# tower or cross-tower context for a particular distribution strategy.
-
-
-class _ThreadMode(object):
-
-  def __init__(self, dist, cross, tower):
-    self.distribution_strategy = dist
-    self.cross_tower_context = cross
-    self.tower_context = tower
-
-
-class _CrossTowerThreadMode(_ThreadMode):
-
-  def __init__(self, distribution_strategy):
-    _ThreadMode.__init__(
-        self, distribution_strategy, distribution_strategy, None)
-
-
-class _InTowerThreadMode(_ThreadMode):
-
-  def __init__(self, tower_ctx):
-    _ThreadMode.__init__(
-        self, tower_ctx.distribution_strategy, None, tower_ctx)
-
-
-_per_thread_mode = threading.local()
-
-
-def _push_per_thread_mode(context):
-  if not hasattr(_per_thread_mode, "stack"):
-    _per_thread_mode.stack = []
-  _per_thread_mode.stack.append(context)
-
-
-def _pop_per_thread_mode():
-  _per_thread_mode.stack.pop(-1)
-
-
-class _DefaultTowerThreadMode(_ThreadMode):
-  """Type of default value returned by `_get_per_thread_mode()`.
-
-  Used when the thread-local stack is empty.
-  """
-
-  def __init__(self):
-    # _default_distribution_strategy and _default_tower_context are
-    # defined at the bottom of this file.
-    _ThreadMode.__init__(
-        self, _default_distribution_strategy, None, _default_tower_context)
-
-
-def _get_per_thread_mode():
-  try:
-    return _per_thread_mode.stack[-1]
-  except (AttributeError, IndexError):
-    # _default_tower_mode is defined at the bottom of this file.
-    return _default_tower_mode
-
-
-# ------------------------------------------------------------------------------
 # Context tracking whether in a distribution.update() or .update_non_slot()
 # call.
 
@@ -128,96 +68,6 @@ class UpdateContext(object):
 
 
 # ------------------------------------------------------------------------------
-# Public API for accessing the current thread mode
-
-
-def get_tower_context():
-  """Returns the current TowerContext or None if in a cross-tower context.
-
-  Note that execution:
-  1. starts in the default (single-tower) tower context (this function
-     will return the default TowerContext object);
-  2. switches to cross-tower context (in which case this will return
-     None) when entering a `with DistributionStrategy.scope():` block;
-  3. switches to a (non-default) tower context inside
-     `call_for_each_tower(fn, ...)`;
-  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-tower context (and again
-     this function will return None).
-
-  Note that you can also go directly from step 1 to 4 to switch to a
-  cross-tower context for the default `DistributionStrategy`. You may
-  also switch from the cross-tower context of 4 to a tower context by
-  calling `call_for_each_tower()`, jumping back to step 3.
-
-  Most `DistributionStrategy` methods may only be executed in
-  a cross-tower context, in a tower context you should use the
-  `TowerContext` API instead.
-
-  Returns:
-    The current `TowerContext` object when in a tower context scope, else None.
-
-    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
-    will return None in a particular block.
-  """
-  return _get_per_thread_mode().tower_context
-
-
-def get_cross_tower_context():
-  """Returns the current DistributionStrategy if in a cross-tower context.
-
-  Note that execution:
-  1. starts in the default (single-tower) tower context;
-  2. switches to cross-tower context when entering a
-     `with DistributionStrategy.scope():` block;
-  3. switches to a (non-default) tower context inside
-     `call_for_each_tower(fn, ...)`;
-  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-tower context.
-
-  Note that you can also go directly from step 1 to 4 to switch to a
-  cross-tower context for the default `DistributionStrategy`. You may
-  also switch from the cross-tower context of 4 to a tower context by
-  calling `call_for_each_tower()`, jumping back to step 3.
-
-  Most `DistributionStrategy` methods may only be executed in
-  a cross-tower context.
-
-  Returns:
-    Returns the current `DistributionStrategy` object in a cross-tower
-    context, or None.
-
-    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
-    will return None in a particular block.
-  """
-  return _get_per_thread_mode().cross_tower_context
-
-
-def get_distribution_strategy():
-  """Returns the current `DistributionStrategy` object.
-
-  Prefer to use `get_tower_context()` or `get_cross_tower_context()`
-  instead when possible.
-
-  Returns:
-    A `DistributionStrategy` object. Inside a
-    `with distribution_strategy.scope()` block, it returns
-    `distribution_strategy`, otherwise it returns the default
-    (single-tower) `DistributionStrategy` object.
-  """
-  return _get_per_thread_mode().distribution_strategy
-
-
-def has_distribution_strategy():
-  """Return if there is a current non-default `DistributionStrategy`.
-
-  Returns:
-    True if inside a `with distribution_strategy.scope():`.
-  """
-  return get_distribution_strategy() is not _default_distribution_strategy
-
-
-# ------------------------------------------------------------------------------
 # Public utility functions.
 
 
@@ -239,7 +89,8 @@ def _require_cross_tower_context(distribution_strategy):
   if context.cross_tower_context is distribution_strategy: return
   # We have an error to report, figure out the right message.
   if context.distribution_strategy is not distribution_strategy:
-    if context.distribution_strategy is _default_distribution_strategy:
+    if (context.distribution_strategy is
+        distribution_strategy_context._get_default_distribution_strategy()):  # pylint: disable=protected-access
       raise RuntimeError(
           'Need to be inside "with distribution_strategy.scope()" for %s' %
           (distribution_strategy,))
@@ -272,7 +123,8 @@ def _require_distribution_strategy_scope(distribution_strategy):
   context = _get_per_thread_mode()
   if context.distribution_strategy is distribution_strategy: return
   # We have an error to report, figure out the right message.
-  if context.distribution_strategy is _default_distribution_strategy:
+  if (context.distribution_strategy is
+      distribution_strategy_context._get_default_distribution_strategy()):  # pylint: disable=protected-access
     raise RuntimeError(
         'Need to be inside "with distribution_strategy.scope()" for %s' %
         (distribution_strategy,))
@@ -295,7 +147,8 @@ class _CurrentDistributionContext(object):
                var_creator_scope,
                var_scope=None,
                default_device=None):
-    self._context = _CrossTowerThreadMode(distribution_strategy)
+    self._context = distribution_strategy_context._CrossTowerThreadMode(  # pylint: disable=protected-access
+        distribution_strategy)
     self._var_creator_scope = var_creator_scope
     self._var_scope = var_scope
     if default_device:
@@ -395,6 +248,7 @@ class DistributionStrategy(object):
     devices.
 
   We have then a few approaches we want to support:
+
   * Code written (as if) with no knowledge of class `DistributionStrategy`.
     This code should work as before, even if some of the layers, etc.
     used by that code are written to be distribution-aware. This is done
@@ -588,7 +442,7 @@ class DistributionStrategy(object):
     Returns:
       A context manager.
     """
-    if has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       _require_cross_tower_context(self)
       return _SameScopeAgainContext(self)
 
@@ -740,7 +594,7 @@ class DistributionStrategy(object):
       In eager mode, returns `None`.
       In graph mode, a list of ops to execute. Empty list if nothing to be done.
     """
-    if context.executing_eagerly():
+    if eager_context.executing_eagerly():
       return
     else:
       return []
@@ -757,7 +611,7 @@ class DistributionStrategy(object):
       In eager mode, returns `None`.
       In graph mode, a list of ops to execute. Empty list if nothing to be done.
     """
-    if context.executing_eagerly():
+    if eager_context.executing_eagerly():
       return
     else:
       return []
@@ -771,13 +625,18 @@ class DistributionStrategy(object):
 
     Args:
       fn: function to run using this distribution strategy. The function must
-        have the following signature: def fn(context, inputs).
+        have the following signature: def fn(context, *inputs).
         `context` is an instance of `MultiStepContext` that will be passed when
         `fn` is run. `context` can be used to specify the outputs to be returned
         from `fn` by calling `context.set_last_step_output`. It can also be used
         to capture non tensor outputs by `context.set_non_tensor_output`.
         See `MultiStepContext` documentation for more information.
-        `inputs` will have same type/structure as `iterator.get_next()`.
+        `inputs` will have same type/structure as `iterator.get_next()`. If the
+        `iterator.get_next()` returns a tuple say `return x, y` then whose will
+        be unpacked and passed to the `step_fn`; and step_fn signature would
+        look like `def step_fn(context, x, y)`. If the iterator returns a single
+        value say `return x` then the value is passed as is; the step_fn
+        signature would look like `def step_fn(context, x)`.
         Typically, `fn` will use `call_for_each_tower` method of the strategy
         to distribute the computation over multiple towers.
       iterator: Iterator of a dataset that represents the input for `fn`. The
@@ -1077,9 +936,37 @@ class DistributionStrategy(object):
   def _worker_device_index(self):
     raise NotImplementedError("must be implemented in descendants")
 
-  def configure(self, session_config=None):
-    """Find the best configuration given a tensorflow session config."""
-    del session_config
+  @property
+  def between_graph(self):
+    """Whether the strategy uses between-graph replication or not.
+
+      This is expected to return a constant value that will not be changed
+      throughout its life cycle.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    """Configures the strategy class."""
+    del session_config, cluster_spec, task_type, task_id
+
+  @property
+  def should_init(self):
+    """Whether initialization is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def should_checkpoint(self):
+    """Whether checkpointing is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def should_save_summary(self):
+    """Whether saving summaries is needed."""
+    raise NotImplementedError("must be implemented in descendants")
 
 
 # A note about the difference between the context managers
@@ -1106,7 +993,8 @@ class TowerContext(object):
 
   def __init__(self, distribution_strategy, tower_id):
     self._distribution_strategy = distribution_strategy
-    self._thread_context = _InTowerThreadMode(self)
+    self._thread_context = distribution_strategy_context._InTowerThreadMode(  # pylint: disable=protected-access
+        self)
     self._tower_id = tower_id
 
   def __enter__(self):
@@ -1149,7 +1037,8 @@ class TowerContext(object):
   def _merge_call(self, merge_fn, *args, **kwargs):
     """Default implementation for single tower."""
     _push_per_thread_mode(  # thread-local, so not needed with multiple threads
-        _CrossTowerThreadMode(self._distribution_strategy))
+        distribution_strategy_context._CrossTowerThreadMode(  # pylint: disable=protected-access
+            self._distribution_strategy))
     try:
       return merge_fn(self._distribution_strategy, *args, **kwargs)
     finally:
@@ -1196,7 +1085,7 @@ class _DefaultDistributionStrategy(DistributionStrategy):
 
   def scope(self):
     """Context manager setting a variable creator and `self` as current."""
-    if has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       raise RuntimeError("Must not nest DistributionStrategy scopes.")
 
     def creator(next_creator, *args, **kwargs):
@@ -1277,6 +1166,7 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     raise RuntimeError("worker_device_index() method unsupported by "
                        "_DefaultDistributionStrategy.")
 
+
 # ------------------------------------------------------------------------------
 # Common operations
 
@@ -1292,20 +1182,11 @@ def increment_var(v, amount=1):
   def merge_fn(dist, vm):
     return dist.group(dist.update(vm, update))
 
-  tower_context = get_tower_context()
+  tower_context = distribution_strategy_context.get_tower_context()
   return tower_context.merge_call(merge_fn, v)
 
 
 # ------------------------------------------------------------------------------
-# Singletons
-
-_default_distribution_strategy = _DefaultDistributionStrategy()
-_default_tower_context = TowerContext(
-    _default_distribution_strategy, tower_id=0)
-_default_tower_mode = _DefaultTowerThreadMode()
-
-
-# ------------------------------------------------------------------------------
 # We haven't yet implemented deserialization for DistributedVariables.
 # So here we catch any attempts to deserialize variables
 # when using distribution strategies.
@@ -1314,7 +1195,7 @@ _original_from_proto = resource_variable_ops._from_proto_fn
 
 
 def _from_proto_fn(v, import_scope=None):
-  if has_distribution_strategy():
+  if distribution_strategy_context.has_distribution_strategy():
     raise NotImplementedError(
         "Deserialization of variables is not yet supported when using"
         "distributed strategies.")
@@ -1323,3 +1204,10 @@ def _from_proto_fn(v, import_scope=None):
 
 resource_variable_ops._from_proto_fn = _from_proto_fn
 # pylint: enable=protected-access
+
+
+#-------------------------------------------------------------------------------
+# Shorthand for some methods from distribution_strategy_context.
+_push_per_thread_mode = distribution_strategy_context._push_per_thread_mode  # pylint: disable=protected-access
+_get_per_thread_mode = distribution_strategy_context._get_per_thread_mode  # pylint: disable=protected-access
+_pop_per_thread_mode = distribution_strategy_context._pop_per_thread_mode  # pylint: disable=protected-access
diff --git a/tensorflow/python/training/distribute_test.py b/tensorflow/python/training/distribute_test.py
index 694145ede7..f03bd39100 100644
--- a/tensorflow/python/training/distribute_test.py
+++ b/tensorflow/python/training/distribute_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.training import distribute
+from tensorflow.python.training import distribution_strategy_context
 
 
 class _TestTowerContext(distribute.TowerContext):
@@ -49,12 +50,12 @@ class _TestStrategy(distribute.DistributionStrategy):
 
 
 def _assert_in_default_state(t):
-  t.assertIs(distribute._default_tower_context,
-             distribute.get_tower_context())
-  t.assertIs(None, distribute.get_cross_tower_context())
-  t.assertIs(distribute._default_distribution_strategy,
-             distribute.get_distribution_strategy())
-  t.assertFalse(distribute.has_distribution_strategy())
+  t.assertIs(distribution_strategy_context._get_default_tower_context(),
+             distribution_strategy_context.get_tower_context())
+  t.assertIs(None, distribution_strategy_context.get_cross_tower_context())
+  t.assertIs(distribution_strategy_context._get_default_distribution_strategy(),
+             distribution_strategy_context.get_distribution_strategy())
+  t.assertFalse(distribution_strategy_context.has_distribution_strategy())
 
 
 class TestStrategyTest(test.TestCase):
@@ -64,11 +65,13 @@ class TestStrategyTest(test.TestCase):
     dist = _TestStrategy()
 
     def run_fn():
-      tower_context = distribute.get_tower_context()
+      tower_context = distribution_strategy_context.get_tower_context()
       self.assertTrue(tower_context is not None)
-      self.assertIs(None, distribute.get_cross_tower_context())
-      self.assertTrue(distribute.has_distribution_strategy())
-      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertIs(None,
+                    distribution_strategy_context.get_cross_tower_context())
+      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
       self.assertEqual("foo", tower_context.merge_call(None, test_arg="foo"))
       expected_value = _get_test_variable(
           "bar", variable_scope.VariableSynchronization.AUTO,
@@ -86,10 +89,12 @@ class TestStrategyTest(test.TestCase):
     _assert_in_default_state(self)
     dist = _TestStrategy()
     with dist.scope():
-      self.assertIs(None, distribute.get_tower_context())
-      self.assertIs(dist, distribute.get_cross_tower_context())
-      self.assertTrue(distribute.has_distribution_strategy())
-      self.assertIs(dist, distribute.get_distribution_strategy())
+      self.assertIs(None, distribution_strategy_context.get_tower_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_cross_tower_context())
+      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
       expected_value = _get_test_variable(
           "baz", variable_scope.VariableSynchronization.AUTO,
           variable_scope.VariableAggregation.NONE)
@@ -120,15 +125,21 @@ class DefaultDistributionStrategyTest(test.TestCase):
     _assert_in_default_state(self)
 
     def merge_fn(dist, s):
-      self.assertIs(distribute._default_distribution_strategy, dist)
-      self.assertIs(None, distribute.get_tower_context())
-      self.assertIs(dist, distribute.get_cross_tower_context())
-      self.assertIs(dist, distribute.get_distribution_strategy())
-      self.assertFalse(distribute.has_distribution_strategy())
+      self.assertIs(
+          distribution_strategy_context._get_default_distribution_strategy(),
+          dist)
+      self.assertIs(None, distribution_strategy_context.get_tower_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_cross_tower_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
+      self.assertFalse(
+          distribution_strategy_context.has_distribution_strategy())
       return "foo_" + s
 
-    tower_ctx = distribute.get_tower_context()
-    self.assertIs(distribute._default_tower_context, tower_ctx)
+    tower_ctx = distribution_strategy_context.get_tower_context()
+    self.assertIs(distribution_strategy_context._get_default_tower_context(),
+                  tower_ctx)
     self.assertEqual("foo_bar", tower_ctx.merge_call(merge_fn, "bar"))
     _assert_in_default_state(self)
 
diff --git a/tensorflow/python/training/distribution_strategy_context.py b/tensorflow/python/training/distribution_strategy_context.py
new file mode 100644
index 0000000000..998b5c35ce
--- /dev/null
+++ b/tensorflow/python/training/distribution_strategy_context.py
@@ -0,0 +1,203 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to get distribution strategy related contexts."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+
+# There is a circular dependency between this and `distribute` module. So we
+# load it lazily to workaround this.
+distribute_lib = LazyLoader(
+    "distribute_lib", globals(),
+    "tensorflow.python.training.distribute")
+
+# ------------------------------------------------------------------------------
+# Internal API for setting the current thread mode as being either in a
+# tower or cross-tower context for a particular distribution strategy.
+
+
+class _ThreadMode(object):
+
+  def __init__(self, dist, cross, tower):
+    self.distribution_strategy = dist
+    self.cross_tower_context = cross
+    self.tower_context = tower
+
+
+class _CrossTowerThreadMode(_ThreadMode):
+
+  def __init__(self, distribution_strategy):
+    _ThreadMode.__init__(
+        self, distribution_strategy, distribution_strategy, None)
+
+
+class _InTowerThreadMode(_ThreadMode):
+
+  def __init__(self, tower_ctx):
+    _ThreadMode.__init__(
+        self, tower_ctx.distribution_strategy, None, tower_ctx)
+
+
+def _push_per_thread_mode(context):
+  ops.get_default_graph()._distribution_strategy_stack.append(context)  # pylint: disable=protected-access
+
+
+def _pop_per_thread_mode():
+  ops.get_default_graph()._distribution_strategy_stack.pop(-1)  # pylint: disable=protected-access
+
+
+class _DefaultTowerThreadMode(_ThreadMode):
+  """Type of default value returned by `_get_per_thread_mode()`.
+
+  Used when the thread-local stack is empty.
+  """
+
+  def __init__(self):
+    _ThreadMode.__init__(self, _get_default_distribution_strategy(), None,
+                         _get_default_tower_context())
+
+
+def _get_per_thread_mode():
+  try:
+    return ops.get_default_graph()._distribution_strategy_stack[-1]  # pylint: disable=protected-access
+  except (AttributeError, IndexError):
+    return _get_default_tower_mode()
+
+
+# ------------------------------------------------------------------------------
+# Public API for accessing the current thread mode
+
+
+def get_tower_context():
+  """Returns the current TowerContext or None if in a cross-tower context.
+
+  Note that execution:
+  1. starts in the default (single-tower) tower context (this function
+     will return the default TowerContext object);
+  2. switches to cross-tower context (in which case this will return
+     None) when entering a `with DistributionStrategy.scope():` block;
+  3. switches to a (non-default) tower context inside
+     `call_for_each_tower(fn, ...)`;
+  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-tower context (and again
+     this function will return None).
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-tower context for the default `DistributionStrategy`. You may
+  also switch from the cross-tower context of 4 to a tower context by
+  calling `call_for_each_tower()`, jumping back to step 3.
+
+  Most `DistributionStrategy` methods may only be executed in
+  a cross-tower context, in a tower context you should use the
+  `TowerContext` API instead.
+
+  Returns:
+    The current `TowerContext` object when in a tower context scope, else None.
+
+    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
+    will return None in a particular block.
+  """
+  return _get_per_thread_mode().tower_context
+
+
+def get_cross_tower_context():
+  """Returns the current DistributionStrategy if in a cross-tower context.
+
+  Note that execution:
+  1. starts in the default (single-tower) tower context;
+  2. switches to cross-tower context when entering a
+     `with DistributionStrategy.scope():` block;
+  3. switches to a (non-default) tower context inside
+     `call_for_each_tower(fn, ...)`;
+  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-tower context.
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-tower context for the default `DistributionStrategy`. You may
+  also switch from the cross-tower context of 4 to a tower context by
+  calling `call_for_each_tower()`, jumping back to step 3.
+
+  Most `DistributionStrategy` methods may only be executed in
+  a cross-tower context.
+
+  Returns:
+    Returns the current `DistributionStrategy` object in a cross-tower
+    context, or None.
+
+    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
+    will return None in a particular block.
+  """
+  return _get_per_thread_mode().cross_tower_context
+
+
+def get_distribution_strategy():
+  """Returns the current `DistributionStrategy` object.
+
+  Prefer to use `get_tower_context()` or `get_cross_tower_context()`
+  instead when possible.
+
+  Returns:
+    A `DistributionStrategy` object. Inside a
+    `with distribution_strategy.scope()` block, it returns
+    `distribution_strategy`, otherwise it returns the default
+    (single-tower) `DistributionStrategy` object.
+  """
+  return _get_per_thread_mode().distribution_strategy
+
+
+def has_distribution_strategy():
+  """Return if there is a current non-default `DistributionStrategy`.
+
+  Returns:
+    True if inside a `with distribution_strategy.scope():`.
+  """
+  return get_distribution_strategy() is not _get_default_distribution_strategy()
+
+
+# ------------------------------------------------------------------------------
+# Defaults that are used when no distribution strategy is explicitly created.
+# We create them lazily in a function so that we can workaround the circular
+# dependency on distribute_lib. See lazy loader at the top of this file.
+
+_defaults = {
+    "distribution_strategy": None,
+    "tower_context": None,
+    "tower_mode": None
+}
+
+
+def _get_default_distribution_strategy():
+  if _defaults["distribution_strategy"] is None:
+    _defaults["distribution_strategy"] = (
+        distribute_lib._DefaultDistributionStrategy())  # pylint: disable=protected-access
+  return _defaults["distribution_strategy"]
+
+
+def _get_default_tower_context():
+  if _defaults["tower_context"] is None:
+    _defaults["tower_context"] = distribute_lib.TowerContext(
+        _get_default_distribution_strategy(), tower_id=0)
+  return _defaults["tower_context"]
+
+
+def _get_default_tower_mode():
+  if _defaults["tower_mode"] is None:
+    _defaults["tower_mode"] = _DefaultTowerThreadMode()
+  return _defaults["tower_mode"]
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index caa26581e8..0d6207f8c4 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -15,7 +15,8 @@
 
 """Input pipeline.
 
-Please see the @{$reading_data$reading data how-to}
+Please see the [reading data
+how-to](https://tensorflow.org/api_guides/python/reading_data)
 for context.
 """
 
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 7b06bffa4b..c077630de2 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -25,6 +25,7 @@ import sys
 import six
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import distribute_coordinator_context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -284,6 +285,63 @@ class Scaffold(object):
         resources.initialize_resources(resources.local_resources()))
 
 
+def _create_monitored_session_with_worker_context(worker_context,  # pylint: disable=missing-docstring
+                                                  scaffold,
+                                                  checkpoint_dir=None,
+                                                  hooks=None,
+                                                  chief_only_hooks=None,
+                                                  save_checkpoint_secs=None,
+                                                  save_summaries_steps=None,
+                                                  save_summaries_secs=None,
+                                                  config=None,
+                                                  stop_grace_period_secs=120,
+                                                  log_step_count_steps=100,
+                                                  max_wait_secs=7200,
+                                                  save_checkpoint_steps=None,
+                                                  summary_dir=None):
+  all_hooks = []
+  if hooks:
+    all_hooks.extend(hooks)
+  if chief_only_hooks and worker_context.is_chief:
+    all_hooks.extend(chief_only_hooks)
+
+  summary_dir = summary_dir or checkpoint_dir
+  if summary_dir and worker_context.should_save_summary:
+    if log_step_count_steps and log_step_count_steps > 0:
+      all_hooks.append(
+          basic_session_run_hooks.StepCounterHook(
+              output_dir=summary_dir, every_n_steps=log_step_count_steps))
+
+    if (save_summaries_steps and save_summaries_steps > 0) or (
+        save_summaries_secs and save_summaries_secs > 0):
+      all_hooks.append(
+          basic_session_run_hooks.SummarySaverHook(
+              scaffold=scaffold,
+              save_steps=save_summaries_steps,
+              save_secs=save_summaries_secs,
+              output_dir=summary_dir))
+
+  if checkpoint_dir and worker_context.should_checkpoint:
+    if (save_checkpoint_secs and save_checkpoint_secs > 0) or (
+        save_checkpoint_steps and save_checkpoint_steps > 0):
+      all_hooks.append(
+          basic_session_run_hooks.CheckpointSaverHook(
+              checkpoint_dir,
+              save_steps=save_checkpoint_steps,
+              save_secs=save_checkpoint_secs,
+              scaffold=scaffold))
+
+  session_creator = worker_context.session_creator(
+      scaffold,
+      config=config,
+      checkpoint_dir=checkpoint_dir,
+      max_wait_secs=max_wait_secs)
+  return MonitoredSession(
+      session_creator=session_creator,
+      hooks=all_hooks,
+      stop_grace_period_secs=stop_grace_period_secs)
+
+
 @tf_export('train.MonitoredTrainingSession')
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              is_chief=True,
@@ -373,14 +431,35 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
     save_checkpoint_steps = None
 
   scaffold = scaffold or Scaffold()
+  worker_context = distribute_coordinator_context.get_current_worker_context()
+
+  if worker_context:
+    return _create_monitored_session_with_worker_context(
+        worker_context,
+        scaffold,
+        checkpoint_dir=checkpoint_dir,
+        hooks=hooks,
+        chief_only_hooks=chief_only_hooks,
+        save_checkpoint_secs=save_checkpoint_secs,
+        save_summaries_steps=save_summaries_steps,
+        save_summaries_secs=save_summaries_secs,
+        config=config,
+        stop_grace_period_secs=stop_grace_period_secs,
+        log_step_count_steps=log_step_count_steps,
+        max_wait_secs=max_wait_secs,
+        save_checkpoint_steps=save_checkpoint_steps,
+        summary_dir=summary_dir)
+
   if not is_chief:
     session_creator = WorkerSessionCreator(
         scaffold=scaffold,
         master=master,
         config=config,
         max_wait_secs=max_wait_secs)
-    return MonitoredSession(session_creator=session_creator, hooks=hooks or [],
-                            stop_grace_period_secs=stop_grace_period_secs)
+    return MonitoredSession(
+        session_creator=session_creator,
+        hooks=hooks or [],
+        stop_grace_period_secs=stop_grace_period_secs)
 
   all_hooks = []
   if chief_only_hooks:
@@ -400,25 +479,29 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
 
     if (save_summaries_steps and save_summaries_steps > 0) or (
         save_summaries_secs and save_summaries_secs > 0):
-      all_hooks.append(basic_session_run_hooks.SummarySaverHook(
-          scaffold=scaffold,
-          save_steps=save_summaries_steps,
-          save_secs=save_summaries_secs,
-          output_dir=summary_dir))
+      all_hooks.append(
+          basic_session_run_hooks.SummarySaverHook(
+              scaffold=scaffold,
+              save_steps=save_summaries_steps,
+              save_secs=save_summaries_secs,
+              output_dir=summary_dir))
 
   if checkpoint_dir:
     if (save_checkpoint_secs and save_checkpoint_secs > 0) or (
         save_checkpoint_steps and save_checkpoint_steps > 0):
-      all_hooks.append(basic_session_run_hooks.CheckpointSaverHook(
-          checkpoint_dir,
-          save_steps=save_checkpoint_steps,
-          save_secs=save_checkpoint_secs,
-          scaffold=scaffold))
+      all_hooks.append(
+          basic_session_run_hooks.CheckpointSaverHook(
+              checkpoint_dir,
+              save_steps=save_checkpoint_steps,
+              save_secs=save_checkpoint_secs,
+              scaffold=scaffold))
 
   if hooks:
     all_hooks.extend(hooks)
-  return MonitoredSession(session_creator=session_creator, hooks=all_hooks,
-                          stop_grace_period_secs=stop_grace_period_secs)
+  return MonitoredSession(
+      session_creator=session_creator,
+      hooks=all_hooks,
+      stop_grace_period_secs=stop_grace_period_secs)
 
 
 @tf_export('train.SessionCreator')
@@ -546,6 +629,11 @@ class _MonitoredSession(object):
     self._hooks = hooks or []
     for h in self._hooks:
       h.begin()
+
+    worker_context = distribute_coordinator_context.get_current_worker_context()
+    if not session_creator and worker_context:
+      session_creator = worker_context.session_creator()
+
     # Create the session.
     self._coordinated_creator = self._CoordinatedSessionCreator(
         session_creator=session_creator or ChiefSessionCreator(),
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 92533ca4f3..ff586b6c03 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -32,6 +32,7 @@ from tensorflow.contrib.testing.python.framework import util_test
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import debug_pb2
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.distribute import distribute_coordinator
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -381,6 +382,119 @@ class MonitoredTrainingSessionTest(test.TestCase):
         self.assertEqual(0, session.run(gstep))
 
 
+class MockStrategy(object):
+
+  def __init__(self,
+               between_graph=False,
+               should_init=True,
+               should_checkpoint=None,
+               should_save_summary=None):
+    self._between_graph = between_graph
+    self._should_init = should_init
+    self._should_checkpoint = should_checkpoint
+    self._should_save_summary = should_save_summary
+
+  @property
+  def between_graph(self):
+    return self._between_graph
+
+  @property
+  def should_init(self):
+    return self._should_init
+
+  @property
+  def should_checkpoint(self):
+    return self._should_checkpoint
+
+  @property
+  def should_save_summary(self):
+    return self._should_save_summary
+
+
+class MonitoredTrainingSessionWithDistributeCoordinatorTest(test.TestCase):
+  """Test distribute coordinator controls summary saving and checkpointing."""
+
+  def test_summary_hook_enabled(self):
+    context = distribute_coordinator._WorkerContext(
+        MockStrategy(should_save_summary=True), None, None, None)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_summaries_enabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      summary.scalar('my_summary_tag', new_gstep * 2)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_summaries_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(101):
+          session.run(new_gstep)
+
+    summaries = util_test.latest_summaries(logdir)
+    tags = [s.summary.value[0].tag for s in summaries]
+    self.assertIn('my_summary_tag', tags)
+    self.assertIn('global_step/sec', tags)
+
+  def test_summary_hook_disabled(self):
+    context = distribute_coordinator._WorkerContext(
+        MockStrategy(should_save_summary=False), None, None, None)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_summaries_disabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      summary.scalar('my_summary_tag', new_gstep * 2)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_summaries_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(101):
+          session.run(new_gstep)
+
+    # No summary is saved.
+    summaries = util_test.latest_summaries(logdir)
+    self.assertEqual(len(summaries), 0)
+
+  def test_checkpoint_hook_enabled(self):
+    context = distribute_coordinator._WorkerContext(
+        MockStrategy(should_checkpoint=True), None, None, None)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_enabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_checkpoint_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(100):
+          session.run(new_gstep)
+
+      # A restart will find the checkpoint and recover automatically.
+      with monitored_session.MonitoredTrainingSession(
+          is_chief=True, checkpoint_dir=logdir) as session:
+        self.assertEqual(100, session.run(gstep))
+
+  def test_checkpoint_hook_disabled(self):
+    context = distribute_coordinator._WorkerContext(
+        MockStrategy(should_checkpoint=False), None, None, None)
+
+    logdir = _test_dir(self.get_temp_dir(), 'test_save_checkpoint_disabled')
+    with ops.Graph().as_default():
+      gstep = variables_lib.get_or_create_global_step()
+      new_gstep = state_ops.assign_add(gstep, 1)
+      with context, monitored_session.MonitoredTrainingSession(
+          checkpoint_dir=logdir,
+          save_checkpoint_steps=100,
+          log_step_count_steps=10) as session:
+        for _ in range(100):
+          session.run(new_gstep)
+
+    # No checkpoint is saved.
+    checkpoint = checkpoint_management.latest_checkpoint(logdir)
+    self.assertIsNone(checkpoint)
+
+
 class StopAtNSession(monitored_session._WrappedSession):
   """A wrapped session that stops at the N-th call to _check_stop."""
 
@@ -1365,8 +1479,8 @@ class MonitoredSessionTest(test.TestCase):
       with monitored_session.MonitoredSession(
           session_creator=monitored_session.ChiefSessionCreator(
               scaffold,
-              checkpoint_filename_with_path=
-              checkpoint_management.latest_checkpoint(logdir))) as session:
+              checkpoint_filename_with_path=checkpoint_management.
+              latest_checkpoint(logdir))) as session:
         self.assertEqual(2, session.run(gstep))
 
   def test_retry_initialization_on_aborted_error(self):
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 4b91d1e963..177a7ddfa5 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -363,10 +363,12 @@ class ExponentialMovingAverage(object):
     `GraphKeys.ALL_VARIABLES` collection.  They will be returned by calls to
     `tf.global_variables()`.
 
-    Returns an op that updates all shadow variables as described above.
+    Returns an op that updates all shadow variables from the current value of
+    their associated variables.
 
-    Note that `apply()` can be called multiple times with different lists of
-    variables.
+    Note that `apply()` can be called multiple times. When eager execution is
+    enabled each call to apply will update the variables once, so this needs to
+    be called in a loop.
 
     Args:
       var_list: A list of Variable or Tensor objects. The variables
@@ -389,31 +391,30 @@ class ExponentialMovingAverage(object):
                                       dtypes.float64]:
         raise TypeError("The variables must be half, float, or double: %s" %
                         var.name)
-      if var in self._averages:
-        raise ValueError("Moving average already computed for: %s" % var.name)
 
-      # For variables: to lower communication bandwidth across devices we keep
-      # the moving averages on the same device as the variables. For other
-      # tensors, we rely on the existing device allocation mechanism.
-      with ops.init_scope():
-        if isinstance(var, variables.Variable):
-          avg = slot_creator.create_slot(var,
-                                         var.initialized_value(),
-                                         self.name,
-                                         colocate_with_primary=True)
-          # NOTE(mrry): We only add `tf.Variable` objects to the
-          # `MOVING_AVERAGE_VARIABLES` collection.
-          ops.add_to_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, var)
-        else:
-          avg = slot_creator.create_zeros_slot(
-              var,
-              self.name,
-              colocate_with_primary=(var.op.type in ["Variable",
-                                                     "VariableV2",
-                                                     "VarHandleOp"]))
-          if self._zero_debias:
-            zero_debias_true.add(avg)
-      self._averages[var] = avg
+      if var not in self._averages:
+        # For variables: to lower communication bandwidth across devices we keep
+        # the moving averages on the same device as the variables. For other
+        # tensors, we rely on the existing device allocation mechanism.
+        with ops.init_scope():
+          if isinstance(var, variables.Variable):
+            avg = slot_creator.create_slot(var,
+                                           var.initialized_value(),
+                                           self.name,
+                                           colocate_with_primary=True)
+            # NOTE(mrry): We only add `tf.Variable` objects to the
+            # `MOVING_AVERAGE_VARIABLES` collection.
+            ops.add_to_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, var)
+          else:
+            avg = slot_creator.create_zeros_slot(
+                var,
+                self.name,
+                colocate_with_primary=(var.op.type in ["Variable",
+                                                       "VariableV2",
+                                                       "VarHandleOp"]))
+            if self._zero_debias:
+              zero_debias_true.add(avg)
+        self._averages[var] = avg
 
     with ops.name_scope(self.name) as scope:
       decay = ops.convert_to_tensor(self._decay, name="decay")
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index 3e85e6bfa7..fdb8d795c3 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -18,9 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import variable_scope
@@ -254,6 +256,25 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual(1, sess.run(v0))
       self.assertEqual([17.5], sess.run(v1_avg))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testBasicEager(self):
+    v0 = variables.Variable(1.0)
+    v1 = variables.Variable(2.0)
+
+    ema = moving_averages.ExponentialMovingAverage(0.25)
+    op = ema.apply([v0, v1])
+    if not context.executing_eagerly():
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(op)
+
+    self.evaluate(v0.assign(2.0))
+    self.evaluate(v1.assign(4.0))
+
+    self.evaluate(ema.apply([v0, v1]))
+
+    self.assertAllEqual(self.evaluate(ema.average(v0)), 1.75)
+    self.assertAllEqual(self.evaluate(ema.average(v1)), 3.5)
+
   def averageVariablesNamesHelper(self, zero_debias):
     with self.test_session():
       v0 = variables.Variable(10.0, name="v0")
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 6d95b144d5..2304a461c1 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
@@ -464,7 +465,8 @@ class Optimizer(
         # TODO(josh11b): Test that we handle weight decay in a reasonable way.
         if (distribute_lib.get_loss_reduction() ==
             variable_scope.VariableAggregation.MEAN):
-          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          num_towers = distribution_strategy_context.get_distribution_strategy(
+          ).num_towers
           if num_towers > 1:
             loss_value *= (1. / num_towers)
 
@@ -482,7 +484,8 @@ class Optimizer(
     # Scale loss if using a "mean" loss reduction and multiple towers.
     if (distribute_lib.get_loss_reduction() ==
         variable_scope.VariableAggregation.MEAN):
-      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      num_towers = distribution_strategy_context.get_distribution_strategy(
+      ).num_towers
       if num_towers > 1:
         loss *= (1. / num_towers)
 
@@ -548,15 +551,15 @@ class Optimizer(
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
     # Handle DistributionStrategy case.
-    if distribute_lib.get_cross_tower_context():
+    if distribution_strategy_context.get_cross_tower_context():
       raise RuntimeError("Use `_distributed_apply()` instead of "
                          "`apply_gradients()` in a cross-tower context.")
     # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
     # always calling _distributed_apply(), using the default distribution
     # as needed.
-    if distribute_lib.has_distribution_strategy():
+    if distribution_strategy_context.has_distribution_strategy():
       grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
-      return distribute_lib.get_tower_context().merge_call(
+      return distribution_strategy_context.get_tower_context().merge_call(
           self._distributed_apply, grads_and_vars, global_step, name)
 
     # No DistributionStrategy case.
@@ -769,16 +772,15 @@ class Optimizer(
     Returns:
       A list of variables.
     """
-    executing_eagerly = context.executing_eagerly()
     current_graph = ops.get_default_graph()
 
     def _from_current_graph(variable):
-      if executing_eagerly:
+      if variable._in_graph_mode:  # pylint: disable=protected-access
+        return variable.op.graph is current_graph
+      else:
         # No variable.op in eager mode. We don't expect lots of eager graphs,
         # but behavior should be consistent with graph mode.
         return variable._graph_key == current_graph._graph_key  # pylint: disable=protected-access
-      else:
-        return variable.op.graph is current_graph
 
     optimizer_variables = [v for v in self._non_slot_variables()
                            if _from_current_graph(v)]
@@ -799,7 +801,8 @@ class Optimizer(
     v = self._non_slot_dict.get(key, None)
     if v is None:
       self._maybe_initialize_checkpointable()
-      distribution_strategy = distribute_lib.get_distribution_strategy()
+      distribution_strategy = (
+          distribution_strategy_context.get_distribution_strategy())
       with distribution_strategy.colocate_vars_with(colocate_with):
         if eager:
           restored_initial_value = self._preload_simple_restoration(
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index ac26e75bb9..900f9706ac 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -303,7 +303,7 @@ class QueueRunnerTest(test.TestCase):
       init_op = variables.global_variables_initializer()
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
       queue_runner_impl.add_queue_runner(qr)
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       init_op.run()
       threads = queue_runner_impl.start_queue_runners(sess)
       for t in threads:
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 04fce496bd..274c856686 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -809,6 +809,22 @@ class BaseSaverBuilder(object):
           keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
           version=self._write_version)
     else:
+      graph = ops.get_default_graph()
+      # Do some sanity checking on collections containing
+      # PartitionedVariables. If a saved collection has a PartitionedVariable,
+      # the GraphDef needs to include concat ops to get the value (or there'll
+      # be a lookup error on load).
+      check_collection_list = graph.get_all_collection_keys()
+      for collection_type in check_collection_list:
+        for element in graph.get_collection(collection_type):
+          if isinstance(element, variables.PartitionedVariable):
+            try:
+              graph.get_operation_by_name(element.name)
+            except KeyError:
+              # Create a concat op for this PartitionedVariable. The user may
+              # not need it, but we'll try looking it up on MetaGraph restore
+              # since it's in a collection.
+              element.as_tensor()
       return saver_pb2.SaverDef(
           filename_tensor_name=filename_tensor.name,
           save_tensor_name=save_tensor.name,
@@ -869,7 +885,7 @@ def _get_saver_or_default():
 class Saver(object):
   """Saves and restores variables.
 
-  See @{$variables$Variables}
+  See [Variables](https://tensorflow.org/guide/variables)
   for an overview of variables, saving and restoring.
 
   The `Saver` class adds ops to save and restore variables to and from
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index b55e64122a..f5b2a22327 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -84,7 +84,7 @@ class SaverTest(test.TestCase):
   def basicSaveRestore(self, variable_op):
     save_path = os.path.join(self.get_temp_dir(), "basic_save_restore")
 
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       # Build a graph with 2 parameter nodes, and Save and
       # Restore nodes for them.
       v0 = variable_op(10.0, name="v0")
@@ -115,7 +115,7 @@ class SaverTest(test.TestCase):
 
     # Start a second session.  In that session the parameter nodes
     # have not been initialized either.
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       v0 = variable_op(-1.0, name="v0")
       v1 = variable_op(-1.0, name="v1")
       v2 = saver_test_utils.CheckpointedOp(name="v2")
@@ -137,7 +137,7 @@ class SaverTest(test.TestCase):
 
     # Build another graph with 2 nodes, initialized
     # differently, and a Restore node for them.
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       v0_2 = variable_op(1000.0, name="v0")
       v1_2 = variable_op(2000.0, name="v1")
       v2_2 = saver_test_utils.CheckpointedOp(name="v2")
@@ -222,7 +222,7 @@ class SaverTest(test.TestCase):
     # Save from graph mode and restore from eager mode.
     graph_ckpt_prefix = os.path.join(self.get_temp_dir(), "graph_ckpt")
     with context.graph_mode():
-      with self.test_session(graph=ops_lib.Graph()) as sess:
+      with self.session(graph=ops_lib.Graph()) as sess:
         # Create a graph model and save the checkpoint.
         w1 = resource_variable_ops.ResourceVariable(1.0, name="w1")
         w2 = resource_variable_ops.ResourceVariable(2.0, name="w2")
@@ -256,7 +256,7 @@ class SaverTest(test.TestCase):
       graph_saver.save(None, eager_ckpt_prefix)
 
     with context.graph_mode():
-      with self.test_session(graph=ops_lib.Graph()) as sess:
+      with self.session(graph=ops_lib.Graph()) as sess:
         w3 = resource_variable_ops.ResourceVariable(0.0, name="w3")
         w4 = resource_variable_ops.ResourceVariable(0.0, name="w4")
         graph_saver = saver_module.Saver([w3, w4])
@@ -268,7 +268,7 @@ class SaverTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testResourceSaveRestoreCachingDevice(self):
     save_path = os.path.join(self.get_temp_dir(), "resource_cache")
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       v = resource_variable_ops.ResourceVariable([1], caching_device="/cpu:0",
                                                  name="v")
       if context.executing_eagerly():
@@ -465,7 +465,7 @@ class SaverTest(test.TestCase):
   def testBasicsWithListOfVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "basics_with_list")
 
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       # Build a graph with 2 parameter nodes, and Save and
       # Restore nodes for them.
       v0 = variables.Variable(10.0, name="v0")
@@ -489,7 +489,7 @@ class SaverTest(test.TestCase):
 
     # Start a second session.  In that session the variables
     # have not been initialized either.
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       v0 = variables.Variable(-1.0, name="v0")
       v1 = variables.Variable(-1.0, name="v1")
       v2 = saver_test_utils.CheckpointedOp(name="v2")
@@ -514,7 +514,7 @@ class SaverTest(test.TestCase):
 
     # Build another graph with 2 nodes, initialized
     # differently, and a Restore node for them.
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       v0_2 = variables.Variable(1000.0, name="v0")
       v1_2 = variables.Variable(2000.0, name="v1")
       v2_2 = saver_test_utils.CheckpointedOp(name="v2")
@@ -536,14 +536,14 @@ class SaverTest(test.TestCase):
       self.assertEqual(30.0, v2_2.values().eval())
 
   def _SaveAndLoad(self, var_name, var_value, other_value, save_path):
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       var = resource_variable_ops.ResourceVariable(var_value, name=var_name)
       save = saver_module.Saver({var_name: var})
       if not context.executing_eagerly():
         self.evaluate(var.initializer)
       val = save.save(sess, save_path)
       self.assertEqual(save_path, val)
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       var = resource_variable_ops.ResourceVariable(other_value, name=var_name)
       save = saver_module.Saver({var_name: var})
       save.restore(sess, save_path)
@@ -693,7 +693,7 @@ class SaverTest(test.TestCase):
     # Save and reload one Variable named "var0".
     self._SaveAndLoad("var0", 0.0, 1.0, save_path)
     for use_tensor in [True, False]:
-      with self.test_session(graph=ops_lib.Graph()):
+      with self.session(graph=ops_lib.Graph()):
         var = resource_variable_ops.ResourceVariable(1.0, name="var0")
         save = saver_module.Saver(
             {
@@ -791,7 +791,7 @@ class SaverTest(test.TestCase):
       save_path = os.path.join(self.get_temp_dir(), "basic_save_restore")
 
       # Build the first session.
-      with self.test_session(graph=ops_lib.Graph()) as sess:
+      with self.session(graph=ops_lib.Graph()) as sess:
         v0 = variable_op(10.0, name="v0", dtype=dtypes.float32)
 
         if not context.executing_eagerly():
@@ -801,7 +801,7 @@ class SaverTest(test.TestCase):
         save.save(sess, save_path)
 
       # Start a second session.
-      with self.test_session(graph=ops_lib.Graph()) as sess:
+      with self.session(graph=ops_lib.Graph()) as sess:
         v0_wrong_dtype = variable_op(1, name="v0", dtype=dtypes.int32)
         # Restore the saved value with different dtype
         # in the parameter nodes.
@@ -822,7 +822,7 @@ class SaverTest(test.TestCase):
       return small_v + large_v
 
     save_graph = ops_lib.Graph()
-    with save_graph.as_default(), self.test_session(graph=save_graph) as sess:
+    with save_graph.as_default(), self.session(graph=save_graph) as sess:
       orig_vars = _model()
       sess.run(variables.global_variables_initializer())
       save = saver_module.Saver(max_to_keep=1)
@@ -999,7 +999,7 @@ class SaveRestoreShardedTest(test.TestCase):
     call_saver_with_dict = False  # updated by test loop below
 
     def _save(slices=None, partitioner=None):
-      with self.test_session(graph=ops_lib.Graph()) as sess:
+      with self.session(graph=ops_lib.Graph()) as sess:
         # Calls .eval() to return the ndarray that makes up the full variable.
         rnd = random_ops.random_uniform(var_full_shape).eval()
 
@@ -1036,7 +1036,7 @@ class SaveRestoreShardedTest(test.TestCase):
         return rnd
 
     def _restore(slices=None, partitioner=None):
-      with self.test_session(graph=ops_lib.Graph()) as sess:
+      with self.session(graph=ops_lib.Graph()) as sess:
         if slices:
           assert not partitioner
           new_vs = partitioned_variables.create_partitioned_variables(
@@ -1549,7 +1549,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
   def _testNonReshape(self, variable_op):
     save_path = os.path.join(self.get_temp_dir(), "non_reshape")
 
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       # Build a graph with 2 parameter nodes, and Save and
       # Restore nodes for them.
       v0 = variable_op(10.0, name="v0")
@@ -1574,7 +1574,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
 
     # Verify that the mapped names are present in the Saved file and can be
     # Restored using remapped names.
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       v0 = variable_op(-1.0, name="v0")
       v1 = variable_op(-1.0, name="v1")
 
@@ -1594,7 +1594,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
 
     # Add a prefix to the node names in the current graph and Restore using
     # remapped names.
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       v0 = variable_op(-1.0, name="restore_prefix/v0")
       v1 = variable_op(-1.0, name="restore_prefix/v1")
 
@@ -1709,7 +1709,7 @@ class MetaGraphTest(test.TestCase):
     filename = os.path.join(test_dir, "metafile")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
     saver1_ckpt = os.path.join(test_dir, "saver1.ckpt")
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       # Creates a graph.
       v0 = variables.Variable([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], name="v0")
       v1 = variables.Variable(11.0, name="v1")
@@ -1753,7 +1753,7 @@ class MetaGraphTest(test.TestCase):
     filename = os.path.join(test_dir, "metafile")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
     saver1_ckpt = os.path.join(test_dir, "saver1.ckpt")
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       # Imports from meta_graph.
       saver_module.import_meta_graph(filename)
       # Retrieves SAVERS collection. Verifies there are 2 entries.
@@ -1786,7 +1786,7 @@ class MetaGraphTest(test.TestCase):
     filename = os.path.join(test_dir, "metafile")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
     saver1_ckpt = os.path.join(test_dir, "saver1.ckpt")
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       # Creates a graph.
       v0 = variables.Variable([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], name="v0")
       v1 = variables.Variable(11.0, name="v1")
@@ -1838,25 +1838,25 @@ class MetaGraphTest(test.TestCase):
   def testBinaryAndTextFormat(self):
     test_dir = self._get_test_dir("binary_and_text")
     filename = os.path.join(test_dir, "metafile")
-    with self.test_session(graph=ops_lib.Graph()):
+    with self.session(graph=ops_lib.Graph()):
       # Creates a graph.
       variables.Variable(10.0, name="v0")
       # Exports the graph as binary format.
       saver_module.export_meta_graph(filename, as_text=False)
-    with self.test_session(graph=ops_lib.Graph()):
+    with self.session(graph=ops_lib.Graph()):
       # Imports the binary format graph.
       saver = saver_module.import_meta_graph(filename)
       self.assertIsNotNone(saver)
       # Exports the graph as text format.
       saver.export_meta_graph(filename, as_text=True)
-    with self.test_session(graph=ops_lib.Graph()):
+    with self.session(graph=ops_lib.Graph()):
       # Imports the text format graph.
       saver_module.import_meta_graph(filename)
       # Writes wrong contents to the file.
       graph_io.write_graph(saver.as_saver_def(),
                            os.path.dirname(filename),
                            os.path.basename(filename))
-    with self.test_session(graph=ops_lib.Graph()):
+    with self.session(graph=ops_lib.Graph()):
       # Import should fail.
       with self.assertRaisesWithPredicateMatch(IOError,
                                                lambda e: "Cannot parse file"):
@@ -1961,7 +1961,7 @@ class MetaGraphTest(test.TestCase):
     filename = os.path.join(test_dir, "metafile")
     train_filename = os.path.join(test_dir, "train_metafile")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       # Restores from MetaGraphDef.
       new_saver = saver_module.import_meta_graph(filename)
       # Generates a new MetaGraphDef.
@@ -1998,7 +1998,7 @@ class MetaGraphTest(test.TestCase):
   def _testRestoreFromTrainGraphWithControlContext(self, test_dir):
     train_filename = os.path.join(test_dir, "train_metafile")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       # Restores from MetaGraphDef.
       new_saver = saver_module.import_meta_graph(train_filename)
       # Restores from checkpoint.
@@ -2177,7 +2177,7 @@ class MetaGraphTest(test.TestCase):
     # With strip_default_attrs disabled, attributes "T" (float32) and "Tout"
     # (complex64) in the "Complex" op must *not* be removed, even if they map
     # to their defaults.
-    with self.test_session(graph=ops_lib.Graph()):
+    with self.session(graph=ops_lib.Graph()):
       real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
       imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
@@ -2541,7 +2541,7 @@ class ScopedGraphTest(test.TestCase):
           export_scope="hidden1")
       self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))
 
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       sess.run(variables.global_variables_initializer())
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
       saver.save(sess, os.path.join(test_dir, ckpt_filename), write_state=False)
@@ -2601,7 +2601,7 @@ class ScopedGraphTest(test.TestCase):
           set(variables.global_variables()) - set(var_list.keys()))
       init_rest_op = variables.variables_initializer(rest_variables)
 
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
       saver.restore(sess, os.path.join(test_dir, ckpt_filename))
       # Verify that we have restored weights1 and biases1.
@@ -2635,7 +2635,7 @@ class ScopedGraphTest(test.TestCase):
         nn_ops.relu(math_ops.matmul(images, weights1) + biases1, name="relu")
 
     # Run the graph and save scoped checkpoint.
-    with self.test_session(graph=graph1) as sess:
+    with self.session(graph=graph1) as sess:
       sess.run(variables.global_variables_initializer())
       _, var_list_1 = meta_graph.export_scoped_meta_graph(
           export_scope="hidden1")
@@ -2656,7 +2656,7 @@ class ScopedGraphTest(test.TestCase):
       var_list_2 = meta_graph.copy_scoped_meta_graph(
           from_scope="hidden1", to_scope="hidden2")
 
-    with self.test_session(graph=graph1) as sess:
+    with self.session(graph=graph1) as sess:
       saver1 = saver_module.Saver(var_list=var_list_1, max_to_keep=1)
       saver1.restore(sess, saver0_ckpt)
       saver2 = saver_module.Saver(var_list=var_list_2, max_to_keep=1)
@@ -2672,7 +2672,7 @@ class ScopedGraphTest(test.TestCase):
         from_graph=graph1,
         to_graph=graph2)
 
-    with self.test_session(graph=graph2) as sess:
+    with self.session(graph=graph2) as sess:
       saver3 = saver_module.Saver(var_list=new_var_list_1, max_to_keep=1)
       saver3.restore(sess, saver0_ckpt)
       self.assertAllClose(expected, sess.run("new_hidden1/relu:0"))
@@ -2691,7 +2691,7 @@ class ScopedGraphTest(test.TestCase):
         nn_ops.relu(math_ops.matmul(images, weights1) + biases1, name="relu")
 
     # Run the graph and save scoped checkpoint.
-    with self.test_session(graph=graph1) as sess:
+    with self.session(graph=graph1) as sess:
       sess.run(variables.global_variables_initializer())
       _, var_list_1 = meta_graph.export_scoped_meta_graph(
           graph_def=graph1.as_graph_def(), export_scope="hidden1")
@@ -2708,7 +2708,7 @@ class ScopedGraphTest(test.TestCase):
         from_graph=graph1,
         to_graph=graph2)
 
-    with self.test_session(graph=graph2) as sess:
+    with self.session(graph=graph2) as sess:
       saver3 = saver_module.Saver(var_list=new_var_list_1, max_to_keep=1)
       saver3.restore(sess, saver0_ckpt)
       self.assertAllClose(expected, sess.run("new_hidden1/relu:0"))
@@ -2729,7 +2729,7 @@ class ScopedGraphTest(test.TestCase):
       saver2 = saver_module.Saver(var_list=[variable2], name="hidden2/")
       graph.add_to_collection(ops_lib.GraphKeys.SAVERS, saver2)
 
-    with self.test_session(graph=graph) as sess:
+    with self.session(graph=graph) as sess:
       variables.global_variables_initializer().run()
       saver1.save(sess, saver1_ckpt, write_state=False)
       saver2.save(sess, saver2_ckpt, write_state=False)
@@ -2745,7 +2745,7 @@ class ScopedGraphTest(test.TestCase):
     saver_list1 = graph1.get_collection(ops_lib.GraphKeys.SAVERS)
     self.assertEqual(1, len(saver_list1))
 
-    with self.test_session(graph=graph1) as sess:
+    with self.session(graph=graph1) as sess:
       saver_list1[0].restore(sess, saver1_ckpt)
       self.assertEqual(1.0, var_dict1["variable1:0"].eval())
 
@@ -2760,7 +2760,7 @@ class ScopedGraphTest(test.TestCase):
     saver_list2 = graph2.get_collection(ops_lib.GraphKeys.SAVERS)
     self.assertEqual(1, len(saver_list2))
 
-    with self.test_session(graph=graph2) as sess:
+    with self.session(graph=graph2) as sess:
       saver_list2[0].restore(sess, saver2_ckpt)
       self.assertEqual(2.0, var_dict2["variable2:0"].eval())
 
@@ -2853,8 +2853,8 @@ class CheckpointableCompatibilityTests(test.TestCase):
     saver = saver_module.Saver(var_list=[v])
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
-    self.evaluate(v.non_dep_variable.assign(42.))
     with self.test_session() as sess:
+      self.evaluate(v.non_dep_variable.assign(42.))
       save_path = saver.save(sess, prefix)
       self.evaluate(v.non_dep_variable.assign(43.))
       saver.restore(sess, save_path)
@@ -2979,14 +2979,14 @@ class CheckpointableCompatibilityTests(test.TestCase):
       a = variables.Variable(1., name="a")
       a_saver = saver_module.Saver([a])
 
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sess.run(a.initializer)
         save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
 
     with ops_lib.Graph().as_default() as g:
       a = variables.Variable([1.], name="a")
       a_saver = saver_module.Saver([a])
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         with self.assertRaisesRegexp(
             errors.InvalidArgumentError,
             "a mismatch between the current graph and the graph"):
@@ -2997,7 +2997,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
 
     save_graph = ops_lib.Graph()
-    with save_graph.as_default(), self.test_session(graph=save_graph) as sess:
+    with save_graph.as_default(), self.session(graph=save_graph) as sess:
       root = self._initialized_model()
       object_saver = checkpointable_utils.CheckpointableSaver(root)
       save_path = object_saver.save(file_prefix=checkpoint_prefix)
@@ -3031,7 +3031,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
 
     save_graph = ops_lib.Graph()
-    with save_graph.as_default(), self.test_session(graph=save_graph):
+    with save_graph.as_default(), self.session(graph=save_graph):
       root = self._initialized_model()
       object_saver = checkpointable_utils.CheckpointableSaver(root)
       save_path = object_saver.save(file_prefix=checkpoint_prefix)
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index 258a6f045d..d76b22acd8 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -45,7 +45,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import distribution_strategy_context
 
 
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
@@ -112,7 +112,8 @@ def create_slot(primary, val, name, colocate_with_primary=True):
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = distribute_lib.get_distribution_strategy()
+      distribution_strategy = (
+          distribution_strategy_context.get_distribution_strategy())
       with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
     else:
@@ -149,7 +150,8 @@ def create_slot_with_initializer(primary, initializer, shape, dtype, name,
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = distribute_lib.get_distribution_strategy()
+      distribution_strategy = (
+          distribution_strategy_context.get_distribution_strategy())
       with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, initializer, "", validate_shape, shape,
                                 dtype)
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 0c6cf910d1..7afaa92699 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -53,7 +53,7 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
   which replicas can fetch the new variables and continue.
 
   The following accumulators/queue are created:
-  <empty line>
+
   * N `gradient accumulators`, one per variable to train. Gradients are pushed
     to them and the chief worker will wait until enough gradients are collected
     and then average them before applying to variables. The accumulator will
@@ -68,7 +68,7 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
   The optimizer adds nodes to the graph to collect gradients and pause the
   trainers until variables are updated.
   For the Parameter Server job:
-  <empty line>
+
   1. An accumulator is created for each variable, and each replica pushes the
      gradients into the accumulators instead of directly applying them to the
      variables.
@@ -81,7 +81,7 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
      update its local_step variable and start the next batch.
 
   For the replicas:
-  <empty line>
+
   1. Start a step: fetch variables and compute gradients.
   2. Once the gradients have been computed, push them into gradient
      accumulators. Each accumulator will check the staleness and drop the stale.
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 6f6305a505..686c4be31a 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -15,7 +15,7 @@
 
 """Support for training models.
 
-See the @{$python/train} guide.
+See the [Training](https://tensorflow.org/api_guides/python/train) guide.
 """
 
 # Optimizers.
diff --git a/tensorflow/python/training/warm_starting_util.py b/tensorflow/python/training/warm_starting_util.py
index 0ba7ba983d..c0dd46bfa5 100644
--- a/tensorflow/python/training/warm_starting_util.py
+++ b/tensorflow/python/training/warm_starting_util.py
@@ -32,7 +32,7 @@ from tensorflow.python.training import saver
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.VocabInfo", allow_multiple_exports=True)
+@tf_export("train.VocabInfo")
 class VocabInfo(
     collections.namedtuple("VocabInfo", [
         "new_vocab",
diff --git a/tensorflow/python/training/warm_starting_util_test.py b/tensorflow/python/training/warm_starting_util_test.py
index 6a4c207d79..70a84bc3f6 100644
--- a/tensorflow/python/training/warm_starting_util_test.py
+++ b/tensorflow/python/training/warm_starting_util_test.py
@@ -59,7 +59,7 @@ class WarmStartingUtilTest(test.TestCase):
                            initializer=None,
                            partitioner=None):
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         var = variable_scope.get_variable(
             var_name,
             shape=shape,
@@ -102,7 +102,7 @@ class WarmStartingUtilTest(test.TestCase):
         "fruit_weights", initializer=[[0.5], [1.], [1.5], [2.]])
 
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
         ws_util._warm_start_var(fruit_weights, self.get_temp_dir())
@@ -118,7 +118,7 @@ class WarmStartingUtilTest(test.TestCase):
     prev_val = np.concatenate([weights[0], weights[1]], axis=0)
 
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
         ws_util._warm_start_var(fruit_weights, self.get_temp_dir())
@@ -130,7 +130,7 @@ class WarmStartingUtilTest(test.TestCase):
         "fruit_weights", initializer=[[0.5], [1.], [1.5], [2.]])
 
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights",
             shape=[4, 1],
@@ -154,7 +154,7 @@ class WarmStartingUtilTest(test.TestCase):
     prev_val = np.concatenate([weights[0], weights[1]], axis=0)
     # New session and new graph.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "new_scope/fruit_weights",
             shape=[4, 1],
@@ -183,7 +183,7 @@ class WarmStartingUtilTest(test.TestCase):
         ["orange", "guava", "banana", "apple", "raspberry"], "new_vocab")
     # New session and new graph.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
@@ -203,7 +203,7 @@ class WarmStartingUtilTest(test.TestCase):
         ["orange", "guava", "banana", "apple", "raspberry"], "new_vocab")
     # New session and new graph.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(
@@ -232,7 +232,7 @@ class WarmStartingUtilTest(test.TestCase):
         ["orange", "guava", "banana", "apple", "raspberry"], "new_vocab")
     # New session and new graph.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
@@ -252,7 +252,7 @@ class WarmStartingUtilTest(test.TestCase):
         ["orange", "guava", "banana", "apple", "raspberry"], "new_vocab")
     # New session and new graph.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights",
             shape=[6, 1],
@@ -289,7 +289,7 @@ class WarmStartingUtilTest(test.TestCase):
          "blueberry"], "new_vocab")
     # New session and new graph.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights",
             shape=[6, 1],
@@ -315,7 +315,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         # Initialize with zeros.
         var = variable_scope.get_variable(
             "v1",
@@ -335,7 +335,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         # Initialize with zeros.
         var = variable_scope.get_variable(
             "v1",
@@ -359,7 +359,7 @@ class WarmStartingUtilTest(test.TestCase):
     partitioner = lambda shape, dtype: [1] * len(shape)
     # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
         sess.run(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
@@ -369,7 +369,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=".*sc_int.*")
         sess.run(variables.global_variables_initializer())
@@ -388,7 +388,7 @@ class WarmStartingUtilTest(test.TestCase):
     partitioner = lambda shape, dtype: [1] * len(shape)
     # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
         sess.run(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
@@ -398,7 +398,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*")
@@ -422,7 +422,7 @@ class WarmStartingUtilTest(test.TestCase):
     partitioner = lambda shape, dtype: [1] * len(shape)
     # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         sess.run(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
@@ -432,7 +432,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         # Since old vocab is not explicitly set in WarmStartSettings, the old
         # vocab is assumed to be same as new vocab.
@@ -458,7 +458,7 @@ class WarmStartingUtilTest(test.TestCase):
     partitioner = lambda shape, dtype: [1] * len(shape)
     # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         sess.run(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
@@ -468,7 +468,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         # Since old vocab is not explicitly set in WarmStartSettings, the old
         # vocab is assumed to be same as new vocab.
@@ -503,7 +503,7 @@ class WarmStartingUtilTest(test.TestCase):
     partitioner = lambda shape, dtype: [1] * len(shape)
     # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         sess.run(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
@@ -513,7 +513,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         vocab_info = ws_util.VocabInfo(
             new_vocab=sc_vocab.vocabulary_file,
@@ -546,7 +546,7 @@ class WarmStartingUtilTest(test.TestCase):
     partitioner = lambda shape, dtype: [1] * len(shape)
     # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
         sess.run(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
@@ -556,7 +556,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*real_bucketized.*")
@@ -586,7 +586,7 @@ class WarmStartingUtilTest(test.TestCase):
     # Save checkpoint from which to warm-start.  Also create a bias variable,
     # so we can check that it's also warm-started.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         sc_int_weights = variable_scope.get_variable(
             "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
         sc_hash_weights = variable_scope.get_variable(
@@ -617,7 +617,7 @@ class WarmStartingUtilTest(test.TestCase):
     partitioner = lambda shape, dtype: [1] * len(shape)
     # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
         sess.run(variables.global_variables_initializer())
         # Without warm-starting, all weights should be initialized using default
@@ -633,7 +633,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
         vocab_info = ws_util.VocabInfo(
             new_vocab=sc_vocab.vocabulary_file,
@@ -675,7 +675,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # Save checkpoint from which to warm-start.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         variable_scope.get_variable(
             "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
         sc_keys_weights = variable_scope.get_variable(
@@ -694,7 +694,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
         vocab_info = ws_util.VocabInfo(
             new_vocab=sc_vocab.vocabulary_file,
@@ -743,7 +743,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # Save checkpoint from which to warm-start.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         variable_scope.get_variable(
             "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
         sc_keys_weights = variable_scope.get_variable(
@@ -756,7 +756,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols,
                                                  partitioner=None)
         vocab_info = ws_util.VocabInfo(
@@ -802,7 +802,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # Save checkpoint from which to warm-start.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         variable_scope.get_variable(
             "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
         variable_scope.get_variable(
@@ -820,7 +820,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
         vocab_info = ws_util.VocabInfo(
             new_vocab=sc_vocab.vocabulary_file,
@@ -866,7 +866,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # Save checkpoint from which to warm-start.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         variable_scope.get_variable(
             "input_layer/sc_vocab_embedding/embedding_weights",
             initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
@@ -887,7 +887,7 @@ class WarmStartingUtilTest(test.TestCase):
     all_deep_cols = [emb_vocab_column]
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = {}
         with variable_scope.variable_scope("", partitioner=_partitioner):
           # Create the variables.
@@ -933,7 +933,7 @@ class WarmStartingUtilTest(test.TestCase):
 
     # Save checkpoint from which to warm-start.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         variable_scope.get_variable(
             "linear_model/sc_vocab_embedding/embedding_weights",
             initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
@@ -957,7 +957,7 @@ class WarmStartingUtilTest(test.TestCase):
     all_deep_cols = [emb_vocab]
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
+      with self.session(graph=g) as sess:
         cols_to_vars = {}
         with variable_scope.variable_scope("", partitioner=_partitioner):
           # Create the variables.
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index faae0d89c3..2968ca9c07 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -62,6 +62,10 @@ def _is_namedtuple(instance, strict=False):
   return _pywrap_tensorflow.IsNamedtuple(instance, strict)
 
 
+# See the swig file (util.i) for documentation.
+_is_mapping = _pywrap_tensorflow.IsMapping
+
+
 def _sequence_like(instance, args):
   """Converts the sequence `args` to the same type as `instance`.
 
@@ -73,7 +77,7 @@ def _sequence_like(instance, args):
   Returns:
     `args` with the type of `instance`.
   """
-  if isinstance(instance, (dict, _collections.Mapping)):
+  if _is_mapping(instance):
     # Pack dictionaries in a deterministic order by sorting the keys.
     # Notice this means that we ignore the original order of `OrderedDict`
     # instances. This is intentional, to avoid potential bugs caused by mixing
@@ -89,7 +93,7 @@ def _sequence_like(instance, args):
 
 
 def _yield_value(iterable):
-  if isinstance(iterable, (dict, _collections.Mapping)):
+  if _is_mapping(iterable):
     # Iterate through dictionaries in a deterministic order by sorting the
     # keys. Notice this means that we ignore the original order of `OrderedDict`
     # instances. This is intentional, to avoid potential bugs caused by mixing
@@ -102,53 +106,16 @@ def _yield_value(iterable):
       yield value
 
 
-def is_sequence(seq):
-  """Returns a true if its input is a collections.Sequence (except strings).
+# See the swig file (util.i) for documentation.
+is_sequence = _pywrap_tensorflow.IsSequence
 
-  Args:
-    seq: an input sequence.
 
-  Returns:
-    True if the sequence is a not a string and is a collections.Sequence or a
-    dict.
-  """
-  return _pywrap_tensorflow.IsSequence(seq)
+# See the swig file (util.i) for documentation.
+flatten = _pywrap_tensorflow.Flatten
 
 
-def flatten(nest):
-  """Returns a flat list from a given nested structure.
-
-  If `nest` is not a sequence, tuple, or dict, then returns a single-element
-  list: `[nest]`.
-
-  In the case of dict instances, the sequence consists of the values, sorted by
-  key to ensure deterministic behavior. This is true also for `OrderedDict`
-  instances: their sequence order is ignored, the sorting order of keys is
-  used instead. The same convention is followed in `pack_sequence_as`. This
-  correctly repacks dicts and `OrderedDict`s after they have been flattened,
-  and also allows flattening an `OrderedDict` and then repacking it back using
-  a corresponding plain dict, or vice-versa.
-  Dictionaries with non-sortable keys cannot be flattened.
-
-  Users must not modify any collections used in `nest` while this function is
-  running.
-
-  Args:
-    nest: an arbitrarily nested structure or a scalar object. Note, numpy
-        arrays are considered scalars.
-
-  Returns:
-    A Python list, the flattened version of the input.
-
-  Raises:
-    TypeError: The nest is or contains a dict with non-sortable keys.
-  """
-  return _pywrap_tensorflow.Flatten(nest)
-
-
-def _same_namedtuples(nest1, nest2):
-  """Returns True if the two namedtuples have the same name and fields."""
-  return _pywrap_tensorflow.SameNamedtuples(nest1, nest2)
+# See the swig file (util.i) for documentation.
+_same_namedtuples = _pywrap_tensorflow.SameNamedtuples
 
 
 def assert_same_structure(nest1, nest2, check_types=True):
@@ -311,14 +278,17 @@ def pack_sequence_as(structure, flat_sequence):
                        % len(flat_sequence))
     return flat_sequence[0]
 
-  flat_structure = flatten(structure)
-  if len(flat_structure) != len(flat_sequence):
-    raise ValueError(
-        "Could not pack sequence. Structure had %d elements, but flat_sequence "
-        "had %d elements.  Structure: %s, flat_sequence: %s."
-        % (len(flat_structure), len(flat_sequence), structure, flat_sequence))
-
-  _, packed = _packed_nest_with_indices(structure, flat_sequence, 0)
+  try:
+    final_index, packed = _packed_nest_with_indices(structure, flat_sequence, 0)
+    if final_index < len(flat_sequence):
+      raise IndexError
+  except IndexError:
+    flat_structure = flatten(structure)
+    if len(flat_structure) != len(flat_sequence):
+      raise ValueError(
+          "Could not pack sequence. Structure had %d elements, but "
+          "flat_sequence had %d elements.  Structure: %s, flat_sequence: %s." %
+          (len(flat_structure), len(flat_sequence), structure, flat_sequence))
   return _sequence_like(structure, packed)
 
 
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index 274f32c21f..a5ac430ce7 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -136,11 +136,14 @@ class api_export(object):  # pylint: disable=invalid-name
           has no effect on exporting a constant.
         api_name: Name of the API you want to generate (e.g. `tensorflow` or
           `estimator`). Default is `tensorflow`.
+        allow_multiple_exports: Allow symbol to be exported multiple time under
+          different names.
     """
     self._names = args
     self._names_v1 = kwargs.get('v1', args)
     self._api_name = kwargs.get('api_name', TENSORFLOW_API_NAME)
     self._overrides = kwargs.get('overrides', [])
+    self._allow_multiple_exports = kwargs.get('allow_multiple_exports', False)
 
   def __call__(self, func):
     """Calls this decorator.
@@ -173,9 +176,10 @@ class api_export(object):  # pylint: disable=invalid-name
     # __dict__ instead of using hasattr to verify that subclasses have
     # their own _tf_api_names as opposed to just inheriting it.
     if api_names_attr in func.__dict__:
-      raise SymbolAlreadyExposedError(
-          'Symbol %s is already exposed as %s.' %
-          (func.__name__, getattr(func, api_names_attr)))  # pylint: disable=protected-access
+      if not self._allow_multiple_exports:
+        raise SymbolAlreadyExposedError(
+            'Symbol %s is already exposed as %s.' %
+            (func.__name__, getattr(func, api_names_attr)))  # pylint: disable=protected-access
     setattr(func, api_names_attr, names)
 
   def export_constant(self, module_name, name):
@@ -213,4 +217,5 @@ class api_export(object):  # pylint: disable=invalid-name
 
 
 tf_export = functools.partial(api_export, api_name=TENSORFLOW_API_NAME)
-estimator_export = functools.partial(tf_export, api_name=ESTIMATOR_API_NAME)
+estimator_export = functools.partial(
+    api_export, api_name=ESTIMATOR_API_NAME, allow_multiple_exports=True)
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index ebb72079ef..562bbdcfeb 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -470,12 +470,14 @@ void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
 // Leaves `error_msg` empty if structures matched. Else, fills `error_msg`
 // with appropriate error and sets `is_type_error` to true iff
 // the error to be raised should be TypeError.
-bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
-                               string* error_msg, bool* is_type_error) {
+bool AssertSameStructureHelper(
+    PyObject* o1, PyObject* o2, bool check_types, string* error_msg,
+    bool* is_type_error,
+    const std::function<int(PyObject*)>& is_sequence_helper) {
   DCHECK(error_msg);
   DCHECK(is_type_error);
-  const bool is_seq1 = IsSequence(o1);
-  const bool is_seq2 = IsSequence(o2);
+  const bool is_seq1 = is_sequence_helper(o1);
+  const bool is_seq2 = is_sequence_helper(o2);
   if (PyErr_Occurred()) return false;
   if (is_seq1 != is_seq2) {
     string seq_str = is_seq1 ? PyObjectToString(o1) : PyObjectToString(o2);
@@ -487,7 +489,9 @@ bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
     return true;
   }
 
-  // Got to scalars, so finished checking. Structures are the same.
+  // Got to objects that are considered non-sequences. Note that in tf.data
+  // use case lists and sparse_tensors are not considered sequences. So finished
+  // checking, structures are the same.
   if (!is_seq1) return true;
 
   if (check_types) {
@@ -586,7 +590,7 @@ bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
         return false;
       }
       bool no_internal_errors = AssertSameStructureHelper(
-          v1, v2, check_types, error_msg, is_type_error);
+          v1, v2, check_types, error_msg, is_type_error, is_sequence_helper);
       Py_LeaveRecursiveCall();
       if (!no_internal_errors) return false;
       if (!error_msg->empty()) return true;
@@ -647,6 +651,7 @@ void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class) {
 }
 
 bool IsSequence(PyObject* o) { return IsSequenceHelper(o) == 1; }
+bool IsMapping(PyObject* o) { return IsMappingHelper(o) == 1; }
 
 PyObject* Flatten(PyObject* nested) {
   PyObject* list = PyList_New(0);
@@ -758,7 +763,32 @@ PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
 PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types) {
   string error_msg;
   bool is_type_error = false;
-  AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error);
+  AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
+                            IsSequenceHelper);
+  if (PyErr_Occurred()) {
+    // Don't hide Python exceptions while checking (e.g. errors fetching keys
+    // from custom mappings).
+    return nullptr;
+  }
+  if (!error_msg.empty()) {
+    PyErr_SetString(
+        is_type_error ? PyExc_TypeError : PyExc_ValueError,
+        tensorflow::strings::StrCat(
+            "The two structures don't have the same nested structure.\n\n",
+            "First structure: ", PyObjectToString(o1), "\n\nSecond structure: ",
+            PyObjectToString(o2), "\n\nMore specifically: ", error_msg)
+            .c_str());
+    return nullptr;
+  }
+  Py_RETURN_NONE;
+}
+
+PyObject* AssertSameStructureForData(PyObject* o1, PyObject* o2,
+                                     bool check_types) {
+  string error_msg;
+  bool is_type_error = false;
+  AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
+                            IsSequenceForDataHelper);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 41dcc969f8..343605285e 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -47,6 +47,15 @@ bool IsSequence(PyObject* o);
 //   True if `instance` is a `namedtuple`.
 PyObject* IsNamedtuple(PyObject* o, bool strict);
 
+// Returns a true if its input is a collections.Mapping.
+//
+// Args:
+//   seq: the input to be checked.
+//
+// Returns:
+//   True if the sequence subclasses mapping.
+bool IsMapping(PyObject* o);
+
 // Implements the same interface as tensorflow.util.nest._same_namedtuples
 // Returns Py_True iff the two namedtuples have the same name and fields.
 // Raises RuntimeError if `o1` or `o2` don't look like namedtuples (don't have
@@ -135,16 +144,20 @@ void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class);
 // 1. It removes support for lists as a level of nesting in nested structures.
 // 2. It adds support for `SparseTensorValue` as an atomic element.
 
-// IsSequence specialized for the data package. Additional comments about
-// difference in functionality can be found in nest.py in tensorflow.data.util
-// and in the comments for Flatten above.
+// IsSequence specialized for `tf.data`. Additional comments about
+// difference in functionality can be found in nest.py in
+// `tensorflow.python.data.util` and in the comments for Flatten above.
 bool IsSequenceForData(PyObject* o);
 
-// IsSequence specialized for the data package. Additional comments about
-// difference in functionality can be found in nest.py in tensorflow.data.util
-// and in the comments for Flatten above.
+// Flatten specialized for `tf.data`. Additional comments about
+// difference in functionality can be found in nest.py in
+// `tensorflow.python.data.util` and in the comments for Flatten above.
 PyObject* FlattenForData(PyObject* nested);
 
+// AssertSameStructure specialized for `tf.data`.
+PyObject* AssertSameStructureForData(PyObject* o1, PyObject* o2,
+                                     bool check_types);
+
 }  // namespace swig
 }  // namespace tensorflow
 
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index 6ad1484295..6d336ac39d 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -37,18 +37,70 @@ limitations under the License.
 %unignore tensorflow::swig::RegisterSparseTensorValueClass;
 %noexception tensorflow::swig::RegisterSparseTensorValueClass;
 
+%feature("docstring") tensorflow::swig::IsSequence
+"""Returns a true if its input is a collections.Sequence (except strings).
+
+Args:
+  seq: an input sequence.
+
+Returns:
+  True if the sequence is a not a string and is a collections.Sequence or a
+  dict.
+"""
 %unignore tensorflow::swig::IsSequence;
 %noexception tensorflow::swig::IsSequence;
 
 %unignore tensorflow::swig::IsNamedtuple;
 %noexception tensorflow::swig::IsNamedtuple;
 
+%feature("docstring") tensorflow::swig::IsMapping
+"""Returns True iff `instance` is a `collections.Mapping`.
+
+Args:
+  instance: An instance of a Python object.
+
+Returns:
+  True if `instance` is a `collections.Mapping`.
+"""
+%unignore tensorflow::swig::IsMapping;
+%noexception tensorflow::swig::IsMapping;
+
+%feature("docstring") tensorflow::swig::SameNamedtuples
+"Returns True if the two namedtuples have the same name and fields."
 %unignore tensorflow::swig::SameNamedtuples;
 %noexception tensorflow::swig::SameNamedtuples;
 
 %unignore tensorflow::swig::AssertSameStructure;
 %noexception tensorflow::swig::AssertSameStructure;
 
+%feature("docstring") tensorflow::swig::Flatten
+"""Returns a flat list from a given nested structure.
+
+If `nest` is not a sequence, tuple, or dict, then returns a single-element
+list: `[nest]`.
+
+In the case of dict instances, the sequence consists of the values, sorted by
+key to ensure deterministic behavior. This is true also for `OrderedDict`
+instances: their sequence order is ignored, the sorting order of keys is
+used instead. The same convention is followed in `pack_sequence_as`. This
+correctly repacks dicts and `OrderedDict`s after they have been flattened,
+and also allows flattening an `OrderedDict` and then repacking it back using
+a corresponding plain dict, or vice-versa.
+Dictionaries with non-sortable keys cannot be flattened.
+
+Users must not modify any collections used in `nest` while this function is
+running.
+
+Args:
+  nest: an arbitrarily nested structure or a scalar object. Note, numpy
+      arrays are considered scalars.
+
+Returns:
+  A Python list, the flattened version of the input.
+
+Raises:
+  TypeError: The nest is or contains a dict with non-sortable keys.
+"""
 %unignore tensorflow::swig::Flatten;
 %noexception tensorflow::swig::Flatten;
 
@@ -58,6 +110,9 @@ limitations under the License.
 %unignore tensorflow::swig::FlattenForData;
 %noexception tensorflow::swig::FlattenForData;
 
+%unignore tensorflow::swig::AssertSameStructureForData;
+%noexception tensorflow::swig::AssertSameStructureForData;
+
 %include "tensorflow/python/util/util.h"
 
 %unignoreall
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 73f05b94db..e30f50ea2a 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -164,8 +164,8 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
 
   VLOG(2) << "could not find compute-capability specific file at: "
           << cc_specific;
-  if (port::FileExists(filename.ToString()).ok()) {
-    *found_filename = filename.ToString();
+  if (port::FileExists(string(filename)).ok()) {
+    *found_filename = string(filename);
     return true;
   }
 
diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h
index 3ef8deb72e..d78bbfd425 100644
--- a/tensorflow/stream_executor/lib/env.h
+++ b/tensorflow/stream_executor/lib/env.h
@@ -32,7 +32,7 @@ inline Status FileExists(const string& filename) {
 }
 
 inline Status FileExists(const port::StringPiece& filename) {
-  return Env::Default()->FileExists(std::string(filename));
+  return Env::Default()->FileExists(string(filename));
 }
 
 }  // namespace port
diff --git a/tensorflow/stream_executor/lib/path.cc b/tensorflow/stream_executor/lib/path.cc
index 58a862206c..3d3da103e1 100644
--- a/tensorflow/stream_executor/lib/path.cc
+++ b/tensorflow/stream_executor/lib/path.cc
@@ -33,7 +33,7 @@ string JoinPathImpl(std::initializer_list<port::StringPiece> paths) {
     if (path.empty()) continue;
 
     if (result.empty()) {
-      result = std::string(path);
+      result = string(path);
       continue;
     }
 
diff --git a/tensorflow/stream_executor/lib/statusor_internals.h b/tensorflow/stream_executor/lib/statusor_internals.h
index 09f88f5825..a159da57a2 100644
--- a/tensorflow/stream_executor/lib/statusor_internals.h
+++ b/tensorflow/stream_executor/lib/statusor_internals.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_INTERNALS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_INTERNALS_H_
 
-
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h
index b02fe4f56f..e77dfcef76 100644
--- a/tensorflow/stream_executor/lib/str_util.h
+++ b/tensorflow/stream_executor/lib/str_util.h
@@ -31,7 +31,7 @@ inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix)
   if (tensorflow::str_util::EndsWith(str, suffix)) {
     str.remove_suffix(suffix.size());
   }
-  return std::string(str);
+  return string(str);
 }
 
 using tensorflow::str_util::Lowercase;
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 9efd34de24..19d3b2389a 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -1959,7 +1959,9 @@ Stream *Stream::GetOrCreateSubStream() {
                             false);
   Stream *sub_stream = sub_streams_.back().first.get();
   sub_stream->Init();
-  CHECK(ok_) << "sub-stream failed to be initialized";
+  if (!sub_stream->ok_) {
+    LOG(ERROR) << "sub-stream failed to be initialized";
+  }
   VLOG(1) << DebugStreamPointers() << " created new sub_stream "
           << sub_stream->DebugStreamPointers();
 
@@ -5285,12 +5287,11 @@ Stream &Stream::ThenTransformTensor(const dnn::BatchDescriptor &input_desc,
 Stream &Stream::ThenDoHostCallback(std::function<void()> callback) {
   VLOG_CALL(PARAM(callback));
 
-  if (ok()) {
-    CheckError(parent_->HostCallback(this, callback));
-  } else {
+  if (!ok()) {
     LOG(INFO) << DebugStreamPointers()
               << " was in error state before adding host callback";
   }
+  CheckError(parent_->HostCallback(this, std::move(callback)));
   return *this;
 }
 
@@ -5298,12 +5299,11 @@ Stream &Stream::ThenDoHostCallbackWithStatus(
     std::function<port::Status()> callback) {
   VLOG_CALL(PARAM(callback));
 
-  if (ok()) {
-    CheckError(parent_->HostCallback(this, std::move(callback)));
-  } else {
-    LOG(WARNING) << "stream " << DebugStreamPointers()
-                 << " was in error state before adding host callback";
+  if (!ok()) {
+    LOG(INFO) << DebugStreamPointers()
+              << " was in error state before adding host callback";
   }
+  CheckError(parent_->HostCallback(this, std::move(callback)));
   return *this;
 }
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index fc1f9e956f..6d6e8941c5 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -4,12 +4,12 @@
 # Uses the ":optmode" config_setting to pick the options.
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
-    "tf_cuda_tests_tags",
-    "tf_sycl_tests_tags",
+    "if_dynamic_kernels",
+    "if_static",
     "tf_additional_grpc_deps_py",
     "tf_additional_xla_deps_py",
-    "if_static",
-    "if_dynamic_kernels",
+    "tf_cuda_tests_tags",
+    "tf_sycl_tests_tags",
 )
 load(
     "@local_config_tensorrt//:build_defs.bzl",
@@ -17,17 +17,23 @@ load(
 )
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
-    "if_cuda",
     "cuda_default_copts",
+    "if_cuda",
 )
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
-    "if_mkl_lnx_x64"
+    "if_mkl_lnx_x64",
+    "if_mkl_ml",
+    "mkl_deps",
 )
 load(
     "//third_party/mkl_dnn:build_defs.bzl",
-    "if_mkl_open_source_only"
+    "if_mkl_open_source_only",
+)
+load(
+    "//third_party/ngraph:build_defs.bzl",
+    "if_ngraph",
 )
 def register_extension_info(**kwargs):
     pass
@@ -36,153 +42,154 @@ def register_extension_info(**kwargs):
 # i.e. "common_runtime/direct_session_test.cc" becomes
 #      "common_runtime_direct_session_test"
 def src_to_test_name(src):
-  return src.replace("/", "_").split(".")[0]
+    return src.replace("/", "_").split(".")[0]
 
 def full_path(relative_paths):
-  return [native.package_name() + "/" + relative for relative in relative_paths]
+    return [native.package_name() + "/" + relative for relative in relative_paths]
 
 def _add_tfcore_prefix(src):
-  if src.startswith("//"):
-    return src
-  return "//tensorflow/core:" + src
+    if src.startswith("//"):
+        return src
+    return "//tensorflow/core:" + src
 
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
-  return [
-      _add_tfcore_prefix(p) for p in core_proto_sources_relative
-  ]
+    return [
+        _add_tfcore_prefix(p)
+        for p in core_proto_sources_relative
+    ]
 
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
 def tf_android_core_proto_headers(core_proto_sources_relative):
-  return ([
-      _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h")
-      for p in core_proto_sources_relative
-  ] + [
-      _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h")
-      for p in core_proto_sources_relative
-  ])
+    return ([
+        _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h")
+        for p in core_proto_sources_relative
+    ] + [
+        _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h")
+        for p in core_proto_sources_relative
+    ])
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
 def clean_dep(dep):
-  return str(Label(dep))
+    return str(Label(dep))
 
 def if_android_x86(a):
-  return select({
-      clean_dep("//tensorflow:android_x86"): a,
-      clean_dep("//tensorflow:android_x86_64"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android_x86"): a,
+        clean_dep("//tensorflow:android_x86_64"): a,
+        "//conditions:default": [],
+    })
 
 def if_android_arm(a):
-  return select({
-      clean_dep("//tensorflow:android_arm"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android_arm"): a,
+        "//conditions:default": [],
+    })
 
 def if_android_arm64(a):
-  return select({
-      clean_dep("//tensorflow:android_arm64"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android_arm64"): a,
+        "//conditions:default": [],
+    })
 
 def if_android_mips(a):
-  return select({
-      clean_dep("//tensorflow:android_mips"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android_mips"): a,
+        "//conditions:default": [],
+    })
 
 def if_not_android(a):
-  return select({
-      clean_dep("//tensorflow:android"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:android"): [],
+        "//conditions:default": a,
+    })
 
 def if_not_android_mips_and_mips64(a):
-  return select({
-      clean_dep("//tensorflow:android_mips"): [],
-      clean_dep("//tensorflow:android_mips64"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:android_mips"): [],
+        clean_dep("//tensorflow:android_mips64"): [],
+        "//conditions:default": a,
+    })
 
 def if_android(a):
-  return select({
-      clean_dep("//tensorflow:android"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android"): a,
+        "//conditions:default": [],
+    })
 
 def if_ios(a):
-  return select({
-      clean_dep("//tensorflow:ios"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:ios"): a,
+        "//conditions:default": [],
+    })
 
 def if_ios_x86_64(a):
-  return select({
-      clean_dep("//tensorflow:ios_x86_64"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:ios_x86_64"): a,
+        "//conditions:default": [],
+    })
 
 def if_mobile(a):
-  return select({
-      clean_dep("//tensorflow:android"): a,
-      clean_dep("//tensorflow:ios"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:android"): a,
+        clean_dep("//tensorflow:ios"): a,
+        "//conditions:default": [],
+    })
 
 def if_not_mobile(a):
-  return select({
-      clean_dep("//tensorflow:android"): [],
-      clean_dep("//tensorflow:ios"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:android"): [],
+        clean_dep("//tensorflow:ios"): [],
+        "//conditions:default": a,
+    })
 
 # Config setting selector used when building for products
 # which requires restricted licenses to be avoided.
 def if_not_lgpl_restricted(a):
-  _ = (a,)
-  return select({
-      "//conditions:default": [],
-  })
+    _ = (a,)
+    return select({
+        "//conditions:default": [],
+    })
 
 def if_not_windows(a):
-  return select({
-      clean_dep("//tensorflow:windows"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:windows"): [],
+        "//conditions:default": a,
+    })
 
 def if_windows(a):
-  return select({
-      clean_dep("//tensorflow:windows"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:windows"): a,
+        "//conditions:default": [],
+    })
 
 def if_not_windows_cuda(a):
-  return select({
-      clean_dep("//tensorflow:with_cuda_support_windows_override"): [],
-      "//conditions:default": a,
-  })
+    return select({
+        clean_dep("//tensorflow:with_cuda_support_windows_override"): [],
+        "//conditions:default": a,
+    })
 
 def if_linux_x86_64(a):
-  return select({
-      clean_dep("//tensorflow:linux_x86_64"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:linux_x86_64"): a,
+        "//conditions:default": [],
+    })
 
 def if_darwin(a):
-  return select({
-      clean_dep("//tensorflow:darwin"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:darwin"): a,
+        "//conditions:default": [],
+    })
 
 def if_override_eigen_strong_inline(a):
-  return select({
-      clean_dep("//tensorflow:override_eigen_strong_inline"): a,
-      "//conditions:default": [],
-  })
+    return select({
+        clean_dep("//tensorflow:override_eigen_strong_inline"): a,
+        "//conditions:default": [],
+    })
 
-def get_win_copts(is_external=False):
+def get_win_copts(is_external = False):
     WINDOWS_COPTS = [
         "/DPLATFORM_WINDOWS",
         "/DEIGEN_HAS_C99_MATH",
@@ -200,161 +207,170 @@ def get_win_copts(is_external=False):
         "/DNOGDI",
     ]
     if is_external:
-      return WINDOWS_COPTS + ["/UTF_COMPILE_LIBRARY"]
+        return WINDOWS_COPTS + ["/UTF_COMPILE_LIBRARY"]
     else:
-      return WINDOWS_COPTS + ["/DTF_COMPILE_LIBRARY"]
+        return WINDOWS_COPTS + ["/DTF_COMPILE_LIBRARY"]
 
 # LINT.IfChange
-def tf_copts(android_optimization_level_override="-O2", is_external=False):
-  # For compatibility reasons, android_optimization_level_override
-  # is currently only being set for Android.
-  # To clear this value, and allow the CROSSTOOL default
-  # to be used, pass android_optimization_level_override=None
-  android_copts = [
-      "-std=c++11",
-      "-DTF_LEAN_BINARY",
-      "-Wno-narrowing",
-      "-fomit-frame-pointer",
-  ]
-  if android_optimization_level_override:
-    android_copts.append(android_optimization_level_override)
-  return (
-      if_not_windows([
-          "-DEIGEN_AVOID_STL_ARRAY",
-          "-Iexternal/gemmlowp",
-          "-Wno-sign-compare",
-          "-fno-exceptions",
-          "-ftemplate-depth=900"])
-      + if_cuda(["-DGOOGLE_CUDA=1"])
-      + if_tensorrt(["-DGOOGLE_TENSORRT=1"])
-      + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"])
-      + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"])
-      + if_mkl_lnx_x64(["-fopenmp"])
-      + if_android_arm(["-mfpu=neon"])
-      + if_linux_x86_64(["-msse3"])
-      + if_ios_x86_64(["-msse4.1"])
-      + select({
+def tf_copts(android_optimization_level_override = "-O2", is_external = False):
+    # For compatibility reasons, android_optimization_level_override
+    # is currently only being set for Android.
+    # To clear this value, and allow the CROSSTOOL default
+    # to be used, pass android_optimization_level_override=None
+    android_copts = [
+        "-std=c++11",
+        "-DTF_LEAN_BINARY",
+        "-Wno-narrowing",
+        "-fomit-frame-pointer",
+    ]
+    if android_optimization_level_override:
+        android_copts.append(android_optimization_level_override)
+    return (
+        if_not_windows([
+            "-DEIGEN_AVOID_STL_ARRAY",
+            "-Iexternal/gemmlowp",
+            "-Wno-sign-compare",
+            "-fno-exceptions",
+            "-ftemplate-depth=900",
+        ]) +
+        if_cuda(["-DGOOGLE_CUDA=1"]) +
+        if_tensorrt(["-DGOOGLE_TENSORRT=1"]) +
+        if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"]) +
+        if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) +
+        if_ngraph(["-DINTEL_NGRAPH=1"]) +
+        if_mkl_lnx_x64(["-fopenmp"]) +
+        if_android_arm(["-mfpu=neon"]) +
+        if_linux_x86_64(["-msse3"]) +
+        if_ios_x86_64(["-msse4.1"]) +
+        select({
             clean_dep("//tensorflow:framework_shared_object"): [],
             "//conditions:default": ["-DTENSORFLOW_MONOLITHIC_BUILD"],
-      })
-      + select({
+        }) +
+        select({
             clean_dep("//tensorflow:android"): android_copts,
             clean_dep("//tensorflow:darwin"): [],
             clean_dep("//tensorflow:windows"): get_win_copts(is_external),
             clean_dep("//tensorflow:ios"): ["-std=c++11"],
             clean_dep("//tensorflow:no_lgpl_deps"): ["-D__TENSORFLOW_NO_LGPL_DEPS__", "-pthread"],
-            "//conditions:default": ["-pthread"]
-      }))
-
+            "//conditions:default": ["-pthread"],
+        })
+    )
 
 def tfe_xla_copts():
-  return select({
-      "//tensorflow:with_xla_support": ["-DTENSORFLOW_EAGER_USE_XLA"],
-      "//conditions:default": [],
-  })
+    return select({
+        "//tensorflow:with_xla_support": ["-DTENSORFLOW_EAGER_USE_XLA"],
+        "//conditions:default": [],
+    })
 
 def tf_opts_nortti_if_android():
-  return if_android([
-      "-fno-rtti",
-      "-DGOOGLE_PROTOBUF_NO_RTTI",
-      "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
-  ])
+    return if_android([
+        "-fno-rtti",
+        "-DGOOGLE_PROTOBUF_NO_RTTI",
+        "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
+    ])
 
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
 def tf_features_nomodules_if_android():
-  return if_android(["-use_header_modules"])
+    return if_android(["-use_header_modules"])
 
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
-def tf_gen_op_libs(op_lib_names, deps=None, is_external=True):
-  # Make library out of each op so it can also be used to generate wrappers
-  # for various languages.
-  if not deps:
-    deps = []
-  for n in op_lib_names:
-    native.cc_library(
-        name=n + "_op_lib",
-        copts=tf_copts(is_external=is_external),
-        srcs=["ops/" + n + ".cc"],
-        deps=deps + [clean_dep("//tensorflow/core:framework")],
-        visibility=["//visibility:public"],
-        alwayslink=1,
-        linkstatic=1,)
+def tf_gen_op_libs(op_lib_names, deps = None, is_external = True):
+    # Make library out of each op so it can also be used to generate wrappers
+    # for various languages.
+    if not deps:
+        deps = []
+    for n in op_lib_names:
+        native.cc_library(
+            name = n + "_op_lib",
+            copts = tf_copts(is_external = is_external),
+            srcs = ["ops/" + n + ".cc"],
+            deps = deps + [clean_dep("//tensorflow/core:framework")],
+            visibility = ["//visibility:public"],
+            alwayslink = 1,
+            linkstatic = 1,
+        )
 
 def _make_search_paths(prefix, levels_to_root):
-  return ",".join(
-      ["-rpath,%s/%s" % (prefix, "/".join([".."] * search_level))
-       for search_level in range(levels_to_root + 1)])
+    return ",".join(
+        [
+            "-rpath,%s/%s" % (prefix, "/".join([".."] * search_level))
+            for search_level in range(levels_to_root + 1)
+        ],
+    )
 
 def _rpath_linkopts(name):
-  # Search parent directories up to the TensorFlow root directory for shared
-  # object dependencies, even if this op shared object is deeply nested
-  # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then
-  # the root and tensorflow/libtensorflow_framework.so should exist when
-  # deployed. Other shared object dependencies (e.g. shared between contrib/
-  # ops) are picked up as long as they are in either the same or a parent
-  # directory in the tensorflow/ tree.
-  levels_to_root = native.package_name().count("/") + name.count("/")
-  return select({
-      clean_dep("//tensorflow:darwin"): [
-          "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
-      ],
-      clean_dep("//tensorflow:windows"): [],
-      "//conditions:default": [
-          "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),),
-      ],
-  })
+    # Search parent directories up to the TensorFlow root directory for shared
+    # object dependencies, even if this op shared object is deeply nested
+    # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then
+    # the root and tensorflow/libtensorflow_framework.so should exist when
+    # deployed. Other shared object dependencies (e.g. shared between contrib/
+    # ops) are picked up as long as they are in either the same or a parent
+    # directory in the tensorflow/ tree.
+    levels_to_root = native.package_name().count("/") + name.count("/")
+    return select({
+        clean_dep("//tensorflow:darwin"): [
+            "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
+        ],
+        clean_dep("//tensorflow:windows"): [],
+        "//conditions:default": [
+            "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),),
+        ],
+    })
 
 # Bazel-generated shared objects which must be linked into TensorFlow binaries
 # to define symbols from //tensorflow/core:framework and //tensorflow/core:lib.
 def tf_binary_additional_srcs():
-  return if_static(
-      extra_deps=[],
-      otherwise=[
-          clean_dep("//tensorflow:libtensorflow_framework.so"),
-      ])
-
+    return if_static(
+        extra_deps = [],
+        otherwise = [
+            clean_dep("//tensorflow:libtensorflow_framework.so"),
+        ],
+    )
 
 # Helper functions to add kernel dependencies to tf binaries when using dynamic
 # kernel linking.
 def tf_binary_dynamic_kernel_dsos(kernels):
-  return if_dynamic_kernels(
-      extra_deps=["libtfkernel_%s.so" % clean_dep(k) for k in kernels],
-      otherwise=[])
+    return if_dynamic_kernels(
+        extra_deps = ["libtfkernel_%s.so" % clean_dep(k) for k in kernels],
+        otherwise = [],
+    )
 
 # Helper functions to add kernel dependencies to tf binaries when using static
 # kernel linking.
 def tf_binary_dynamic_kernel_deps(kernels):
-  return if_dynamic_kernels(
-      extra_deps=[],
-      otherwise=kernels)
+    return if_dynamic_kernels(
+        extra_deps = [],
+        otherwise = kernels,
+    )
 
 def tf_cc_shared_object(
-    name,
-    srcs=[],
-    deps=[],
-    data=[],
-    linkopts=[],
-    framework_so=tf_binary_additional_srcs(),
-    kernels=[],
-    **kwargs):
-  native.cc_binary(
-      name=name,
-      srcs=srcs + framework_so,
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels),
-      linkshared = 1,
-      data = data + tf_binary_dynamic_kernel_dsos(kernels),
-      linkopts=linkopts + _rpath_linkopts(name) + select({
-          clean_dep("//tensorflow:darwin"): [
-              "-Wl,-install_name,@rpath/" + name.split("/")[-1],
-          ],
-          clean_dep("//tensorflow:windows"): [],
-          "//conditions:default": [
-              "-Wl,-soname," + name.split("/")[-1],
-          ],
-      }),
-      **kwargs)
+        name,
+        srcs = [],
+        deps = [],
+        data = [],
+        linkopts = [],
+        framework_so = tf_binary_additional_srcs(),
+        kernels = [],
+        **kwargs):
+    native.cc_binary(
+        name = name,
+        srcs = srcs + framework_so,
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels),
+        linkshared = 1,
+        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        linkopts = linkopts + _rpath_linkopts(name) + select({
+            clean_dep("//tensorflow:darwin"): [
+                "-Wl,-install_name,@rpath/" + name.split("/")[-1],
+            ],
+            clean_dep("//tensorflow:windows"): [],
+            "//conditions:default": [
+                "-Wl,-soname," + name.split("/")[-1],
+            ],
+        }),
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_cc_shared_object",
@@ -365,26 +381,28 @@ register_extension_info(
 # (//third_party/tensorflow:libtensorflow_framework.so) when not building
 # statically. Also adds linker options (rpaths) so that the framework shared
 # object can be found.
-def tf_cc_binary(name,
-                 srcs=[],
-                 deps=[],
-                 data=[],
-                 linkopts=[],
-                 copts=tf_copts(),
-                 kernels=[],
-                 **kwargs):
-  native.cc_binary(
-      name=name,
-      copts=copts,
-      srcs=srcs + tf_binary_additional_srcs(),
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
-          [
-              "//third_party/mkl:intel_binary_blob",
-          ],
-      ),
-      data=data +  tf_binary_dynamic_kernel_dsos(kernels),
-      linkopts=linkopts + _rpath_linkopts(name),
-      **kwargs)
+def tf_cc_binary(
+        name,
+        srcs = [],
+        deps = [],
+        data = [],
+        linkopts = [],
+        copts = tf_copts(),
+        kernels = [],
+        **kwargs):
+    native.cc_binary(
+        name = name,
+        copts = copts,
+        srcs = srcs + tf_binary_additional_srcs(),
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml(
+            [
+                "//third_party/mkl:intel_binary_blob",
+            ],
+        ),
+        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        linkopts = linkopts + _rpath_linkopts(name),
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_cc_binary",
@@ -394,64 +412,72 @@ register_extension_info(
 # A simple wrap around native.cc_binary rule.
 # When using this rule, you should realize it doesn't link to any tensorflow
 # dependencies by default.
-def tf_native_cc_binary(name,
-                        copts=tf_copts(),
-                        **kwargs):
-  native.cc_binary(
-      name=name,
-      copts=copts,
-      **kwargs)
+def tf_native_cc_binary(
+        name,
+        copts = tf_copts(),
+        **kwargs):
+    native.cc_binary(
+        name = name,
+        copts = copts,
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_native_cc_binary",
     label_regex_for_dep = "{extension_name}.*",
 )
 
-def tf_gen_op_wrapper_cc(name,
-                         out_ops_file,
-                         pkg="",
-                         op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
-                         deps=None,
-                         include_internal_ops=0,
-                         # ApiDefs will be loaded in the order specified in this list.
-                         api_def_srcs=[]):
-  # Construct an op generator binary for these ops.
-  tool = out_ops_file + "_gen_cc"
-  if deps == None:
-    deps = [pkg + ":" + name + "_op_lib"]
-  tf_cc_binary(
-      name=tool,
-      copts=tf_copts(),
-      linkopts=if_not_windows(["-lm"]),
-      linkstatic=1,  # Faster to link this one-time-use binary dynamically
-      deps=[op_gen] + deps)
-
-  srcs = api_def_srcs[:]
-
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          " $$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
-
-  native.genrule(
-      name=name + "_genrule",
-      outs=[
-          out_ops_file + ".h", out_ops_file + ".cc",
-          out_ops_file + "_internal.h", out_ops_file + "_internal.cc"
-      ],
-      srcs=srcs,
-      tools=[":" + tool] + tf_binary_additional_srcs(),
-      cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
-           "$(location :" + out_ops_file + ".cc) " +
-           str(include_internal_ops) + " " + api_def_args_str))
+def tf_gen_op_wrapper_cc(
+        name,
+        out_ops_file,
+        pkg = "",
+        op_gen = clean_dep("//tensorflow/cc:cc_op_gen_main"),
+        deps = None,
+        include_internal_ops = 0,
+        # ApiDefs will be loaded in the order specified in this list.
+        api_def_srcs = []):
+    # Construct an op generator binary for these ops.
+    tool = out_ops_file + "_gen_cc"
+    if deps == None:
+        deps = [pkg + ":" + name + "_op_lib"]
+    tf_cc_binary(
+        name = tool,
+        copts = tf_copts(),
+        linkopts = if_not_windows(["-lm"]),
+        linkstatic = 1,  # Faster to link this one-time-use binary dynamically
+        deps = [op_gen] + deps,
+    )
+
+    srcs = api_def_srcs[:]
+
+    if not api_def_srcs:
+        api_def_args_str = ","
+    else:
+        api_def_args = []
+        for api_def_src in api_def_srcs:
+            # Add directory of the first ApiDef source to args.
+            # We are assuming all ApiDefs in a single api_def_src are in the
+            # same directory.
+            api_def_args.append(
+                " $$(dirname $$(echo $(locations " + api_def_src +
+                ") | cut -d\" \" -f1))",
+            )
+        api_def_args_str = ",".join(api_def_args)
+
+    native.genrule(
+        name = name + "_genrule",
+        outs = [
+            out_ops_file + ".h",
+            out_ops_file + ".cc",
+            out_ops_file + "_internal.h",
+            out_ops_file + "_internal.cc",
+        ],
+        srcs = srcs,
+        tools = [":" + tool] + tf_binary_additional_srcs(),
+        cmd = ("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
+               "$(location :" + out_ops_file + ".cc) " +
+               str(include_internal_ops) + " " + api_def_args_str),
+    )
 
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate individual C++ .cc and .h
@@ -480,68 +506,72 @@ def tf_gen_op_wrapper_cc(name,
 #                     "ops/math_ops_internal.h" ],
 #            deps = [ ... ])
 # TODO(joshl): Cleaner approach for hidden ops.
-def tf_gen_op_wrappers_cc(name,
-                          op_lib_names=[],
-                          other_srcs=[],
-                          other_hdrs=[],
-                          pkg="",
-                          deps=[
-                              clean_dep("//tensorflow/cc:ops"),
-                              clean_dep("//tensorflow/cc:scope"),
-                              clean_dep("//tensorflow/cc:const_op"),
-                          ],
-                          op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
-                          include_internal_ops=0,
-                          visibility=None,
-                          # ApiDefs will be loaded in the order apecified in this list.
-                          api_def_srcs=[]):
-  subsrcs = other_srcs[:]
-  subhdrs = other_hdrs[:]
-  internalsrcs = []
-  internalhdrs = []
-  for n in op_lib_names:
-    tf_gen_op_wrapper_cc(
-        n,
-        "ops/" + n,
-        pkg=pkg,
-        op_gen=op_gen,
-        include_internal_ops=include_internal_ops,
-        api_def_srcs=api_def_srcs)
-    subsrcs += ["ops/" + n + ".cc"]
-    subhdrs += ["ops/" + n + ".h"]
-    internalsrcs += ["ops/" + n + "_internal.cc"]
-    internalhdrs += ["ops/" + n + "_internal.h"]
-
-  native.cc_library(
-      name=name,
-      srcs=subsrcs,
-      hdrs=subhdrs,
-      deps=deps + if_not_android([
-          clean_dep("//tensorflow/core:core_cpu"),
-          clean_dep("//tensorflow/core:framework"),
-          clean_dep("//tensorflow/core:lib"),
-          clean_dep("//tensorflow/core:protos_all_cc"),
-      ]) + if_android([
-          clean_dep("//tensorflow/core:android_tensorflow_lib"),
-      ]),
-      copts=tf_copts(),
-      alwayslink=1,
-      visibility=visibility)
-  native.cc_library(
-      name=name + "_internal",
-      srcs=internalsrcs,
-      hdrs=internalhdrs,
-      deps=deps + if_not_android([
-          clean_dep("//tensorflow/core:core_cpu"),
-          clean_dep("//tensorflow/core:framework"),
-          clean_dep("//tensorflow/core:lib"),
-          clean_dep("//tensorflow/core:protos_all_cc"),
-      ]) + if_android([
-          clean_dep("//tensorflow/core:android_tensorflow_lib"),
-      ]),
-      copts=tf_copts(),
-      alwayslink=1,
-      visibility=[clean_dep("//tensorflow:internal")])
+def tf_gen_op_wrappers_cc(
+        name,
+        op_lib_names = [],
+        other_srcs = [],
+        other_hdrs = [],
+        pkg = "",
+        deps = [
+            clean_dep("//tensorflow/cc:ops"),
+            clean_dep("//tensorflow/cc:scope"),
+            clean_dep("//tensorflow/cc:const_op"),
+        ],
+        op_gen = clean_dep("//tensorflow/cc:cc_op_gen_main"),
+        include_internal_ops = 0,
+        visibility = None,
+        # ApiDefs will be loaded in the order apecified in this list.
+        api_def_srcs = []):
+    subsrcs = other_srcs[:]
+    subhdrs = other_hdrs[:]
+    internalsrcs = []
+    internalhdrs = []
+    for n in op_lib_names:
+        tf_gen_op_wrapper_cc(
+            n,
+            "ops/" + n,
+            pkg = pkg,
+            op_gen = op_gen,
+            include_internal_ops = include_internal_ops,
+            api_def_srcs = api_def_srcs,
+        )
+        subsrcs += ["ops/" + n + ".cc"]
+        subhdrs += ["ops/" + n + ".h"]
+        internalsrcs += ["ops/" + n + "_internal.cc"]
+        internalhdrs += ["ops/" + n + "_internal.h"]
+
+    native.cc_library(
+        name = name,
+        srcs = subsrcs,
+        hdrs = subhdrs,
+        deps = deps + if_not_android([
+            clean_dep("//tensorflow/core:core_cpu"),
+            clean_dep("//tensorflow/core:framework"),
+            clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:protos_all_cc"),
+        ]) + if_android([
+            clean_dep("//tensorflow/core:android_tensorflow_lib"),
+        ]),
+        copts = tf_copts(),
+        alwayslink = 1,
+        visibility = visibility,
+    )
+    native.cc_library(
+        name = name + "_internal",
+        srcs = internalsrcs,
+        hdrs = internalhdrs,
+        deps = deps + if_not_android([
+            clean_dep("//tensorflow/core:core_cpu"),
+            clean_dep("//tensorflow/core:framework"),
+            clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:protos_all_cc"),
+        ]) + if_android([
+            clean_dep("//tensorflow/core:android_tensorflow_lib"),
+        ]),
+        copts = tf_copts(),
+        alwayslink = 1,
+        visibility = [clean_dep("//tensorflow:internal")],
+    )
 
 # Generates a Python library target wrapping the ops registered in "deps".
 #
@@ -567,96 +597,102 @@ def tf_gen_op_wrappers_cc(name,
 #     is invalid to specify both "hidden" and "op_whitelist".
 #   cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the
 #     specified ops.
-def tf_gen_op_wrapper_py(name,
-                         out=None,
-                         hidden=None,
-                         visibility=None,
-                         deps=[],
-                         require_shape_functions=False,
-                         hidden_file=None,
-                         generated_target_name=None,
-                         op_whitelist=[],
-                         cc_linkopts=[],
-                         api_def_srcs=[]):
-  if (hidden or hidden_file) and op_whitelist:
-    fail('Cannot pass specify both hidden and op_whitelist.')
-
-  # Construct a cc_binary containing the specified ops.
-  tool_name = "gen_" + name + "_py_wrappers_cc"
-  if not deps:
-    deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
-  tf_cc_binary(
-      name=tool_name,
-      linkopts=if_not_windows(["-lm"]) + cc_linkopts,
-      copts=tf_copts(),
-      linkstatic=1,  # Faster to link this one-time-use binary dynamically
-      deps=([
-          clean_dep("//tensorflow/core:framework"),
-          clean_dep("//tensorflow/python:python_op_gen_main")
-      ] + deps),
-      visibility=[clean_dep("//tensorflow:internal")],)
-
-  # Invoke the previous cc_binary to generate a python file.
-  if not out:
-    out = "ops/gen_" + name + ".py"
-
-  if hidden:
-    op_list_arg = ",".join(hidden)
-    op_list_is_whitelist = False
-  elif op_whitelist:
-    op_list_arg = ",".join(op_whitelist)
-    op_list_is_whitelist = True
-  else:
-    op_list_arg = "''"
-    op_list_is_whitelist = False
-
-  # Prepare ApiDef directories to pass to the genrule.
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          "$$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
-
-  if hidden_file:
-    # `hidden_file` is file containing a list of op names to be hidden in the
-    # generated module.
-    native.genrule(
-        name=name + "_pygenrule",
-        outs=[out],
-        srcs=api_def_srcs + [hidden_file],
-        tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") " + api_def_args_str +
-             " @$(location " + hidden_file + ") " +
-             ("1" if require_shape_functions else "0") + " > $@"))
-  else:
-    native.genrule(
-        name=name + "_pygenrule",
-        outs=[out],
-        srcs=api_def_srcs,
-        tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") " + api_def_args_str + " " +
-             op_list_arg + " " +
-             ("1" if require_shape_functions else "0") + " " +
-             ("1" if op_list_is_whitelist else "0") + " > $@"))
-
-  # Make a py_library out of the generated python file.
-  if not generated_target_name:
-    generated_target_name = name
-  native.py_library(
-      name=generated_target_name,
-      srcs=[out],
-      srcs_version="PY2AND3",
-      visibility=visibility,
-      deps=[
-          clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
-      ],)
+def tf_gen_op_wrapper_py(
+        name,
+        out = None,
+        hidden = None,
+        visibility = None,
+        deps = [],
+        require_shape_functions = False,
+        hidden_file = None,
+        generated_target_name = None,
+        op_whitelist = [],
+        cc_linkopts = [],
+        api_def_srcs = []):
+    if (hidden or hidden_file) and op_whitelist:
+        fail("Cannot pass specify both hidden and op_whitelist.")
+
+    # Construct a cc_binary containing the specified ops.
+    tool_name = "gen_" + name + "_py_wrappers_cc"
+    if not deps:
+        deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
+    tf_cc_binary(
+        name = tool_name,
+        linkopts = if_not_windows(["-lm"]) + cc_linkopts,
+        copts = tf_copts(),
+        linkstatic = 1,  # Faster to link this one-time-use binary dynamically
+        deps = ([
+            clean_dep("//tensorflow/core:framework"),
+            clean_dep("//tensorflow/python:python_op_gen_main"),
+        ] + deps),
+        visibility = [clean_dep("//tensorflow:internal")],
+    )
+
+    # Invoke the previous cc_binary to generate a python file.
+    if not out:
+        out = "ops/gen_" + name + ".py"
+
+    if hidden:
+        op_list_arg = ",".join(hidden)
+        op_list_is_whitelist = False
+    elif op_whitelist:
+        op_list_arg = ",".join(op_whitelist)
+        op_list_is_whitelist = True
+    else:
+        op_list_arg = "''"
+        op_list_is_whitelist = False
+
+    # Prepare ApiDef directories to pass to the genrule.
+    if not api_def_srcs:
+        api_def_args_str = ","
+    else:
+        api_def_args = []
+        for api_def_src in api_def_srcs:
+            # Add directory of the first ApiDef source to args.
+            # We are assuming all ApiDefs in a single api_def_src are in the
+            # same directory.
+            api_def_args.append(
+                "$$(dirname $$(echo $(locations " + api_def_src +
+                ") | cut -d\" \" -f1))",
+            )
+        api_def_args_str = ",".join(api_def_args)
+
+    if hidden_file:
+        # `hidden_file` is file containing a list of op names to be hidden in the
+        # generated module.
+        native.genrule(
+            name = name + "_pygenrule",
+            outs = [out],
+            srcs = api_def_srcs + [hidden_file],
+            tools = [tool_name] + tf_binary_additional_srcs(),
+            cmd = ("$(location " + tool_name + ") " + api_def_args_str +
+                   " @$(location " + hidden_file + ") " +
+                   ("1" if require_shape_functions else "0") + " > $@"),
+        )
+    else:
+        native.genrule(
+            name = name + "_pygenrule",
+            outs = [out],
+            srcs = api_def_srcs,
+            tools = [tool_name] + tf_binary_additional_srcs(),
+            cmd = ("$(location " + tool_name + ") " + api_def_args_str + " " +
+                   op_list_arg + " " +
+                   ("1" if require_shape_functions else "0") + " " +
+                   ("1" if op_list_is_whitelist else "0") + " > $@"),
+        )
+
+    # Make a py_library out of the generated python file.
+    if not generated_target_name:
+        generated_target_name = name
+    native.py_library(
+        name = generated_target_name,
+        srcs = [out],
+        srcs_version = "PY2AND3",
+        visibility = visibility,
+        deps = [
+            clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
+        ],
+    )
 
 # Define a bazel macro that creates cc_test for tensorflow.
 #
@@ -667,52 +703,54 @@ def tf_gen_op_wrapper_py(name,
 #
 # TODO(opensource): we need to enable this to work around the hidden symbol
 # __cudaRegisterFatBinary error. Need more investigations.
-def tf_cc_test(name,
-               srcs,
-               deps,
-               data=[],
-               linkstatic=0,
-               extra_copts=[],
-               suffix="",
-               linkopts=[],
-               nocopts=None,
-               kernels=[],
-               **kwargs):
-  native.cc_test(
-      name="%s%s" % (name, suffix),
-      srcs=srcs + tf_binary_additional_srcs(),
-      copts=tf_copts() + extra_copts,
-      linkopts=select({
-        clean_dep("//tensorflow:android"): [
-            "-pie",
-        ],
-        clean_dep("//tensorflow:windows"): [],
-        clean_dep("//tensorflow:darwin"): [
-            "-lm",
-        ],
-        "//conditions:default": [
-            "-lpthread",
-            "-lm"
-        ],
-      }) + linkopts + _rpath_linkopts(name),
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
-          [
-              "//third_party/mkl:intel_binary_blob",
-          ],
-      ),
-      data=data + tf_binary_dynamic_kernel_dsos(kernels),
-      # Nested select() statements seem not to be supported when passed to
-      # linkstatic, and we already have a cuda select() passed in to this
-      # function.
-      linkstatic=linkstatic or select({
-          # cc_tests with ".so"s in srcs incorrectly link on Darwin unless
-          # linkstatic=1 (https://github.com/bazelbuild/bazel/issues/3450).
-          # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-          clean_dep("//tensorflow:darwin"): 1,
-          "//conditions:default": 0,
-      }),
-      nocopts=nocopts,
-      **kwargs)
+def tf_cc_test(
+        name,
+        srcs,
+        deps,
+        data = [],
+        linkstatic = 0,
+        extra_copts = [],
+        suffix = "",
+        linkopts = [],
+        nocopts = None,
+        kernels = [],
+        **kwargs):
+    native.cc_test(
+        name = "%s%s" % (name, suffix),
+        srcs = srcs + tf_binary_additional_srcs(),
+        copts = tf_copts() + extra_copts,
+        linkopts = select({
+            clean_dep("//tensorflow:android"): [
+                "-pie",
+            ],
+            clean_dep("//tensorflow:windows"): [],
+            clean_dep("//tensorflow:darwin"): [
+                "-lm",
+            ],
+            "//conditions:default": [
+                "-lpthread",
+                "-lm",
+            ],
+        }) + linkopts + _rpath_linkopts(name),
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml(
+            [
+                "//third_party/mkl:intel_binary_blob",
+            ],
+        ),
+        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        # Nested select() statements seem not to be supported when passed to
+        # linkstatic, and we already have a cuda select() passed in to this
+        # function.
+        linkstatic = linkstatic or select({
+            # cc_tests with ".so"s in srcs incorrectly link on Darwin unless
+            # linkstatic=1 (https://github.com/bazelbuild/bazel/issues/3450).
+            # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
+            clean_dep("//tensorflow:darwin"): 1,
+            "//conditions:default": 0,
+        }),
+        nocopts = nocopts,
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_cc_test",
@@ -721,107 +759,115 @@ register_extension_info(
 
 # Part of the testing workflow requires a distinguishable name for the build
 # rules that involve a GPU, even if otherwise identical to the base rule.
-def tf_cc_test_gpu(name,
-                   srcs,
-                   deps,
-                   linkstatic=0,
-                   tags=[],
-                   data=[],
-                   size="medium",
-                   suffix="",
-                   args=None):
-  tf_cc_test(
-      name,
-      srcs,
-      deps,
-      linkstatic=linkstatic,
-      tags=tags,
-      data=data,
-      size=size,
-      suffix=suffix,
-      args=args)
+def tf_cc_test_gpu(
+        name,
+        srcs,
+        deps,
+        linkstatic = 0,
+        tags = [],
+        data = [],
+        size = "medium",
+        suffix = "",
+        args = None):
+    tf_cc_test(
+        name,
+        srcs,
+        deps,
+        linkstatic = linkstatic,
+        tags = tags,
+        data = data,
+        size = size,
+        suffix = suffix,
+        args = args,
+    )
 
 register_extension_info(
     extension_name = "tf_cc_test_gpu",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_cc_test(name,
-                    srcs=[],
-                    deps=[],
-                    tags=[],
-                    data=[],
-                    size="medium",
-                    extra_copts=[],
-                    linkstatic=0,
-                    args=[],
-                    linkopts=[]):
-  tf_cc_test(
-      name=name,
-      srcs=srcs,
-      deps=deps,
-      tags=tags + ["manual"],
-      data=data,
-      size=size,
-      extra_copts=extra_copts,
-      linkstatic=linkstatic,
-      linkopts=linkopts,
-      args=args)
-  tf_cc_test(
-      name=name,
-      srcs=srcs,
-      suffix="_gpu",
-      deps=deps + if_cuda([
-          clean_dep("//tensorflow/core:gpu_runtime"),
-      ]),
-      linkstatic=select({
-          # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-          clean_dep("//tensorflow:darwin"): 1,
-          "@local_config_cuda//cuda:using_nvcc": 1,
-          "@local_config_cuda//cuda:using_clang": 1,
-          "//conditions:default": 0,
-      }),
-      tags=tags + tf_cuda_tests_tags(),
-      data=data,
-      size=size,
-      extra_copts=extra_copts,
-      linkopts=linkopts,
-      args=args)
+def tf_cuda_cc_test(
+        name,
+        srcs = [],
+        deps = [],
+        tags = [],
+        data = [],
+        size = "medium",
+        extra_copts = [],
+        linkstatic = 0,
+        args = [],
+        linkopts = []):
+    tf_cc_test(
+        name = name,
+        srcs = srcs,
+        deps = deps,
+        tags = tags + ["manual"],
+        data = data,
+        size = size,
+        extra_copts = extra_copts,
+        linkstatic = linkstatic,
+        linkopts = linkopts,
+        args = args,
+    )
+    tf_cc_test(
+        name = name,
+        srcs = srcs,
+        suffix = "_gpu",
+        deps = deps + if_cuda([
+            clean_dep("//tensorflow/core:gpu_runtime"),
+        ]),
+        linkstatic = select({
+            # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
+            clean_dep("//tensorflow:darwin"): 1,
+            "@local_config_cuda//cuda:using_nvcc": 1,
+            "@local_config_cuda//cuda:using_clang": 1,
+            "//conditions:default": 0,
+        }),
+        tags = tags + tf_cuda_tests_tags(),
+        data = data,
+        size = size,
+        extra_copts = extra_copts,
+        linkopts = linkopts,
+        args = args,
+    )
 
 register_extension_info(
     extension_name = "tf_cuda_cc_test",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_only_cc_test(name,
-                    srcs=[],
-                    deps=[],
-                    tags=[],
-                    data=[],
-                    size="medium",
-                    linkstatic=0,
-                    args=[],
-                    kernels=[],
-                    linkopts=[]):
-  native.cc_test(
-      name="%s%s" % (name, "_gpu"),
-      srcs=srcs + tf_binary_additional_srcs(),
-      size=size,
-      args=args,
-      copts= _cuda_copts() + tf_copts(),
-      data=data + tf_binary_dynamic_kernel_dsos(kernels),
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda([
-          clean_dep("//tensorflow/core:cuda"),
-          clean_dep("//tensorflow/core:gpu_lib")]),
-      linkopts=if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
-      linkstatic=linkstatic or select({
-          # cc_tests with ".so"s in srcs incorrectly link on Darwin
-          # unless linkstatic=1.
-          # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
-          clean_dep("//tensorflow:darwin"): 1,
-          "//conditions:default": 0,
-      }),
-      tags=tags + tf_cuda_tests_tags())
+def tf_cuda_only_cc_test(
+        name,
+        srcs = [],
+        deps = [],
+        tags = [],
+        data = [],
+        size = "medium",
+        linkstatic = 0,
+        args = [],
+        kernels = [],
+        linkopts = []):
+    native.cc_test(
+        name = "%s%s" % (name, "_gpu"),
+        srcs = srcs + tf_binary_additional_srcs(),
+        size = size,
+        args = args,
+        copts = _cuda_copts() + tf_copts(),
+        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda([
+            clean_dep("//tensorflow/core:cuda"),
+            clean_dep("//tensorflow/core:gpu_lib"),
+        ]),
+        linkopts = if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
+        linkstatic = linkstatic or select({
+            # cc_tests with ".so"s in srcs incorrectly link on Darwin
+            # unless linkstatic=1.
+            # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
+            clean_dep("//tensorflow:darwin"): 1,
+            "//conditions:default": 0,
+        }),
+        tags = tags + tf_cuda_tests_tags(),
+    )
 
 register_extension_info(
     extension_name = "tf_cuda_only_cc_test",
@@ -829,108 +875,112 @@ register_extension_info(
 )
 
 # Create a cc_test for each of the tensorflow tests listed in "tests"
-def tf_cc_tests(srcs,
-                deps,
-                name="",
-                linkstatic=0,
-                tags=[],
-                size="medium",
-                args=None,
-                linkopts=[],
-                nocopts=None):
-  for src in srcs:
-    tf_cc_test(
-        name=src_to_test_name(src),
-        srcs=[src],
-        deps=deps,
-        linkstatic=linkstatic,
-        tags=tags,
-        size=size,
-        args=args,
-        linkopts=linkopts,
-        nocopts=nocopts)
-
-def tf_cc_test_mkl(srcs,
-                   deps,
-                   name="",
-                   data=[],
-                   linkstatic=0,
-                   tags=[],
-                   size="medium",
-                   kernels=[],
-                   args=None):
-  # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
-  disable_header_modules = ["-use_header_modules"]
-
-  for src in srcs:
-    native.cc_test(
-      name=src_to_test_name(src),
-      srcs=if_mkl([src]) + tf_binary_additional_srcs(),
-      copts=tf_copts(),
-      linkopts=select({
-        clean_dep("//tensorflow:android"): [
-            "-pie",
-          ],
-        clean_dep("//tensorflow:windows"): [],
-        "//conditions:default": [
-            "-lpthread",
-            "-lm"
-        ],
-      }) + _rpath_linkopts(src_to_test_name(src)),
-      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
-          [
-              "//third_party/mkl:intel_binary_blob",
-          ],
-      ),
-      data=data + tf_binary_dynamic_kernel_dsos(kernels),
-      linkstatic=linkstatic,
-      tags=tags,
-      size=size,
-      args=args,
-      features=disable_header_modules,
-      nocopts="-fno-exceptions")
-
-
-def tf_cc_tests_gpu(srcs,
-                    deps,
-                    name="",
-                    linkstatic=0,
-                    tags=[],
-                    size="medium",
-                    args=None):
-  tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)
-
-def tf_cuda_cc_tests(srcs,
-                     deps,
-                     name="",
-                     tags=[],
-                     size="medium",
-                     linkstatic=0,
-                     args=None,
-                     linkopts=[]):
-  for src in srcs:
-    tf_cuda_cc_test(
-        name=src_to_test_name(src),
-        srcs=[src],
-        deps=deps,
-        tags=tags,
-        size=size,
-        linkstatic=linkstatic,
-        args=args,
-        linkopts=linkopts)
-
-def tf_java_test(name,
-                 srcs=[],
-                 deps=[],
-                 kernels=[],
-                 *args,
-                 **kwargs):
-  native.java_test(
-      name=name,
-      srcs=srcs,
-      deps=deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
-      *args,
-      **kwargs)
+def tf_cc_tests(
+        srcs,
+        deps,
+        name = "",
+        linkstatic = 0,
+        tags = [],
+        size = "medium",
+        args = None,
+        linkopts = [],
+        nocopts = None):
+    for src in srcs:
+        tf_cc_test(
+            name = src_to_test_name(src),
+            srcs = [src],
+            deps = deps,
+            linkstatic = linkstatic,
+            tags = tags,
+            size = size,
+            args = args,
+            linkopts = linkopts,
+            nocopts = nocopts,
+        )
+
+def tf_cc_test_mkl(
+        srcs,
+        deps,
+        name = "",
+        data = [],
+        linkstatic = 0,
+        tags = [],
+        size = "medium",
+        kernels = [],
+        args = None):
+    # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
+    disable_header_modules = ["-use_header_modules"]
+
+    for src in srcs:
+        native.cc_test(
+            name = src_to_test_name(src),
+            srcs = if_mkl([src]) + tf_binary_additional_srcs(),
+            copts = tf_copts(),
+            linkopts = select({
+                clean_dep("//tensorflow:android"): [
+                    "-pie",
+                ],
+                clean_dep("//tensorflow:windows"): [],
+                "//conditions:default": [
+                    "-lpthread",
+                    "-lm",
+                ],
+            }) + _rpath_linkopts(src_to_test_name(src)),
+            deps = deps + tf_binary_dynamic_kernel_deps(kernels) + mkl_deps(),
+            data = data + tf_binary_dynamic_kernel_dsos(kernels),
+            linkstatic = linkstatic,
+            tags = tags,
+            size = size,
+            args = args,
+            features = disable_header_modules,
+            nocopts = "-fno-exceptions",
+        )
+
+def tf_cc_tests_gpu(
+        srcs,
+        deps,
+        name = "",
+        linkstatic = 0,
+        tags = [],
+        size = "medium",
+        args = None):
+    tf_cc_tests(srcs, deps, linkstatic, tags = tags, size = size, args = args)
+
+def tf_cuda_cc_tests(
+        srcs,
+        deps,
+        name = "",
+        tags = [],
+        size = "medium",
+        linkstatic = 0,
+        args = None,
+        linkopts = []):
+    for src in srcs:
+        tf_cuda_cc_test(
+            name = src_to_test_name(src),
+            srcs = [src],
+            deps = deps,
+            tags = tags,
+            size = size,
+            linkstatic = linkstatic,
+            args = args,
+            linkopts = linkopts,
+        )
+
+def tf_java_test(
+        name,
+        srcs = [],
+        deps = [],
+        kernels = [],
+        *args,
+        **kwargs):
+    native.java_test(
+        name = name,
+        srcs = srcs,
+        deps = deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
+        *args,
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_java_test",
@@ -938,85 +988,89 @@ register_extension_info(
 )
 
 def _cuda_copts():
-  """Gets the appropriate set of copts for (maybe) CUDA compilation.
-
-    If we're doing CUDA compilation, returns copts for our particular CUDA
-    compiler.  If we're not doing CUDA compilation, returns an empty list.
-
-    """
-  return cuda_default_copts() + select({
-      "//conditions:default": [],
-      "@local_config_cuda//cuda:using_nvcc": ([
-          "-nvcc_options=relaxed-constexpr",
-          "-nvcc_options=ftz=true",
-      ]),
-      "@local_config_cuda//cuda:using_clang": ([
-          "-fcuda-flush-denormals-to-zero",
-      ]),
-  })
+    """Gets the appropriate set of copts for (maybe) CUDA compilation.
+
+      If we're doing CUDA compilation, returns copts for our particular CUDA
+      compiler.  If we're not doing CUDA compilation, returns an empty list.
+
+      """
+    return cuda_default_copts() + select({
+        "//conditions:default": [],
+        "@local_config_cuda//cuda:using_nvcc": ([
+            "-nvcc_options=relaxed-constexpr",
+            "-nvcc_options=ftz=true",
+        ]),
+        "@local_config_cuda//cuda:using_clang": ([
+            "-fcuda-flush-denormals-to-zero",
+        ]),
+    })
 
 # Build defs for TensorFlow kernels
 
 # When this target is built using --config=cuda, a cc_library is built
 # that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
 # libraries needed by GPU kernels.
-def tf_gpu_kernel_library(srcs,
-                          copts=[],
-                          cuda_copts=[],
-                          deps=[],
-                          hdrs=[],
-                          **kwargs):
-  copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
-  kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
-
-  native.cc_library(
-      srcs=srcs,
-      hdrs=hdrs,
-      copts=copts,
-      deps=deps + if_cuda([
-          clean_dep("//tensorflow/core:cuda"),
-          clean_dep("//tensorflow/core:gpu_lib"),
-      ]),
-      alwayslink=1,
-      **kwargs)
+def tf_gpu_kernel_library(
+        srcs,
+        copts = [],
+        cuda_copts = [],
+        deps = [],
+        hdrs = [],
+        **kwargs):
+    copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
+    kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
+
+    native.cc_library(
+        srcs = srcs,
+        hdrs = hdrs,
+        copts = copts,
+        deps = deps + if_cuda([
+            clean_dep("//tensorflow/core:cuda"),
+            clean_dep("//tensorflow/core:gpu_lib"),
+        ]),
+        alwayslink = 1,
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_gpu_kernel_library",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
-  """Generate a cc_library with a conditional set of CUDA dependencies.
-
-  When the library is built with --config=cuda:
-
-  - Both deps and cuda_deps are used as dependencies.
-  - The cuda runtime is added as a dependency (if necessary).
-  - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts.
-  - In addition, when the library is also built with TensorRT enabled, it
-      additionally passes -DGOOGLE_TENSORRT=1 to the list of copts.
-
-  Args:
-  - cuda_deps: BUILD dependencies which will be linked if and only if:
-      '--config=cuda' is passed to the bazel command line.
-  - deps: dependencies which will always be linked.
-  - copts: copts always passed to the cc_library.
-  - kwargs: Any other argument to cc_library.
-  """
-  if not deps:
-    deps = []
-  if not cuda_deps:
-    cuda_deps = []
-
-  kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
-  native.cc_library(
-      deps=deps + if_cuda(cuda_deps + [
-          clean_dep("//tensorflow/core:cuda"),
-          "@local_config_cuda//cuda:cuda_headers"
-      ]),
-      copts=(copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
-             if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
-      **kwargs)
+def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs):
+    """Generate a cc_library with a conditional set of CUDA dependencies.
+
+    When the library is built with --config=cuda:
+
+    - Both deps and cuda_deps are used as dependencies.
+    - The cuda runtime is added as a dependency (if necessary).
+    - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts.
+    - In addition, when the library is also built with TensorRT enabled, it
+        additionally passes -DGOOGLE_TENSORRT=1 to the list of copts.
+
+    Args:
+    - cuda_deps: BUILD dependencies which will be linked if and only if:
+        '--config=cuda' is passed to the bazel command line.
+    - deps: dependencies which will always be linked.
+    - copts: copts always passed to the cc_library.
+    - kwargs: Any other argument to cc_library.
+    """
+    if not deps:
+        deps = []
+    if not cuda_deps:
+        cuda_deps = []
+
+    kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
+    native.cc_library(
+        deps = deps + if_cuda(cuda_deps + [
+            clean_dep("//tensorflow/core:cuda"),
+            "@local_config_cuda//cuda:cuda_headers",
+        ]),
+        copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
+                 if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) +
+                 if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "tf_cuda_library",
@@ -1034,126 +1088,138 @@ def tf_kernel_library(
         copts = None,
         is_external = False,
         **kwargs):
-  """A rule to build a TensorFlow OpKernel.
-
-  May either specify srcs/hdrs or prefix.  Similar to tf_cuda_library,
-  but with alwayslink=1 by default.  If prefix is specified:
-    * prefix*.cc (except *.cu.cc) is added to srcs
-    * prefix*.h (except *.cu.h) is added to hdrs
-    * prefix*.cu.cc and prefix*.h (including *.cu.h) are added to gpu_srcs.
-  With the exception that test files are excluded.
-  For example, with prefix = "cast_op",
-    * srcs = ["cast_op.cc"]
-    * hdrs = ["cast_op.h"]
-    * gpu_srcs = ["cast_op_gpu.cu.cc", "cast_op.h"]
-    * "cast_op_test.cc" is excluded
-  With prefix = "cwise_op"
-    * srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"],
-    * hdrs = ["cwise_ops.h", "cwise_ops_common.h"],
-    * gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc",
-                  "cwise_ops.h", "cwise_ops_common.h",
-                  "cwise_ops_gpu_common.cu.h"]
-    * "cwise_ops_test.cc" is excluded
-  """
-  if not srcs:
-    srcs = []
-  if not hdrs:
-    hdrs = []
-  if not deps:
-    deps = []
-  if not copts:
-    copts = []
-  textual_hdrs = []
-  copts = copts + tf_copts(is_external=is_external)
-  if prefix:
-    if native.glob([prefix + "*.cu.cc"], exclude=["*test*"]):
-      if not gpu_srcs:
-        gpu_srcs = []
-      gpu_srcs = gpu_srcs + native.glob(
-          [prefix + "*.cu.cc", prefix + "*.h"], exclude=[prefix + "*test*"])
-    srcs = srcs + native.glob(
-        [prefix + "*.cc"], exclude=[prefix + "*test*", prefix + "*.cu.cc"])
-    hdrs = hdrs + native.glob(
+    """A rule to build a TensorFlow OpKernel.
+
+    May either specify srcs/hdrs or prefix.  Similar to tf_cuda_library,
+    but with alwayslink=1 by default.  If prefix is specified:
+      * prefix*.cc (except *.cu.cc) is added to srcs
+      * prefix*.h (except *.cu.h) is added to hdrs
+      * prefix*.cu.cc and prefix*.h (including *.cu.h) are added to gpu_srcs.
+    With the exception that test files are excluded.
+    For example, with prefix = "cast_op",
+      * srcs = ["cast_op.cc"]
+      * hdrs = ["cast_op.h"]
+      * gpu_srcs = ["cast_op_gpu.cu.cc", "cast_op.h"]
+      * "cast_op_test.cc" is excluded
+    With prefix = "cwise_op"
+      * srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"],
+      * hdrs = ["cwise_ops.h", "cwise_ops_common.h"],
+      * gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc",
+                    "cwise_ops.h", "cwise_ops_common.h",
+                    "cwise_ops_gpu_common.cu.h"]
+      * "cwise_ops_test.cc" is excluded
+    """
+    if not srcs:
+        srcs = []
+    if not hdrs:
+        hdrs = []
+    if not deps:
+        deps = []
+    if not copts:
+        copts = []
+    textual_hdrs = []
+    copts = copts + tf_copts(is_external = is_external)
+    if prefix:
+        if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]):
+            if not gpu_srcs:
+                gpu_srcs = []
+            gpu_srcs = gpu_srcs + native.glob(
+                [prefix + "*.cu.cc", prefix + "*.h"],
+                exclude = [prefix + "*test*"],
+            )
+        srcs = srcs + native.glob(
+            [prefix + "*.cc"],
+            exclude = [prefix + "*test*", prefix + "*.cu.cc"],
+        )
+        hdrs = hdrs + native.glob(
             [prefix + "*.h"],
             exclude = [prefix + "*test*", prefix + "*.cu.h", prefix + "*impl.h"],
         )
-    textual_hdrs = native.glob(
+        textual_hdrs = native.glob(
             [prefix + "*impl.h"],
             exclude = [prefix + "*test*", prefix + "*.cu.h"],
         )
-  cuda_deps = [clean_dep("//tensorflow/core:gpu_lib")]
-  if gpu_srcs:
-    for gpu_src in gpu_srcs:
-      if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"):
-        fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc".
-             format(gpu_src))
-    tf_gpu_kernel_library(
-        name=name + "_gpu", srcs=gpu_srcs, deps=deps, **kwargs)
-    cuda_deps.extend([":" + name + "_gpu"])
-  kwargs["tags"] = kwargs.get("tags", []) + [
-      "req_dep=%s" % clean_dep("//tensorflow/core:gpu_lib"),
-      "req_dep=@local_config_cuda//cuda:cuda_headers",
-  ]
-  tf_cuda_library(
-      name=name,
-      srcs=srcs,
-      hdrs=hdrs,
-      textual_hdrs = textual_hdrs,
-      copts=copts,
-      cuda_deps=cuda_deps,
-      linkstatic=1,  # Needed since alwayslink is broken in bazel b/27630669
-      alwayslink=alwayslink,
-      deps=deps,
-      **kwargs)
-
-  # TODO(gunan): CUDA dependency not clear here. Fix it.
-  tf_cc_shared_object(
-      name="libtfkernel_%s.so" % name,
-      srcs=srcs + hdrs,
-      copts=copts,
-      deps=deps,
-      tags=["manual", "notap"])
+    cuda_deps = [clean_dep("//tensorflow/core:gpu_lib")]
+    if gpu_srcs:
+        for gpu_src in gpu_srcs:
+            if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"):
+                fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc"
+                    .format(gpu_src))
+        tf_gpu_kernel_library(
+            name = name + "_gpu",
+            srcs = gpu_srcs,
+            deps = deps,
+            **kwargs
+        )
+        cuda_deps.extend([":" + name + "_gpu"])
+    kwargs["tags"] = kwargs.get("tags", []) + [
+        "req_dep=%s" % clean_dep("//tensorflow/core:gpu_lib"),
+        "req_dep=@local_config_cuda//cuda:cuda_headers",
+    ]
+    tf_cuda_library(
+        name = name,
+        srcs = srcs,
+        hdrs = hdrs,
+        textual_hdrs = textual_hdrs,
+        copts = copts,
+        cuda_deps = cuda_deps,
+        linkstatic = 1,  # Needed since alwayslink is broken in bazel b/27630669
+        alwayslink = alwayslink,
+        deps = deps,
+        **kwargs
+    )
 
+    # TODO(gunan): CUDA dependency not clear here. Fix it.
+    tf_cc_shared_object(
+        name = "libtfkernel_%s.so" % name,
+        srcs = srcs + hdrs,
+        copts = copts,
+        deps = deps,
+        tags = ["manual", "notap"],
+    )
 
 register_extension_info(
     extension_name = "tf_kernel_library",
     label_regex_for_dep = "{extension_name}(_gpu)?",
 )
 
-def tf_mkl_kernel_library(name,
-                          prefix=None,
-                          srcs=None,
-                          hdrs=None,
-                          deps=None,
-                          alwayslink=1,
-                          copts=tf_copts(),
-                          nocopts="-fno-exceptions"):
-  """A rule to build MKL-based TensorFlow kernel libraries."""
-
-  if not bool(srcs):
-    srcs = []
-  if not bool(hdrs):
-    hdrs = []
-
-  if prefix:
-    srcs = srcs + native.glob(
-        [prefix + "*.cc"])
-    hdrs = hdrs + native.glob(
-        [prefix + "*.h"])
-
-  # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
-  disable_header_modules = ["-use_header_modules"]
-
-  native.cc_library(
-      name=name,
-      srcs=if_mkl(srcs),
-      hdrs=hdrs,
-      deps=deps,
-      alwayslink=alwayslink,
-      copts=copts,
-      nocopts=nocopts,
-      features = disable_header_modules
-  )
+def tf_mkl_kernel_library(
+        name,
+        prefix = None,
+        srcs = None,
+        hdrs = None,
+        deps = None,
+        alwayslink = 1,
+        copts = tf_copts(),
+        nocopts = "-fno-exceptions"):
+    """A rule to build MKL-based TensorFlow kernel libraries."""
+
+    if not bool(srcs):
+        srcs = []
+    if not bool(hdrs):
+        hdrs = []
+
+    if prefix:
+        srcs = srcs + native.glob(
+            [prefix + "*.cc"],
+        )
+        hdrs = hdrs + native.glob(
+            [prefix + "*.h"],
+        )
+
+    # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
+    disable_header_modules = ["-use_header_modules"]
+
+    native.cc_library(
+        name = name,
+        srcs = if_mkl(srcs),
+        hdrs = hdrs,
+        deps = deps,
+        alwayslink = alwayslink,
+        copts = copts,
+        nocopts = nocopts,
+        features = disable_header_modules,
+    )
 
 register_extension_info(
     extension_name = "tf_mkl_kernel_library",
@@ -1162,35 +1228,42 @@ register_extension_info(
 
 # Bazel rules for building swig files.
 def _py_wrap_cc_impl(ctx):
-  srcs = ctx.files.srcs
-  if len(srcs) != 1:
-    fail("Exactly one SWIG source file label must be specified.", "srcs")
-  module_name = ctx.attr.module_name
-  src = ctx.files.srcs[0]
-  inputs = depset([src])
-  inputs += ctx.files.swig_includes
-  for dep in ctx.attr.deps:
-    inputs += dep.cc.transitive_headers
-  inputs += ctx.files._swiglib
-  inputs += ctx.files.toolchain_deps
-  swig_include_dirs = depset(_get_repository_roots(ctx, inputs))
-  swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
-  args = [
-      "-c++", "-python", "-module", module_name, "-o", ctx.outputs.cc_out.path,
-      "-outdir", ctx.outputs.py_out.dirname
-  ]
-  args += ["-l" + f.path for f in ctx.files.swig_includes]
-  args += ["-I" + i for i in swig_include_dirs]
-  args += [src.path]
-  outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
-  ctx.action(
-      executable=ctx.executable._swig,
-      arguments=args,
-      inputs=list(inputs),
-      outputs=outputs,
-      mnemonic="PythonSwig",
-      progress_message="SWIGing " + src.path)
-  return struct(files=depset(outputs))
+    srcs = ctx.files.srcs
+    if len(srcs) != 1:
+        fail("Exactly one SWIG source file label must be specified.", "srcs")
+    module_name = ctx.attr.module_name
+    src = ctx.files.srcs[0]
+    inputs = depset([src])
+    inputs += ctx.files.swig_includes
+    for dep in ctx.attr.deps:
+        inputs += dep.cc.transitive_headers
+    inputs += ctx.files._swiglib
+    inputs += ctx.files.toolchain_deps
+    swig_include_dirs = depset(_get_repository_roots(ctx, inputs))
+    swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
+    args = [
+        "-c++",
+        "-python",
+        "-module",
+        module_name,
+        "-o",
+        ctx.outputs.cc_out.path,
+        "-outdir",
+        ctx.outputs.py_out.dirname,
+    ]
+    args += ["-l" + f.path for f in ctx.files.swig_includes]
+    args += ["-I" + i for i in swig_include_dirs]
+    args += [src.path]
+    outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
+    ctx.action(
+        executable = ctx.executable._swig,
+        arguments = args,
+        inputs = list(inputs),
+        outputs = outputs,
+        mnemonic = "PythonSwig",
+        progress_message = "SWIGing " + src.path,
+    )
+    return struct(files = depset(outputs))
 
 _py_wrap_cc = rule(
     attrs = {
@@ -1228,40 +1301,40 @@ _py_wrap_cc = rule(
 )
 
 def _get_repository_roots(ctx, files):
-  """Returns abnormal root directories under which files reside.
-
-  When running a ctx.action, source files within the main repository are all
-  relative to the current directory; however, files that are generated or exist
-  in remote repositories will have their root directory be a subdirectory,
-  e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function
-  returns the set of these devious directories, ranked and sorted by popularity
-  in order to hopefully minimize the number of I/O system calls within the
-  compiler, because includes have quadratic complexity.
-  """
-  result = {}
-  for f in files:
-    root = f.root.path
-    if root:
-      if root not in result:
-        result[root] = 0
-      result[root] -= 1
-    work = f.owner.workspace_root
-    if work:
-      if root:
-        root += "/"
-      root += work
-    if root:
-      if root not in result:
-        result[root] = 0
-      result[root] -= 1
-  return [k for v, k in sorted([(v, k) for k, v in result.items()])]
+    """Returns abnormal root directories under which files reside.
+
+    When running a ctx.action, source files within the main repository are all
+    relative to the current directory; however, files that are generated or exist
+    in remote repositories will have their root directory be a subdirectory,
+    e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function
+    returns the set of these devious directories, ranked and sorted by popularity
+    in order to hopefully minimize the number of I/O system calls within the
+    compiler, because includes have quadratic complexity.
+    """
+    result = {}
+    for f in files:
+        root = f.root.path
+        if root:
+            if root not in result:
+                result[root] = 0
+            result[root] -= 1
+        work = f.owner.workspace_root
+        if work:
+            if root:
+                root += "/"
+            root += work
+        if root:
+            if root not in result:
+                result[root] = 0
+            result[root] -= 1
+    return [k for v, k in sorted([(v, k) for k, v in result.items()])]
 
 # Bazel rule for collecting the header files that a target depends on.
 def _transitive_hdrs_impl(ctx):
-  outputs = depset()
-  for dep in ctx.attr.deps:
-    outputs += dep.cc.transitive_headers
-  return struct(files=outputs)
+    outputs = depset()
+    for dep in ctx.attr.deps:
+        outputs += dep.cc.transitive_headers
+    return struct(files = outputs)
 
 _transitive_hdrs = rule(
     attrs = {
@@ -1273,52 +1346,54 @@ _transitive_hdrs = rule(
     implementation = _transitive_hdrs_impl,
 )
 
-def transitive_hdrs(name, deps=[], **kwargs):
-  _transitive_hdrs(name=name + "_gather", deps=deps)
-  native.filegroup(name=name, srcs=[":" + name + "_gather"])
+def transitive_hdrs(name, deps = [], **kwargs):
+    _transitive_hdrs(name = name + "_gather", deps = deps)
+    native.filegroup(name = name, srcs = [":" + name + "_gather"])
 
 # Create a header only library that includes all the headers exported by
 # the libraries in deps.
-def cc_header_only_library(name, deps=[], includes=[], **kwargs):
-  _transitive_hdrs(name=name + "_gather", deps=deps)
-  native.cc_library(name=name,
-                    hdrs=[":" + name + "_gather"],
-                    includes=includes,
-                    **kwargs)
+def cc_header_only_library(name, deps = [], includes = [], **kwargs):
+    _transitive_hdrs(name = name + "_gather", deps = deps)
+    native.cc_library(
+        name = name,
+        hdrs = [":" + name + "_gather"],
+        includes = includes,
+        **kwargs
+    )
 
 def tf_custom_op_library_additional_deps():
-  return [
+    return [
       "@protobuf_archive//:protobuf_headers",
-      clean_dep("//third_party/eigen3"),
-      clean_dep("//tensorflow/core:framework_headers_lib"),
-  ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
+        clean_dep("//third_party/eigen3"),
+        clean_dep("//tensorflow/core:framework_headers_lib"),
+    ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
 
 # A list of targets that contains the implemenation of
 # tf_custom_op_library_additional_deps. It's used to generate a DEF file for
 # exporting symbols from _pywrap_tensorflow.dll on Windows.
 def tf_custom_op_library_additional_deps_impl():
-  return [
+    return [
       "@protobuf_archive//:protobuf",
       "@nsync//:nsync_cpp",
-      # for //third_party/eigen3
-      clean_dep("//third_party/eigen3"),
-      # for //tensorflow/core:framework_headers_lib
-      clean_dep("//tensorflow/core:framework"),
-      clean_dep("//tensorflow/core:reader_base"),
-  ]
+        # for //third_party/eigen3
+        clean_dep("//third_party/eigen3"),
+        # for //tensorflow/core:framework_headers_lib
+        clean_dep("//tensorflow/core:framework"),
+        clean_dep("//tensorflow/core:reader_base"),
+    ]
 
 # Traverse the dependency graph along the "deps" attribute of the
 # target and return a struct with one field called 'tf_collected_deps'.
 # tf_collected_deps will be the union of the deps of the current target
 # and the tf_collected_deps of the dependencies of this target.
 def _collect_deps_aspect_impl(target, ctx):
-  alldeps = depset()
-  if hasattr(ctx.rule.attr, "deps"):
-    for dep in ctx.rule.attr.deps:
-      alldeps = alldeps | depset([dep.label])
-      if hasattr(dep, "tf_collected_deps"):
-        alldeps = alldeps | dep.tf_collected_deps
-  return struct(tf_collected_deps=alldeps)
+    alldeps = depset()
+    if hasattr(ctx.rule.attr, "deps"):
+        for dep in ctx.rule.attr.deps:
+            alldeps = alldeps | depset([dep.label])
+            if hasattr(dep, "tf_collected_deps"):
+                alldeps = alldeps | dep.tf_collected_deps
+    return struct(tf_collected_deps = alldeps)
 
 collect_deps_aspect = aspect(
     attr_aspects = ["deps"],
@@ -1326,24 +1401,26 @@ collect_deps_aspect = aspect(
 )
 
 def _dep_label(dep):
-  label = dep.label
-  return label.package + ":" + label.name
+    label = dep.label
+    return label.package + ":" + label.name
 
 # This rule checks that the transitive dependencies of targets listed
 # in the 'deps' attribute don't depend on the targets listed in
 # the 'disallowed_deps' attribute.
 def _check_deps_impl(ctx):
-  disallowed_deps = ctx.attr.disallowed_deps
-  for input_dep in ctx.attr.deps:
-    if not hasattr(input_dep, "tf_collected_deps"):
-      continue
-    for dep in input_dep.tf_collected_deps:
-      for disallowed_dep in disallowed_deps:
-        if dep == disallowed_dep.label:
-          fail(
-              _dep_label(input_dep) + " cannot depend on " + _dep_label(
-                  disallowed_dep))
-  return struct()
+    disallowed_deps = ctx.attr.disallowed_deps
+    for input_dep in ctx.attr.deps:
+        if not hasattr(input_dep, "tf_collected_deps"):
+            continue
+        for dep in input_dep.tf_collected_deps:
+            for disallowed_dep in disallowed_deps:
+                if dep == disallowed_dep.label:
+                    fail(
+                        _dep_label(input_dep) + " cannot depend on " + _dep_label(
+                            disallowed_dep,
+                        ),
+                    )
+    return struct()
 
 check_deps = rule(
     _check_deps_impl,
@@ -1362,65 +1439,70 @@ check_deps = rule(
 
 # Helper to build a dynamic library (.so) from the sources containing
 # implementations of custom ops and kernels.
-def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
-  cuda_deps = [
-      clean_dep("//tensorflow/core:stream_executor_headers_lib"),
-      "@local_config_cuda//cuda:cuda_headers",
-      "@local_config_cuda//cuda:cudart_static",
-  ]
-  deps = deps + tf_custom_op_library_additional_deps()
-  if gpu_srcs:
-    basename = name.split(".")[0]
-    native.cc_library(
-        name=basename + "_gpu",
-        srcs=gpu_srcs,
-        copts=_cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
-        features = if_cuda(["-use_header_modules"]),
-        deps=deps + if_cuda(cuda_deps))
-    cuda_deps.extend([":" + basename + "_gpu"])
-
-  check_deps(
-      name=name + "_check_deps",
-      deps=deps + if_cuda(cuda_deps),
-      disallowed_deps=[
-          clean_dep("//tensorflow/core:framework"),
-          clean_dep("//tensorflow/core:lib")
-      ])
-  tf_cc_shared_object(
-      name=name,
-      srcs=srcs,
-      deps=deps + if_cuda(cuda_deps),
-      data=if_static([name + "_check_deps"]),
-      copts=tf_copts(is_external=True),
-      features = ["windows_export_all_symbols"],
-      linkopts=linkopts + select({
-          "//conditions:default": [
-              "-lm",
-          ],
-          clean_dep("//tensorflow:windows"): [],
-          clean_dep("//tensorflow:darwin"): [],
-      }),)
+def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = []):
+    cuda_deps = [
+        clean_dep("//tensorflow/core:stream_executor_headers_lib"),
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_cuda//cuda:cudart_static",
+    ]
+    deps = deps + tf_custom_op_library_additional_deps()
+    if gpu_srcs:
+        basename = name.split(".")[0]
+        native.cc_library(
+            name = basename + "_gpu",
+            srcs = gpu_srcs,
+            copts = _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
+            features = if_cuda(["-use_header_modules"]),
+            deps = deps + if_cuda(cuda_deps),
+        )
+        cuda_deps.extend([":" + basename + "_gpu"])
+
+    check_deps(
+        name = name + "_check_deps",
+        deps = deps + if_cuda(cuda_deps),
+        disallowed_deps = [
+            clean_dep("//tensorflow/core:framework"),
+            clean_dep("//tensorflow/core:lib"),
+        ],
+    )
+    tf_cc_shared_object(
+        name = name,
+        srcs = srcs,
+        deps = deps + if_cuda(cuda_deps),
+        data = if_static([name + "_check_deps"]),
+        copts = tf_copts(is_external = True),
+        features = ["windows_export_all_symbols"],
+        linkopts = linkopts + select({
+            "//conditions:default": [
+                "-lm",
+            ],
+            clean_dep("//tensorflow:windows"): [],
+            clean_dep("//tensorflow:darwin"): [],
+        }),
+    )
 
 register_extension_info(
     extension_name = "tf_custom_op_library",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_custom_op_py_library(name,
-                            srcs=[],
-                            dso=[],
-                            kernels=[],
-                            srcs_version="PY2AND3",
-                            visibility=None,
-                            deps=[]):
-  kernels = kernels  # unused argument
-  native.py_library(
-      name=name,
-      data=dso,
-      srcs=srcs,
-      srcs_version=srcs_version,
-      visibility=visibility,
-      deps=deps,)
+def tf_custom_op_py_library(
+        name,
+        srcs = [],
+        dso = [],
+        kernels = [],
+        srcs_version = "PY2AND3",
+        visibility = None,
+        deps = []):
+    kernels = kernels  # unused argument
+    native.py_library(
+        name = name,
+        data = dso,
+        srcs = srcs,
+        srcs_version = srcs_version,
+        visibility = visibility,
+        deps = deps,
+    )
 
 register_extension_info(
     extension_name = "tf_custom_op_py_library",
@@ -1434,117 +1516,127 @@ register_extension_info(
 # This function attempts to append init_module_name to list of
 # exported functions in version script
 def _append_init_to_versionscript_impl(ctx):
-  mod_name = ctx.attr.module_name
-  if ctx.attr.is_version_script:
-    ctx.actions.expand_template(
-      template=ctx.file.template_file,
-      output=ctx.outputs.versionscript,
-      substitutions={
-        "global:":"global:\n     init_%s;\n     PyInit_*;"%(mod_name),
-      },
-      is_executable=False,
-    )
-  else:
-    ctx.actions.expand_template(
-      template=ctx.file.template_file,
-      output=ctx.outputs.versionscript,
-      substitutions={
-        "*tensorflow*":"*tensorflow*\ninit_%s\nPyInit_*\n"%(mod_name),
-      },
-      is_executable=False,
-    )
-
+    mod_name = ctx.attr.module_name
+    if ctx.attr.is_version_script:
+        ctx.actions.expand_template(
+            template = ctx.file.template_file,
+            output = ctx.outputs.versionscript,
+            substitutions = {
+                "global:": "global:\n     init_%s;\n     PyInit_*;" % (mod_name),
+            },
+            is_executable = False,
+        )
+    else:
+        ctx.actions.expand_template(
+            template = ctx.file.template_file,
+            output = ctx.outputs.versionscript,
+            substitutions = {
+                "*tensorflow*": "*tensorflow*\ninit_%s\nPyInit_*\n" % (mod_name),
+            },
+            is_executable = False,
+        )
 
-_append_init_to_versionscript= rule(
-  implementation=_append_init_to_versionscript_impl,
-  attrs={
-    "module_name":attr.string(mandatory=True),
-    "template_file":attr.label(allow_files=True,single_file=True,mandatory=True),
-    "is_version_script":attr.bool(default=True,
-      doc='whether target is a ld version script or exported symbol list',
-      mandatory=False),
-  },
-  outputs={"versionscript":"%{name}.lds"},
+_append_init_to_versionscript = rule(
+    implementation = _append_init_to_versionscript_impl,
+    attrs = {
+        "module_name": attr.string(mandatory = True),
+        "template_file": attr.label(allow_files = True, single_file = True, mandatory = True),
+        "is_version_script": attr.bool(
+            default = True,
+            doc = "whether target is a ld version script or exported symbol list",
+            mandatory = False,
+        ),
+    },
+    outputs = {"versionscript": "%{name}.lds"},
 )
 
-def tf_py_wrap_cc(name,
-                             srcs,
-                             swig_includes=[],
-                             deps=[],
-                             copts=[],
-                             **kwargs):
-  module_name = name.split("/")[-1]
-  # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
-  # and use that as the name for the rule producing the .so file.
-  cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
-  cc_library_pyd_name = "/".join(
-      name.split("/")[:-1] + ["_" + module_name + ".pyd"])
-  extra_deps = []
-  _py_wrap_cc(
-      name=name + "_py_wrap",
-      srcs=srcs,
-      swig_includes=swig_includes,
-      deps=deps + extra_deps,
-      toolchain_deps=["@bazel_tools//tools/cpp:current_cc_toolchain"],
-      module_name=module_name,
-      py_module_name=name)
-  vscriptname=name+"_versionscript"
-  _append_init_to_versionscript(
-      name=vscriptname,
-      module_name=module_name,
-      is_version_script=select({
-          "@local_config_cuda//cuda:darwin":False,
-          "//conditions:default":True,
-          }),
-      template_file=select({
-          "@local_config_cuda//cuda:darwin":clean_dep("//tensorflow:tf_exported_symbols.lds"),
-          "//conditions:default":clean_dep("//tensorflow:tf_version_script.lds")
-      })
-  )
-  extra_linkopts = select({
-      "@local_config_cuda//cuda:darwin": [
-          "-Wl,-exported_symbols_list",
-          "$(location %s.lds)"%vscriptname,
-      ],
-      clean_dep("//tensorflow:windows"): [],
-      "//conditions:default": [
-          "-Wl,--version-script",
-          "$(location %s.lds)"%vscriptname,
-      ]
-  })
-  extra_deps += select({
-      "@local_config_cuda//cuda:darwin": [
-          "%s.lds"%vscriptname,
-      ],
-      clean_dep("//tensorflow:windows"): [],
-      "//conditions:default": [
-          "%s.lds"%vscriptname,
-      ]
-  })
-
-  tf_cc_shared_object(
-      name=cc_library_name,
-      srcs=[module_name + ".cc"],
-      copts=copts + if_not_windows([
-          "-Wno-self-assign", "-Wno-sign-compare", "-Wno-write-strings"
-      ]),
-      linkopts=extra_linkopts,
-      linkstatic=1,
-      deps=deps + extra_deps,
-      **kwargs)
-  native.genrule(
-      name="gen_" + cc_library_pyd_name,
-      srcs=[":" + cc_library_name],
-      outs=[cc_library_pyd_name],
-      cmd="cp $< $@",)
-  native.py_library(
-      name=name,
-      srcs=[":" + name + ".py"],
-      srcs_version="PY2AND3",
-      data=select({
-          clean_dep("//tensorflow:windows"): [":" + cc_library_pyd_name],
-          "//conditions:default": [":" + cc_library_name],
-      }))
+def tf_py_wrap_cc(
+        name,
+        srcs,
+        swig_includes = [],
+        deps = [],
+        copts = [],
+        **kwargs):
+    module_name = name.split("/")[-1]
+
+    # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
+    # and use that as the name for the rule producing the .so file.
+    cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
+    cc_library_pyd_name = "/".join(
+        name.split("/")[:-1] + ["_" + module_name + ".pyd"],
+    )
+    extra_deps = []
+    _py_wrap_cc(
+        name = name + "_py_wrap",
+        srcs = srcs,
+        swig_includes = swig_includes,
+        deps = deps + extra_deps,
+        toolchain_deps = ["@bazel_tools//tools/cpp:current_cc_toolchain"],
+        module_name = module_name,
+        py_module_name = name,
+    )
+    vscriptname = name + "_versionscript"
+    _append_init_to_versionscript(
+        name = vscriptname,
+        module_name = module_name,
+        is_version_script = select({
+            "@local_config_cuda//cuda:darwin": False,
+            "//conditions:default": True,
+        }),
+        template_file = select({
+            "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
+            "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
+        }),
+    )
+    extra_linkopts = select({
+        "@local_config_cuda//cuda:darwin": [
+            "-Wl,-exported_symbols_list",
+            "$(location %s.lds)" % vscriptname,
+        ],
+        clean_dep("//tensorflow:windows"): [],
+        "//conditions:default": [
+            "-Wl,--version-script",
+            "$(location %s.lds)" % vscriptname,
+        ],
+    })
+    extra_deps += select({
+        "@local_config_cuda//cuda:darwin": [
+            "%s.lds" % vscriptname,
+        ],
+        clean_dep("//tensorflow:windows"): [],
+        "//conditions:default": [
+            "%s.lds" % vscriptname,
+        ],
+    })
+
+    tf_cc_shared_object(
+        name = cc_library_name,
+        srcs = [module_name + ".cc"],
+        copts = copts + if_not_windows([
+            "-Wno-self-assign",
+            "-Wno-sign-compare",
+            "-Wno-write-strings",
+        ]),
+        linkopts = extra_linkopts,
+        linkstatic = 1,
+        deps = deps + extra_deps,
+        **kwargs
+    )
+    native.genrule(
+        name = "gen_" + cc_library_pyd_name,
+        srcs = [":" + cc_library_name],
+        outs = [cc_library_pyd_name],
+        cmd = "cp $< $@",
+    )
+    native.py_library(
+        name = name,
+        srcs = [":" + name + ".py"],
+        srcs_version = "PY2AND3",
+        data = select({
+            clean_dep("//tensorflow:windows"): [":" + cc_library_pyd_name],
+            "//conditions:default": [":" + cc_library_name],
+        }),
+    )
 
 # This macro is for running python tests against system installed pip package
 # on Windows.
@@ -1562,246 +1654,263 @@ def tf_py_wrap_cc(name,
 #    Note that this only works on Windows. See the definition of
 #    //third_party/tensorflow/tools/pip_package:win_pip_package_marker for specific reasons.
 # 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test.
-def py_test(deps=[], data=[], **kwargs):
-  native.py_test(
-      # TODO(jlebar): Ideally we'd use tcmalloc here.,
-      deps=select({
-          "//conditions:default": deps,
-          clean_dep("//tensorflow:no_tensorflow_py_deps"): [],
-      }),
-      data = data + select({
-          "//conditions:default": [],
-          clean_dep("//tensorflow:no_tensorflow_py_deps"):
-          ["//tensorflow/tools/pip_package:win_pip_package_marker"],
-      }),
-      **kwargs)
+def py_test(deps = [], data = [], **kwargs):
+    native.py_test(
+        # TODO(jlebar): Ideally we'd use tcmalloc here.,
+        deps = select({
+            "//conditions:default": deps,
+            clean_dep("//tensorflow:no_tensorflow_py_deps"): [],
+        }),
+        data = data + select({
+            "//conditions:default": [],
+            clean_dep("//tensorflow:no_tensorflow_py_deps"): ["//tensorflow/tools/pip_package:win_pip_package_marker"],
+        }),
+        **kwargs
+    )
 
 register_extension_info(
     extension_name = "py_test",
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_py_test(name,
-               srcs,
-               size="medium",
-               data=[],
-               main=None,
-               args=[],
-               tags=[],
-               shard_count=1,
-               additional_deps=[],
-               flaky=0,
-               xla_enabled=False,
-               grpc_enabled=False):
-  if xla_enabled:
-    additional_deps = additional_deps + tf_additional_xla_deps_py()
-  if grpc_enabled:
-    additional_deps = additional_deps + tf_additional_grpc_deps_py()
-  py_test(
-      name=name,
-      size=size,
-      srcs=srcs,
-      main=main,
-      args=args,
-      tags=tags,
-      visibility=[clean_dep("//tensorflow:internal")],
-      shard_count=shard_count,
-      data=data,
-      deps=[
+def tf_py_test(
+        name,
+        srcs,
+        size = "medium",
+        data = [],
+        main = None,
+        args = [],
+        tags = [],
+        shard_count = 1,
+        additional_deps = [],
+        flaky = 0,
+        xla_enabled = False,
+        grpc_enabled = False):
+    if xla_enabled:
+        additional_deps = additional_deps + tf_additional_xla_deps_py()
+    if grpc_enabled:
+        additional_deps = additional_deps + tf_additional_grpc_deps_py()
+    py_test(
+        name = name,
+        size = size,
+        srcs = srcs,
+        main = main,
+        args = args,
+        tags = tags,
+        visibility = [clean_dep("//tensorflow:internal")],
+        shard_count = shard_count,
+        data = data,
+        deps = [
             clean_dep("//tensorflow/python:extra_py_tests_deps"),
             clean_dep("//tensorflow/python:gradient_checker"),
-          ] + additional_deps,
-      flaky=flaky,
-      srcs_version="PY2AND3")
+        ] + additional_deps,
+        flaky = flaky,
+        srcs_version = "PY2AND3",
+    )
 
 register_extension_info(
     extension_name = "tf_py_test",
     label_regex_map = {"additional_deps": "deps:{extension_name}"},
 )
 
-def cuda_py_test(name,
-                 srcs,
-                 size="medium",
-                 data=[],
-                 main=None,
-                 args=[],
-                 shard_count=1,
-                 additional_deps=[],
-                 tags=[],
-                 flaky=0,
-                 xla_enabled=False,
-                 grpc_enabled=False):
-  test_tags = tags + tf_cuda_tests_tags()
-  tf_py_test(
-      name=name,
-      size=size,
-      srcs=srcs,
-      data=data,
-      main=main,
-      args=args,
-      tags=test_tags,
-      shard_count=shard_count,
-      additional_deps=additional_deps,
-      flaky=flaky,
-      xla_enabled=xla_enabled,
-      grpc_enabled=grpc_enabled)
+def cuda_py_test(
+        name,
+        srcs,
+        size = "medium",
+        data = [],
+        main = None,
+        args = [],
+        shard_count = 1,
+        additional_deps = [],
+        tags = [],
+        flaky = 0,
+        xla_enabled = False,
+        grpc_enabled = False):
+    test_tags = tags + tf_cuda_tests_tags()
+    tf_py_test(
+        name = name,
+        size = size,
+        srcs = srcs,
+        data = data,
+        main = main,
+        args = args,
+        tags = test_tags,
+        shard_count = shard_count,
+        additional_deps = additional_deps,
+        flaky = flaky,
+        xla_enabled = xla_enabled,
+        grpc_enabled = grpc_enabled,
+    )
 
 register_extension_info(
     extension_name = "cuda_py_test",
     label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
 )
 
-def sycl_py_test(name,
-                 srcs,
-                 size="medium",
-                 data=[],
-                 main=None,
-                 args=[],
-                 shard_count=1,
-                 additional_deps=[],
-                 tags=[],
-                 flaky=0,
-                 xla_enabled=False,
-                 grpc_enabled=False):
-  test_tags = tags + tf_sycl_tests_tags()
-  tf_py_test(
-      name=name,
-      size=size,
-      srcs=srcs,
-      data=data,
-      main=main,
-      args=args,
-      tags=test_tags,
-      shard_count=shard_count,
-      additional_deps=additional_deps,
-      flaky=flaky,
-      xla_enabled=xla_enabled,
-      grpc_enabled=grpc_enabled)
+def sycl_py_test(
+        name,
+        srcs,
+        size = "medium",
+        data = [],
+        main = None,
+        args = [],
+        shard_count = 1,
+        additional_deps = [],
+        tags = [],
+        flaky = 0,
+        xla_enabled = False,
+        grpc_enabled = False):
+    test_tags = tags + tf_sycl_tests_tags()
+    tf_py_test(
+        name = name,
+        size = size,
+        srcs = srcs,
+        data = data,
+        main = main,
+        args = args,
+        tags = test_tags,
+        shard_count = shard_count,
+        additional_deps = additional_deps,
+        flaky = flaky,
+        xla_enabled = xla_enabled,
+        grpc_enabled = grpc_enabled,
+    )
 
 register_extension_info(
     extension_name = "sycl_py_test",
     label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
 )
 
-def py_tests(name,
-             srcs,
-             size="medium",
-             additional_deps=[],
-             data=[],
-             tags=[],
-             shard_count=1,
-             prefix="",
-             xla_enabled=False,
-             grpc_enabled=False):
-  for src in srcs:
-    test_name = src.split("/")[-1].split(".")[0]
-    if prefix:
-      test_name = "%s_%s" % (prefix, test_name)
-    tf_py_test(
-        name=test_name,
-        size=size,
-        srcs=[src],
-        main=src,
-        tags=tags,
-        shard_count=shard_count,
-        data=data,
-        additional_deps=additional_deps,
-        xla_enabled=xla_enabled,
-        grpc_enabled=grpc_enabled)
-
-def cuda_py_tests(name,
-                  srcs,
-                  size="medium",
-                  additional_deps=[],
-                  data=[],
-                  shard_count=1,
-                  tags=[],
-                  prefix="",
-                  xla_enabled=False,
-                  grpc_enabled=False):
-  test_tags = tags + tf_cuda_tests_tags()
-  py_tests(
-      name=name,
-      size=size,
-      srcs=srcs,
-      additional_deps=additional_deps,
-      data=data,
-      tags=test_tags,
-      shard_count=shard_count,
-      prefix=prefix,
-      xla_enabled=xla_enabled,
-      grpc_enabled=grpc_enabled)
+def py_tests(
+        name,
+        srcs,
+        size = "medium",
+        additional_deps = [],
+        data = [],
+        tags = [],
+        shard_count = 1,
+        prefix = "",
+        xla_enabled = False,
+        grpc_enabled = False):
+    for src in srcs:
+        test_name = src.split("/")[-1].split(".")[0]
+        if prefix:
+            test_name = "%s_%s" % (prefix, test_name)
+        tf_py_test(
+            name = test_name,
+            size = size,
+            srcs = [src],
+            main = src,
+            tags = tags,
+            shard_count = shard_count,
+            data = data,
+            additional_deps = additional_deps,
+            xla_enabled = xla_enabled,
+            grpc_enabled = grpc_enabled,
+        )
+
+def cuda_py_tests(
+        name,
+        srcs,
+        size = "medium",
+        additional_deps = [],
+        data = [],
+        shard_count = 1,
+        tags = [],
+        prefix = "",
+        xla_enabled = False,
+        grpc_enabled = False):
+    test_tags = tags + tf_cuda_tests_tags()
+    py_tests(
+        name = name,
+        size = size,
+        srcs = srcs,
+        additional_deps = additional_deps,
+        data = data,
+        tags = test_tags,
+        shard_count = shard_count,
+        prefix = prefix,
+        xla_enabled = xla_enabled,
+        grpc_enabled = grpc_enabled,
+    )
 
 # Creates a genrule named <name> for running tools/proto_text's generator to
 # make the proto_text functions, for the protos passed in <srcs>.
 #
 # Return a struct with fields (hdrs, srcs) containing the names of the
 # generated files.
-def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps=[], deps=[], visibility=None):
-  out_hdrs = (
-      [p.replace(".proto", ".pb_text.h")
-       for p in srcs] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs])
-  out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
-  native.genrule(
-      name=name + "_srcs",
-      srcs=srcs + protodeps + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
-      outs=out_hdrs + out_srcs,
-      visibility=visibility,
-      cmd=
-      "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) "
-      + "$(@D) " + srcs_relative_dir + " $(SRCS)",
-      tools=[
-          clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions")
-      ],)
-
-  native.filegroup(
-      name=name + "_hdrs",
-      srcs=out_hdrs,
-      visibility=visibility,
-  )
-
-  native.cc_library(
-      name=name,
-      srcs=out_srcs,
-      hdrs=out_hdrs,
-      visibility=visibility,
-      deps = deps,
-  )
+def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps = [], deps = [], visibility = None):
+    out_hdrs = (
+        [
+            p.replace(".proto", ".pb_text.h")
+            for p in srcs
+        ] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs]
+    )
+    out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
+    native.genrule(
+        name = name + "_srcs",
+        srcs = srcs + protodeps + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
+        outs = out_hdrs + out_srcs,
+        visibility = visibility,
+        cmd =
+            "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " +
+            "$(@D) " + srcs_relative_dir + " $(SRCS)",
+        tools = [
+            clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions"),
+        ],
+    )
+
+    native.filegroup(
+        name = name + "_hdrs",
+        srcs = out_hdrs,
+        visibility = visibility,
+    )
+
+    native.cc_library(
+        name = name,
+        srcs = out_srcs,
+        hdrs = out_hdrs,
+        visibility = visibility,
+        deps = deps,
+    )
 
 def tf_genrule_cmd_append_to_srcs(to_append):
-  return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
-          " >> $(@)")
+    return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
+            " >> $(@)")
 
 def tf_version_info_genrule():
-  native.genrule(
-      name="version_info_gen",
-      srcs=[
-          clean_dep("@local_config_git//:gen/spec.json"),
-          clean_dep("@local_config_git//:gen/head"),
-          clean_dep("@local_config_git//:gen/branch_ref"),
-      ],
-      outs=["util/version_info.cc"],
-      cmd=
-      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
-      local=1,
-      tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
+    native.genrule(
+        name = "version_info_gen",
+        srcs = [
+            clean_dep("@local_config_git//:gen/spec.json"),
+            clean_dep("@local_config_git//:gen/head"),
+            clean_dep("@local_config_git//:gen/branch_ref"),
+        ],
+        outs = ["util/version_info.cc"],
+        cmd =
+            "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
+        local = 1,
+        tools = [clean_dep("//tensorflow/tools/git:gen_git_source.py")],
+    )
 
 def tf_py_build_info_genrule():
-  native.genrule(
-      name="py_build_info_gen",
-      outs=["platform/build_info.py"],
-      cmd=
-     "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
-      local=1,
-      tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],)
-
-def cc_library_with_android_deps(deps,
-                                 android_deps=[],
-                                 common_deps=[],
-                                 copts=tf_copts(),
-                                 **kwargs):
-  deps = if_not_android(deps) + if_android(android_deps) + common_deps
-  native.cc_library(deps=deps, copts=copts, **kwargs)
+    native.genrule(
+        name = "py_build_info_gen",
+        outs = ["platform/build_info.py"],
+        cmd =
+            "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
+        local = 1,
+        tools = [clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],
+    )
+
+def cc_library_with_android_deps(
+        deps,
+        android_deps = [],
+        common_deps = [],
+        copts = tf_copts(),
+        **kwargs):
+    deps = if_not_android(deps) + if_android(android_deps) + common_deps
+    native.cc_library(deps = deps, copts = copts, **kwargs)
 
 register_extension_info(
     extension_name = "cc_library_with_android_deps",
diff --git a/tensorflow/tools/api/golden/BUILD b/tensorflow/tools/api/golden/BUILD
index 1f041ef193..4389a999e7 100644
--- a/tensorflow/tools/api/golden/BUILD
+++ b/tensorflow/tools/api/golden/BUILD
@@ -13,5 +13,5 @@ filegroup(
 
 filegroup(
     name = "api_golden_v2",
-    srcs = glob(["v1/*.pbtxt"]),
+    srcs = glob(["v2/*.pbtxt"]),
 )
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
index eac236d498..3add49e90d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
@@ -24,6 +24,10 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "values"
     mtype: "<type \'property\'>"
   }
@@ -32,6 +36,10 @@ tf_class {
     argspec: "args=[\'self\', \'indices\', \'values\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "eval"
     argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index e841c4ad89..05698b03ee 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -53,15 +53,15 @@ tf_class {
   }
   member_method {
     name: "assign"
-    argspec: "args=[\'self\', \'value\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'self\', \'value\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
   member_method {
     name: "assign_add"
-    argspec: "args=[\'self\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
   member_method {
     name: "assign_sub"
-    argspec: "args=[\'self\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
   member_method {
     name: "count_up_to"
@@ -92,8 +92,28 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "scatter_add"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_add"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_sub"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_update"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "scatter_sub"
-    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "set_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt
index bf1f94b6ae..269e18a0a7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt
@@ -96,7 +96,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\', \'protocol\', \'eval_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\', \'protocol\', \'eval_distribute\', \'experimental_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "replace"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index e579fe6a1a..d843194ef0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -119,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index 97688fcb0f..b8e9baca71 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -124,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index 86e328888e..5510465d7b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index b0ed545781..38ec8a0aff 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index 42f98ed03d..41cb8e30bf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 000898a4be..9a7aaa8e96 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 380b49f99c..c3dd2ad046 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 82db5e6137..cc303bf7b9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index b6ff688ec3..628447ce35 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index b41290f8b0..f03c986c22 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 88a033e61f..c440604aae 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index c1b9b96044..a01eaf8a12 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index f59f7727a3..0d6698f2ef 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index 7d3744ed92..f1b23be48f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index 3fd4ccdab2..0672cd5b7b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index ba21b50be4..b25ae1e82e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 46f9fa2bbb..bb1918eba6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -188,7 +188,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index c3ad326589..16e0fd5a31 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index fd9eb43066..065bb4d35b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 40d61688f2..543bae6fa9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index b8c227d725..c7ba6056f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index 095d35e574..072943dc2c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index 8f99961198..222a1ef4fc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 96d522a016..8f4f7918ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index de2824dab4..f939067178 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 1d563241d8..93c442bd55 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index c87e52c537..471b18ef85 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index dccf5523e3..0f250a09b7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index 7ac4116d92..f52128483c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index 024f72705d..98daf3bab1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 4e0233331b..64e7a9046b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 32d46ce8f3..6fdffef776 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 858486c725..3ac3825759 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index f65d750926..280ec8c25f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index 2e71ef503d..560f66f9c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index 42533bcd21..c0543529c3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index b5df169417..04eb2824b9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index 0ea17919a9..f400432915 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index a33248bc00..ab176b441a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 4ba21a25cd..c3895a0ac1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -133,6 +133,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index a7a570418e..a0fe598ab9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -171,7 +171,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 763bc23113..55e0d7ef02 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 3c50a3d7f2..38fbff5e4a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index ac78bdafad..5ea61d118d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 275282d9d2..929f48df23 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 0e31e6058b..2e6d59337f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index aacd0b1791..11dca17c6d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index c236548663..4e3e258430 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 6b9c0290aa..fb9166316f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 0d7b2211e6..278429af6f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index d080ad6aed..87b7f6797a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index fcb0a109da..98bf96fa0c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 1d0e22abd0..935a69ab2f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 653c9f547b..c9d4158d1c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index cdbaf82cf6..9953102ff9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index 230c5e9034..2617f5a95f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 511456e740..e9f6ef45aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -133,6 +133,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 4a3492ebd6..ecdbf48157 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -171,7 +171,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 2dff7a6de4..2e0b6bac24 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index 7efa29be77..1e93d1118a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -97,7 +97,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 0ca8e0b52c..bfd36012a7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index f754fa1da8..5ad5990d7e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -82,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "add_loss"
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index c9516b8f07..40d03369a5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -82,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "add_loss"
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 850ecff974..86666b51bb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 7c69e31f9a..238d96cca6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index fba42642d7..85f23df671 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 9c277411ea..235806b965 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 7c2f6ccc8a..4a45bf7997 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 802178dba6..fda2562fc8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index e870dfe9ad..71d2d09a8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index c1337ce0cb..12949b39a6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index ed27a62765..ab16d0021e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index b9f05cb3e5..61ccbf5962 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 336d9f76fb..ce2320d703 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index 46282217e0..69848af8cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index 42cd7e87ee..2b6e8af11d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -102,7 +102,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index 4d3de58bd1..413f45f018 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index 9f094a877a..9c61ff6027 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index 2f519a2438..baa91804c4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 6b93116ba0..15a5d6ac9e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index fd17115e27..be43bd5b3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 4b37a94478..6105992c7a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 5bdadca74a..1b6cf1e9ec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 9dfda96fc8..29488a37f8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -133,6 +133,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 7b7684ccd2..182efb83b8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -159,7 +159,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index 3b15407fca..d29731ecf9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 6d04415267..a6d7494ca7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 04950654d5..c36e802693 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index c424e6dcc8..9c46cfe40f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 6718e36dc6..8982f78794 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -106,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -141,6 +141,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index 740a03367b..ec2cc50298 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index a08c583adb..d7bc1980f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index c1294fed0f..fec2de6b49 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -103,7 +103,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index dc401d3ed0..3d285e7f17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index 4b5165ae97..40a56a0c94 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 789af15fea..728eca415a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index 0536a7cee7..da64e77c39 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -102,7 +102,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 8915353ec3..2f505f9293 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 6efb5ef15a..f82c77072e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 4c33c5d0bf..54e01a9917 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index 56914e1746..472b9818df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -119,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index acfb3521c0..937516eff1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -124,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt
index 8ba0e7480b..7ad4a32d43 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.pbtxt
@@ -9,6 +9,10 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member_method {
+    name: "clone_model"
+    argspec: "args=[\'model\', \'input_tensors\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "load_model"
     argspec: "args=[\'filepath\', \'custom_objects\', \'compile\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index e606eab919..88b8f37c4f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -152,6 +152,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 5deb02d569..a4483fefa2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -152,6 +152,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 8a63b49180..381c4975d7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -151,6 +151,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index db1aae2757..912365a28b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -155,6 +155,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 32fa151a8e..a4bb3219c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -152,6 +152,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 30c6c2ce3b..715bfd5fc7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -152,6 +152,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 72b40cc9f7..b66c0f89cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -151,6 +151,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index a5c2b4aefd..faeb4f3513 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -150,6 +150,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 61d5f04b22..caa2e60080 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -151,6 +151,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 4f19627691..f710524031 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -785,6 +785,14 @@ tf_module {
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "batch_gather"
+    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "batch_scatter_update"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
     name: "batch_to_space"
     argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -997,10 +1005,18 @@ tf_module {
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "disable_resource_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "div"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "div_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -1025,6 +1041,10 @@ tf_module {
     argspec: "args=[\'config\', \'device_policy\', \'execution_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
+    name: "enable_resource_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
index bbfe395031..ba9e651b34 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
@@ -8,4 +8,12 @@ tf_module {
     name: "cross_hashed"
     argspec: "args=[\'inputs\', \'num_buckets\', \'hash_key\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "expand_dims"
+    argspec: "args=[\'sp_input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "eye"
+    argspec: "args=[\'num_rows\', \'num_columns\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index 9a831fed26..018be7b9f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -5,6 +5,10 @@ tf_module {
     argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
   }
   member_method {
+    name: "length"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "regex_full_match"
     argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
index ef9fe096a1..eb41deee13 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
@@ -14,5 +14,11 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_BOOL
     }
+    field {
+      name: "executor_type"
+      number: 3
+      label: LABEL_OPTIONAL
+      type: TYPE_STRING
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
index eeef15515d..e565b903d2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
@@ -137,6 +137,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_BOOL
       }
+      field {
+        name: "executor_type"
+        number: 3
+        label: LABEL_OPTIONAL
+        type: TYPE_STRING
+      }
     }
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
index eac236d498..3add49e90d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
@@ -24,6 +24,10 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "values"
     mtype: "<type \'property\'>"
   }
@@ -32,6 +36,10 @@ tf_class {
     argspec: "args=[\'self\', \'indices\', \'values\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
     name: "eval"
     argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
index e841c4ad89..05698b03ee 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
@@ -53,15 +53,15 @@ tf_class {
   }
   member_method {
     name: "assign"
-    argspec: "args=[\'self\', \'value\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'self\', \'value\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
   member_method {
     name: "assign_add"
-    argspec: "args=[\'self\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
   member_method {
     name: "assign_sub"
-    argspec: "args=[\'self\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
   member_method {
     name: "count_up_to"
@@ -92,8 +92,28 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "scatter_add"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_add"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_sub"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_update"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "scatter_sub"
-    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "set_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
index 1f9aeb6ad6..4f0147a523 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.data.Iterator"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "initializer"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index 9dbb5d16a4..c23b04b4ef 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 34a30c2874..6878d28fff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt
index 5aa4b3d4fb..269e18a0a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt
@@ -11,6 +11,10 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
+    name: "eval_distribute"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "evaluation_master"
     mtype: "<type \'property\'>"
   }
@@ -92,7 +96,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\', \'protocol\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\', \'device_fn\', \'protocol\', \'eval_distribute\', \'experimental_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "replace"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
index 6ec3aba775..5c46dc5ee7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -125,6 +125,10 @@ tf_module {
     argspec: "args=[\'overlaps\', \'scores\', \'max_output_size\', \'overlap_threshold\', \'score_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'None\'], "
   }
   member_method {
+    name: "non_max_suppression_padded"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'pad_to_max_output_size\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'False\', \'None\'], "
+  }
+  member_method {
     name: "pad_to_bounding_box"
     argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index 40e82b18b6..d843194ef0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -119,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index 65cfad77d1..b8e9baca71 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -124,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -267,10 +267,6 @@ tf_class {
     argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "symbolic_set_inputs"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
     name: "test_on_batch"
     argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
index 2cd83baf65..2e9de9ebb2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
@@ -22,7 +22,7 @@ tf_module {
   }
   member_method {
     name: "relu"
-    argspec: "args=[\'x\', \'alpha\', \'max_value\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+    argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
   }
   member_method {
     name: "selu"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.densenet.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.densenet.pbtxt
deleted file mode 100644
index 42cb914450..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.densenet.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.keras.applications.densenet"
-tf_module {
-  member_method {
-    name: "DenseNet121"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "DenseNet169"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "DenseNet201"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.inception_resnet_v2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.inception_resnet_v2.pbtxt
deleted file mode 100644
index 211080c19b..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.inception_resnet_v2.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.inception_resnet_v2"
-tf_module {
-  member_method {
-    name: "InceptionResNetV2"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.inception_v3.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.inception_v3.pbtxt
deleted file mode 100644
index b67cee80ab..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.inception_v3.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.inception_v3"
-tf_module {
-  member_method {
-    name: "InceptionV3"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.mobilenet.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.mobilenet.pbtxt
deleted file mode 100644
index ef774e1dd7..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.mobilenet.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.mobilenet"
-tf_module {
-  member_method {
-    name: "MobileNet"
-    argspec: "args=[\'input_shape\', \'alpha\', \'depth_multiplier\', \'dropout\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'1\', \'0.001\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.nasnet.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.nasnet.pbtxt
deleted file mode 100644
index cd75b87540..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.nasnet.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.applications.nasnet"
-tf_module {
-  member_method {
-    name: "NASNetLarge"
-    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "NASNetMobile"
-    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.pbtxt
deleted file mode 100644
index 9fc086eb8e..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.pbtxt
+++ /dev/null
@@ -1,87 +0,0 @@
-path: "tensorflow.keras.applications"
-tf_module {
-  member {
-    name: "densenet"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "inception_resnet_v2"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "inception_v3"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "mobilenet"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "nasnet"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "resnet50"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "vgg16"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "vgg19"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "xception"
-    mtype: "<type \'module\'>"
-  }
-  member_method {
-    name: "DenseNet121"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "DenseNet169"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "DenseNet201"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "InceptionResNetV2"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "InceptionV3"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "MobileNet"
-    argspec: "args=[\'input_shape\', \'alpha\', \'depth_multiplier\', \'dropout\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'1\', \'0.001\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "NASNetLarge"
-    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "NASNetMobile"
-    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "ResNet50"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "VGG16"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "VGG19"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "Xception"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.resnet50.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.resnet50.pbtxt
deleted file mode 100644
index 7385af064d..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.resnet50.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.resnet50"
-tf_module {
-  member_method {
-    name: "ResNet50"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.vgg16.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.vgg16.pbtxt
deleted file mode 100644
index ba66fba8f3..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.vgg16.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.vgg16"
-tf_module {
-  member_method {
-    name: "VGG16"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.vgg19.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.vgg19.pbtxt
deleted file mode 100644
index e55a1345b6..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.vgg19.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.vgg19"
-tf_module {
-  member_method {
-    name: "VGG19"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.xception.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.xception.pbtxt
deleted file mode 100644
index 59dd2108f2..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.applications.xception.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.applications.xception"
-tf_module {
-  member_method {
-    name: "Xception"
-    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
-  }
-  member_method {
-    name: "decode_predictions"
-    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index fddac63b78..126ce8db6a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -366,7 +366,7 @@ tf_module {
   }
   member_method {
     name: "relu"
-    argspec: "args=[\'x\', \'alpha\', \'max_value\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+    argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
   }
   member_method {
     name: "repeat"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index 86e328888e..5510465d7b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index b0ed545781..38ec8a0aff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index 42f98ed03d..41cb8e30bf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 000898a4be..9a7aaa8e96 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 380b49f99c..c3dd2ad046 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 82db5e6137..cc303bf7b9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index b6ff688ec3..628447ce35 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index b41290f8b0..f03c986c22 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 88a033e61f..c440604aae 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index c1b9b96044..a01eaf8a12 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index f59f7727a3..0d6698f2ef 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index 7d3744ed92..f1b23be48f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index 3fd4ccdab2..0672cd5b7b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -107,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index ba21b50be4..b25ae1e82e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 46f9fa2bbb..bb1918eba6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -188,7 +188,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index c3ad326589..16e0fd5a31 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index fd9eb43066..065bb4d35b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 40d61688f2..543bae6fa9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index b8c227d725..c7ba6056f9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index 095d35e574..072943dc2c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index 8f99961198..222a1ef4fc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 96d522a016..8f4f7918ab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index de2824dab4..f939067178 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 1d563241d8..93c442bd55 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index c87e52c537..471b18ef85 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index dccf5523e3..0f250a09b7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index 7ac4116d92..f52128483c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index 024f72705d..98daf3bab1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 4e0233331b..64e7a9046b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 32d46ce8f3..6fdffef776 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -108,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 858486c725..3ac3825759 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index f65d750926..280ec8c25f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index 2e71ef503d..560f66f9c7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index 42533bcd21..c0543529c3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index b5df169417..04eb2824b9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index 0ea17919a9..f400432915 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index a33248bc00..ab176b441a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 4ba21a25cd..c3895a0ac1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -133,6 +133,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index a7a570418e..a0fe598ab9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -171,7 +171,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 763bc23113..55e0d7ef02 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 3c50a3d7f2..38fbff5e4a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index ac78bdafad..5ea61d118d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 275282d9d2..929f48df23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 0e31e6058b..2e6d59337f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index aacd0b1791..11dca17c6d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index c236548663..4e3e258430 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 6b9c0290aa..fb9166316f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 0d7b2211e6..278429af6f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index d080ad6aed..87b7f6797a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index fcb0a109da..98bf96fa0c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 1d0e22abd0..935a69ab2f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 653c9f547b..c9d4158d1c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index cdbaf82cf6..9953102ff9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index 230c5e9034..2617f5a95f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 511456e740..e9f6ef45aa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -133,6 +133,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 4a3492ebd6..ecdbf48157 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -171,7 +171,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 5d05cf689f..2e0b6bac24 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index 7efa29be77..1e93d1118a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -97,7 +97,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 0ca8e0b52c..bfd36012a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index f754fa1da8..5ad5990d7e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -82,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "add_loss"
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index c9516b8f07..40d03369a5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -82,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1\'], "
   }
   member_method {
     name: "add_loss"
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 850ecff974..86666b51bb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 7c69e31f9a..238d96cca6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index fba42642d7..85f23df671 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 9c277411ea..235806b965 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 7c2f6ccc8a..4a45bf7997 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 802178dba6..fda2562fc8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index e870dfe9ad..71d2d09a8d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index c1337ce0cb..12949b39a6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index ed27a62765..ab16d0021e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index b9f05cb3e5..61ccbf5962 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 336d9f76fb..ce2320d703 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index 46282217e0..69848af8cf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index 42cd7e87ee..2b6e8af11d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -102,7 +102,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index c00fa79adf..413f45f018 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -82,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'max_value\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'max_value\', \'negative_slope\', \'threshold\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'0\', \'0\'], "
   }
   member_method {
     name: "add_loss"
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index 9f094a877a..9c61ff6027 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index 2f519a2438..baa91804c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 6b93116ba0..15a5d6ac9e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index fd17115e27..be43bd5b3c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 4b37a94478..6105992c7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 5bdadca74a..1b6cf1e9ec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -100,7 +100,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 9dfda96fc8..29488a37f8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -133,6 +133,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 7b7684ccd2..182efb83b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -159,7 +159,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index 3b15407fca..d29731ecf9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 6d04415267..a6d7494ca7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 04950654d5..c36e802693 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index c424e6dcc8..9c46cfe40f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 1160d2840f..8982f78794 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -61,6 +61,10 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
     name: "state_size"
     mtype: "<type \'property\'>"
   }
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -137,6 +141,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index 740a03367b..ec2cc50298 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -99,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index a08c583adb..d7bc1980f3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index c1294fed0f..fec2de6b49 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -103,7 +103,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index dc401d3ed0..3d285e7f17 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index 4b5165ae97..40a56a0c94 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 789af15fea..728eca415a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index 0536a7cee7..da64e77c39 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -102,7 +102,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 8915353ec3..2f505f9293 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 6efb5ef15a..f82c77072e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 4c33c5d0bf..54e01a9917 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -98,7 +98,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index 85f7c2bfed..472b9818df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -119,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -135,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index 6a83129f7d..937516eff1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -124,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -140,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -267,10 +267,6 @@ tf_class {
     argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "symbolic_set_inputs"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
     name: "test_on_batch"
     argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt
index 8ba0e7480b..7ad4a32d43 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.pbtxt
@@ -9,6 +9,10 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member_method {
+    name: "clone_model"
+    argspec: "args=[\'model\', \'input_tensors\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "load_model"
     argspec: "args=[\'filepath\', \'custom_objects\', \'compile\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
deleted file mode 100644
index dddace87dc..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.keras.preprocessing.image.DirectoryIterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.DirectoryIterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'subset\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\', \'None\', \'nearest\'], "
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "on_epoch_end"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
deleted file mode 100644
index c1e2e94f0b..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
+++ /dev/null
@@ -1,29 +0,0 @@
-path: "tensorflow.keras.preprocessing.image.ImageDataGenerator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.ImageDataGenerator\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'featurewise_center\', \'samplewise_center\', \'featurewise_std_normalization\', \'samplewise_std_normalization\', \'zca_whitening\', \'zca_epsilon\', \'rotation_range\', \'width_shift_range\', \'height_shift_range\', \'brightness_range\', \'shear_range\', \'zoom_range\', \'channel_shift_range\', \'fill_mode\', \'cval\', \'horizontal_flip\', \'vertical_flip\', \'rescale\', \'preprocessing_function\', \'data_format\', \'validation_split\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'1e-06\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'0.0\', \'0.0\', \'0.0\', \'nearest\', \'0.0\', \'False\', \'False\', \'None\', \'None\', \'None\', \'0.0\'], "
-  }
-  member_method {
-    name: "fit"
-    argspec: "args=[\'self\', \'x\', \'augment\', \'rounds\', \'seed\'], varargs=None, keywords=None, defaults=[\'False\', \'1\', \'None\'], "
-  }
-  member_method {
-    name: "flow"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'subset\'], varargs=None, keywords=None, defaults=[\'None\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'None\'], "
-  }
-  member_method {
-    name: "flow_from_directory"
-    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'subset\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\', \'None\', \'nearest\'], "
-  }
-  member_method {
-    name: "random_transform"
-    argspec: "args=[\'self\', \'x\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "standardize"
-    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-iterator.pbtxt
deleted file mode 100644
index 825d9f1d1d..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-iterator.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.keras.preprocessing.image.Iterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'n\', \'batch_size\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "on_epoch_end"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
deleted file mode 100644
index 75924a254a..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.keras.preprocessing.image.NumpyArrayIterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.NumpyArrayIterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.image.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'x\', \'y\', \'image_data_generator\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'subset\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'None\', \'None\', \'\', \'png\', \'None\'], "
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "on_epoch_end"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.pbtxt
deleted file mode 100644
index 6b850dd6b7..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.pbtxt
+++ /dev/null
@@ -1,63 +0,0 @@
-path: "tensorflow.keras.preprocessing.image"
-tf_module {
-  member {
-    name: "DirectoryIterator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ImageDataGenerator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Iterator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "NumpyArrayIterator"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "apply_transform"
-    argspec: "args=[\'x\', \'transform_matrix\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'0\', \'nearest\', \'0.0\'], "
-  }
-  member_method {
-    name: "array_to_img"
-    argspec: "args=[\'x\', \'data_format\', \'scale\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
-  }
-  member_method {
-    name: "flip_axis"
-    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "img_to_array"
-    argspec: "args=[\'img\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "load_img"
-    argspec: "args=[\'path\', \'grayscale\', \'target_size\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'nearest\'], "
-  }
-  member_method {
-    name: "random_brightness"
-    argspec: "args=[\'x\', \'brightness_range\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "random_channel_shift"
-    argspec: "args=[\'x\', \'intensity\', \'channel_axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
-  }
-  member_method {
-    name: "random_rotation"
-    argspec: "args=[\'x\', \'rg\', \'row_axis\', \'col_axis\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'1\', \'2\', \'0\', \'nearest\', \'0.0\'], "
-  }
-  member_method {
-    name: "random_shear"
-    argspec: "args=[\'x\', \'intensity\', \'row_axis\', \'col_axis\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'1\', \'2\', \'0\', \'nearest\', \'0.0\'], "
-  }
-  member_method {
-    name: "random_shift"
-    argspec: "args=[\'x\', \'wrg\', \'hrg\', \'row_axis\', \'col_axis\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'1\', \'2\', \'0\', \'nearest\', \'0.0\'], "
-  }
-  member_method {
-    name: "random_zoom"
-    argspec: "args=[\'x\', \'zoom_range\', \'row_axis\', \'col_axis\', \'channel_axis\', \'fill_mode\', \'cval\'], varargs=None, keywords=None, defaults=[\'1\', \'2\', \'0\', \'nearest\', \'0.0\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.pbtxt
deleted file mode 100644
index 5a78581fc5..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-path: "tensorflow.keras.preprocessing"
-tf_module {
-  member {
-    name: "image"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "sequence"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "text"
-    mtype: "<type \'module\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
deleted file mode 100644
index 326b1fa4fd..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.sequence.-timeseries-generator.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.keras.preprocessing.sequence.TimeseriesGenerator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.sequence.TimeseriesGenerator\'>"
-  is_instance: "<class \'tensorflow.python.keras.utils.data_utils.Sequence\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'data\', \'targets\', \'length\', \'sampling_rate\', \'stride\', \'start_index\', \'end_index\', \'shuffle\', \'reverse\', \'batch_size\'], varargs=None, keywords=None, defaults=[\'1\', \'1\', \'0\', \'None\', \'False\', \'False\', \'128\'], "
-  }
-  member_method {
-    name: "on_epoch_end"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.sequence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.sequence.pbtxt
deleted file mode 100644
index cf59f8a272..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.sequence.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.preprocessing.sequence"
-tf_module {
-  member {
-    name: "TimeseriesGenerator"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "make_sampling_table"
-    argspec: "args=[\'size\', \'sampling_factor\'], varargs=None, keywords=None, defaults=[\'1e-05\'], "
-  }
-  member_method {
-    name: "pad_sequences"
-    argspec: "args=[\'sequences\', \'maxlen\', \'dtype\', \'padding\', \'truncating\', \'value\'], varargs=None, keywords=None, defaults=[\'None\', \'int32\', \'pre\', \'pre\', \'0.0\'], "
-  }
-  member_method {
-    name: "skipgrams"
-    argspec: "args=[\'sequence\', \'vocabulary_size\', \'window_size\', \'negative_samples\', \'shuffle\', \'categorical\', \'sampling_table\', \'seed\'], varargs=None, keywords=None, defaults=[\'4\', \'1.0\', \'True\', \'False\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
deleted file mode 100644
index b42b12b6c0..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
+++ /dev/null
@@ -1,33 +0,0 @@
-path: "tensorflow.keras.preprocessing.text.Tokenizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.preprocessing.text.Tokenizer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'num_words\', \'filters\', \'lower\', \'split\', \'char_level\', \'oov_token\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "fit_on_sequences"
-    argspec: "args=[\'self\', \'sequences\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "fit_on_texts"
-    argspec: "args=[\'self\', \'texts\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "sequences_to_matrix"
-    argspec: "args=[\'self\', \'sequences\', \'mode\'], varargs=None, keywords=None, defaults=[\'binary\'], "
-  }
-  member_method {
-    name: "texts_to_matrix"
-    argspec: "args=[\'self\', \'texts\', \'mode\'], varargs=None, keywords=None, defaults=[\'binary\'], "
-  }
-  member_method {
-    name: "texts_to_sequences"
-    argspec: "args=[\'self\', \'texts\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "texts_to_sequences_generator"
-    argspec: "args=[\'self\', \'texts\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.text.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.text.pbtxt
deleted file mode 100644
index 50b54fc7e1..0000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.text.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.preprocessing.text"
-tf_module {
-  member {
-    name: "Tokenizer"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "hashing_trick"
-    argspec: "args=[\'text\', \'n\', \'hash_function\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
-  }
-  member_method {
-    name: "one_hot"
-    argspec: "args=[\'text\', \'n\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
-  }
-  member_method {
-    name: "text_to_word_sequence"
-    argspec: "args=[\'text\', \'filters\', \'lower\', \'split\'], varargs=None, keywords=None, defaults=[\'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index c74773000a..88b8f37c4f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -101,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -152,6 +152,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index d251f54806..a4483fefa2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -101,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -152,6 +152,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 8a63b49180..381c4975d7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -151,6 +151,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index db1aae2757..912365a28b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -155,6 +155,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index d76eab7eb8..a4bb3219c7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -101,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -152,6 +152,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 944db6ac93..715bfd5fc7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -101,7 +101,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -125,7 +125,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -152,6 +152,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 72b40cc9f7..b66c0f89cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -151,6 +151,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index a5c2b4aefd..faeb4f3513 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -150,6 +150,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 61d5f04b22..caa2e60080 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -151,6 +151,10 @@ tf_class {
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 5eb42b4db3..c8114c431a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -761,18 +761,6 @@ tf_module {
     argspec: "args=[\'var_list\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "assign"
-    argspec: "args=[\'ref\', \'value\', \'validate_shape\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assign_add"
-    argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "assign_sub"
-    argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
     name: "atan"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -785,6 +773,14 @@ tf_module {
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "batch_gather"
+    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "batch_scatter_update"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
     name: "batch_to_space"
     argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -921,10 +917,6 @@ tf_module {
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "count_up_to"
-    argspec: "args=[\'ref\', \'limit\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
     name: "create_partitioned_variables"
     argspec: "args=[\'shape\', \'slicing\', \'initializer\', \'dtype\', \'trainable\', \'collections\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'True\', \'None\', \'None\', \'None\'], "
   }
@@ -1001,6 +993,10 @@ tf_module {
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
+    name: "div_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -1725,10 +1721,6 @@ tf_module {
     argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'True\', \'False\', \'None\'], "
   }
   member_method {
-    name: "scatter_add"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
     name: "scatter_div"
     argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
@@ -1749,26 +1741,6 @@ tf_module {
     argspec: "args=[\'indices\', \'updates\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "scatter_nd_add"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_nd_sub"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_nd_update"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_sub"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "scatter_update"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
     name: "segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
@@ -1902,19 +1874,19 @@ tf_module {
   }
   member_method {
     name: "sparse_reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_reorder"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
index bbfe395031..ba9e651b34 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
@@ -8,4 +8,12 @@ tf_module {
     name: "cross_hashed"
     argspec: "args=[\'inputs\', \'num_buckets\', \'hash_key\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "expand_dims"
+    argspec: "args=[\'sp_input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "eye"
+    argspec: "args=[\'num_rows\', \'num_columns\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index 9a831fed26..018be7b9f9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -5,6 +5,10 @@ tf_module {
     argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
   }
   member_method {
+    name: "length"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
     name: "regex_full_match"
     argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 871ebb5247..7ed9cd77a0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -50,7 +50,7 @@ tf_module {
   }
   member_method {
     name: "merge_all"
-    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\'], "
+    argspec: "args=[\'key\', \'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\', \'None\'], "
   }
   member_method {
     name: "scalar"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
index 2d067e4eff..5be37200f3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
@@ -20,4 +20,8 @@ tf_class {
     name: "save"
     argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_prefix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index b0fb04d7d4..9f35395284 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -298,7 +298,7 @@ tf_module {
   }
   member_method {
     name: "generate_checkpoint_state_proto"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "get_checkpoint_mtimes"
@@ -446,7 +446,7 @@ tf_module {
   }
   member_method {
     name: "update_checkpoint_state"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "warm_start"
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index b65dbc4b7d..43d19bc99c 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -79,7 +79,7 @@ def _KeyToFilePath(key, api_version):
   case_insensitive_key = re.sub('([A-Z]{1})', _ReplaceCapsWithDash, key)
   api_folder = (
       _API_GOLDEN_FOLDER_V2 if api_version == 2 else _API_GOLDEN_FOLDER_V1)
-  return os.path.join(_API_GOLDEN_FOLDER_V1, '%s.pbtxt' % case_insensitive_key)
+  return os.path.join(api_folder, '%s.pbtxt' % case_insensitive_key)
 
 
 def _FileNameToKey(filename):
@@ -315,7 +315,7 @@ class ApiCompatibilityTest(test.TestCase):
   def testAPIBackwardsCompatibilityV2(self):
     if not hasattr(tf.compat, 'v2'):
       return
-    api_version = 1
+    api_version = 2
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake
index 4587bcf891..b7450c83de 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cmake
+++ b/tensorflow/tools/ci_build/Dockerfile.cmake
@@ -28,8 +28,8 @@ RUN pip install --upgrade astor
 RUN pip install --upgrade gast
 RUN pip install --upgrade numpy
 RUN pip install --upgrade termcolor
-RUN pip install keras_applications==1.0.4
-RUN pip install keras_preprocessing==1.0.2
+RUN pip install keras_applications==1.0.5
+RUN pip install keras_preprocessing==1.0.3
 
 # Install golang
 RUN apt-get install -t xenial-backports -y golang-1.9
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 866fe95d2b..a98c15d961 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -99,6 +99,7 @@ do_pylint() {
 "^tensorflow/contrib/layers/python/layers/feature_column\.py.*\[E0110.*abstract-class-instantiated "\
 "^tensorflow/contrib/eager/python/evaluator\.py.*\[E0202.*method-hidden "\
 "^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\
+"^tensorflow/contrib/rate/rate\.py.*\[E0202.*method-hidden "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\
 "^tensorflow/python/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
 "^tensorflow/python/keras/engine/base_layer.py.*\[E0203.*access-member-before-definition "\
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index bb316ecfc9..af478eded4 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -115,10 +115,10 @@ pip2 install --upgrade setuptools==39.1.0
 pip3 install --upgrade setuptools==39.1.0
 
 # Keras
-pip2 install keras_applications==1.0.4 --no-deps
-pip3 install keras_applications==1.0.4 --no-deps
-pip2 install keras_preprocessing==1.0.2 --no-deps
-pip3 install keras_preprocessing==1.0.2 --no-deps
+pip2 install keras_applications==1.0.5 --no-deps
+pip3 install keras_applications==1.0.5 --no-deps
+pip2 install keras_preprocessing==1.0.3 --no-deps
+pip3 install keras_preprocessing==1.0.3 --no-deps
 
 # Install last working version of setuptools.
 pip2 install --upgrade setuptools==39.1.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 15e4396ce3..93ea0c3db6 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -85,8 +85,8 @@ pip3.5 install --upgrade termcolor
 pip3.5 install --upgrade setuptools==39.1.0
 
 # Keras
-pip3.5 install keras_applications==1.0.4
-pip3.5 install keras_preprocessing==1.0.2
+pip3.5 install keras_applications==1.0.5
+pip3.5 install keras_preprocessing==1.0.3
 
 # Install last working version of setuptools.
 pip3.5 install --upgrade setuptools==39.1.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 0fc3eee71c..7a9eef7c64 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -101,7 +101,7 @@ pip3 install --upgrade termcolor
 pip3 install --upgrade setuptools==39.1.0
 
 # Keras
-pip3 install keras_applications==1.0.4
-pip3 install keras_preprocessing==1.0.2
+pip3 install keras_applications==1.0.5
+pip3 install keras_preprocessing==1.0.3
 
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
index a1d91a6123..b497326d98 100755
--- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
+++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
@@ -57,6 +57,17 @@ TF_DOCKER_BUILD_TYPE="MKL" \
   TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
   ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
 
+# build the python3.6 container and whl
+TF_DOCKER_BUILD_TYPE="MKL" \
+  TF_DOCKER_BUILD_IS_DEVEL="YES" \
+  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
+  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
+  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}" \
+  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3.6" \
+  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
+  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+
+
 # Build containers for AVX2
 # Include the instructions for haswell and later, but tune for broadwell
 TF_BAZEL_BUILD_OPTIONS="--config=mkl --copt=-march=haswell --copt=-mtune=broadwell --copt=-O3 --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
@@ -80,3 +91,13 @@ TF_DOCKER_BUILD_TYPE="MKL" \
   TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
   ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
 
+# build the python3.6 container and whl
+TF_DOCKER_BUILD_TYPE="MKL" \
+  TF_DOCKER_BUILD_IS_DEVEL="YES" \
+  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
+  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
+  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}-avx2" \
+  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3.6" \
+  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
+  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 0482cf619a..27b350e13e 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -27,7 +27,7 @@ function run_configure_for_gpu_build {
 }
 
 function set_remote_cache_options {
-  echo "build --remote_instance_name=projects/tensorflow-testing-cpu" >> "${TMP_BAZELRC}"
+  echo "build --remote_instance_name=projects/tensorflow-testing/instances/default_instance" >> "${TMP_BAZELRC}"
   echo "build --experimental_remote_platform_override='properties:{name:\"build\" value:\"windows-x64\"}'" >> "${TMP_BAZELRC}"
   echo "build --remote_cache=remotebuildexecution.googleapis.com" >> "${TMP_BAZELRC}"
   echo "build --tls_enabled=true" >> "${TMP_BAZELRC}"
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 47e0e5dd59..177ef390db 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -57,8 +57,7 @@ PY_TEST_DIR="py_test_dir"
 
 SKIP_TEST=0
 RELEASE_BUILD=0
-TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \
-    //${PY_TEST_DIR}/tensorflow/contrib/... "
+TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -68,6 +67,7 @@ TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \
 # --test_contrib_only    Use tensorflow/contrib/... as test target
 for ARG in "$@"; do
   case "$ARG" in
+    --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
     --enable_remote_cache) set_remote_cache_options ;;
     --release_build) RELEASE_BUILD=1 ;;
@@ -86,6 +86,11 @@ else
   export TF_OVERRIDE_EIGEN_STRONG_INLINE=1
 fi
 
+if [[ "$TF_NIGHTLY" == 1 ]]; then
+  python tensorflow/tools/ci_build/update_version.py --nightly
+  EXTRA_PIP_FLAG="--nightly_flag"
+fi
+
 # Enable short object file path to avoid long path issue on Windows.
 echo "startup --output_user_root=${TMPDIR}" >> "${TMP_BAZELRC}"
 
@@ -104,7 +109,11 @@ fi
 # Create a python test directory to avoid package name conflict
 create_python_test_dir "${PY_TEST_DIR}"
 
-./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}"
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" "${EXTRA_PIP_FLAG}"
+
+if [[ "$TF_NIGHTLY" == 1 ]]; then
+  exit 0
+fi
 
 # Running python tests on Windows needs pip package installed
 PIP_NAME=$(ls ${PY_TEST_DIR}/tensorflow-*.whl)
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index e3eee11080..28d5565b98 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -57,8 +57,7 @@ PY_TEST_DIR="py_test_dir"
 
 SKIP_TEST=0
 RELEASE_BUILD=0
-TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \
-  //${PY_TEST_DIR}/tensorflow/contrib/... "
+TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -68,6 +67,7 @@ TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \
 # --test_contrib_only    Use tensorflow/contrib/... as test target
 for ARG in "$@"; do
   case "$ARG" in
+    --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
     --enable_remote_cache) set_remote_cache_options ;;
     --release_build) RELEASE_BUILD=1 ;;
@@ -86,6 +86,11 @@ else
   export TF_OVERRIDE_EIGEN_STRONG_INLINE=1
 fi
 
+if [[ "$TF_NIGHTLY" == 1 ]]; then
+  python tensorflow/tools/ci_build/update_version.py --nightly
+  EXTRA_PIP_FLAG="--nightly_flag"
+fi
+
 # Enable short object file path to avoid long path issue on Windows.
 echo "startup --output_user_root=${TMPDIR}" >> "${TMP_BAZELRC}"
 
@@ -107,10 +112,14 @@ fi
 # Create a python test directory to avoid package name conflict
 create_python_test_dir "${PY_TEST_DIR}"
 
-./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}"
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" --gpu "${EXTRA_PIP_FLAG}"
+
+if [[ "$TF_NIGHTLY" == 1 ]]; then
+  exit 0
+fi
 
 # Running python tests on Windows needs pip package installed
-PIP_NAME=$(ls ${PY_TEST_DIR}/tensorflow-*.whl)
+PIP_NAME=$(ls ${PY_TEST_DIR}/tensorflow_gpu-*.whl)
 reinstall_tensorflow_pip ${PIP_NAME}
 
 TF_GPU_COUNT=${TF_GPU_COUNT:-8}
diff --git a/tensorflow/tools/common/public_api.py b/tensorflow/tools/common/public_api.py
index 09933d266b..82bb0713c4 100644
--- a/tensorflow/tools/common/public_api.py
+++ b/tensorflow/tools/common/public_api.py
@@ -102,9 +102,10 @@ class PublicAPIVisitor(object):
     """Override the default root name of 'tf'."""
     self._root_name = root_name
 
-  def _is_private(self, path, name):
+  def _is_private(self, path, name, obj=None):
     """Return whether a name is private."""
     # TODO(wicke): Find out what names to exclude.
+    del obj  # Unused.
     return ((path in self._private_map and
              name in self._private_map[path]) or
             (name.startswith('_') and not re.match('__.*__$', name) or
@@ -129,7 +130,7 @@ class PublicAPIVisitor(object):
 
     # Remove things that are not visible.
     for name, child in list(children):
-      if self._is_private(full_path, name):
+      if self._is_private(full_path, name, child):
         children.remove((name, child))
 
     self._visitor(path, parent, children)
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index bf06214009..0114ef9dbf 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -29,6 +29,8 @@ RUN pip --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         numpy==1.14.5 \
         pandas \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 0157ba892e..aec5ca965e 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -33,8 +33,8 @@ RUN pip --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications == 1.0.4 \
-        keras_preprocessing == 1.0.2 \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         mock \
         numpy==1.14.5 \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 2d46af65bc..ba421d9978 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -49,8 +49,8 @@ RUN pip --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications == 1.0.4 \
-        keras_preprocessing == 1.0.2 \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         mock \
         numpy==1.14.5 \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
index 7fe97a1214..eb139ec5f8 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
@@ -37,8 +37,8 @@ RUN pip --no-cache-dir install --upgrade \
 RUN pip --no-cache-dir install \
         ipykernel \
         jupyter \
-        keras_applications == 1.0.4 \
-        keras_preprocessing == 1.0.2 \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         numpy \
         scipy \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
index 2a1903c224..371451d2aa 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl
@@ -18,18 +18,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libhdf5-serial-dev \
         libpng12-dev \
         libzmq3-dev \
+        libssl-dev \
         pkg-config \
-        python-dev \
-        ${PYTHON3_DEV} \
         rsync \
         software-properties-common \
         unzip \
         zip \
         zlib1g-dev \
         openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        && \
-    apt-get clean && \
+        openjdk-8-jre-headless
+
+#install Python 3
+RUN if [ ${PYTHON} = "python3.6" ]; then \
+      curl https://www.python.org/ftp/python/3.6.5/Python-3.6.5.tar.xz -o /opt/python.tar.xz && \
+      cd /opt && tar xvf python.tar.xz && \
+      cd /opt/*/ && ./configure && \
+      make && make install; \
+    else \
+      apt-get install -y --no-install-recommends \
+        python-dev \
+        ${PYTHON3_DEV}; \
+    fi
+
+RUN    apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
 RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
@@ -41,8 +52,8 @@ RUN ${PIP} --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications == 1.0.4 \
-        keras_preprocessing == 1.0.2 \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         mock \
         numpy \
@@ -53,7 +64,9 @@ RUN ${PIP} --no-cache-dir install \
     ${PYTHON} -m ipykernel.kernelspec
 
 RUN if [ "${PYTHON}" = "python3" ]; then \
-  ln -s -f /usr/bin/python3 /usr/bin/python; \
+      ln -s -f /usr/bin/python3 /usr/bin/python; \
+  elif [ "${PYTHON}" = "python3.6" ]; then \
+      ln -s -f /usr/local/bin/python3.6 /usr/bin/python; \
   fi
 
 # Set up our notebook config.
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
index a87cbf472d..987b582d10 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
@@ -45,8 +45,8 @@ RUN ${PIP} --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications == 1.0.4 \
-        keras_preprocessing == 1.0.2 \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         mock \
         numpy \
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 7b1870c5b9..806b8836c7 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -37,8 +37,8 @@ RUN pip --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications == 1.0.4 \
-        keras_preprocessing == 1.0.2 \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         numpy==1.14.5 \
         pandas \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl
index f0bd9fcd7f..641c9e3b16 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl
+++ b/tensorflow/tools/docker/Dockerfile.mkl
@@ -38,8 +38,8 @@ RUN ${PIP} --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications == 1.0.4 \
-        keras_preprocessing == 1.0.2 \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         numpy \
         pandas \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod
index 91fa037bf6..2b11679f54 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.mkl-horovod
@@ -38,8 +38,8 @@ RUN ${PIP} --no-cache-dir install \
         h5py \
         ipykernel \
         jupyter \
-        keras_applications == 1.0.4 \
-        keras_preprocessing == 1.0.2 \
+        keras_applications==1.0.5 \
+        keras_preprocessing==1.0.3 \
         matplotlib \
         numpy \
         pandas \
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index a286e8a212..263f25bc48 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -1,3 +1,10 @@
+# WARNING: THESE IMAGES ARE DEPRECATED.
+
+TensorFlow's Dockerfiles are now located in
+[`tensorflow/tools/dockerfiles/`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles).
+
+This directory will eventually be removed.
+
 # Using TensorFlow via Docker
 
 This directory contains `Dockerfile`s to make it easy to get up and running with
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 04fc1659af..448a3a7647 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -197,6 +197,8 @@ if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python2" ]]; then
   :
 elif [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
   FINAL_TAG="${FINAL_TAG}-py3"
+elif [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]]; then
+  FINAL_TAG="${FINAL_TAG}-py3.6"
 else
   die "Unrecognized value in TF_DOCKER_BUILD_PYTHON_VERSION: "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION}"
@@ -377,15 +379,17 @@ else # TF_DOCKER_BUILD_IS_DEVEL == 'yes'
   fi
 
   # Modify python/pip version if necessary.
-  if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]]; then
-    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || \
-        [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
+  if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3" ]] || [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]]; then
+    if [[ ${TF_DOCKER_BUILD_TYPE} == "mkl" ]] || [[ ${TF_DOCKER_BUILD_TYPE} == "mkl-horovod" ]]; then
         TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON=${TF_DOCKER_BUILD_PYTHON_VERSION}")
         TF_DOCKER_BUILD_ARGS+=("--build-arg PYTHON3_DEV=python3-dev")
         TF_DOCKER_BUILD_ARGS+=("--build-arg WHL_DIR=/tmp/pip3")
         TF_DOCKER_BUILD_ARGS+=("--build-arg PIP=pip3")
         cp "${ORIG_DOCKERFILE}" "${DOCKERFILE}"
     else
+      if [[ "${TF_DOCKER_BUILD_PYTHON_VERSION}" == "python3.6" ]] && [[ "${TF_DOCKER_BUILD_TYPE}" != "mkl" ]]; then
+        die "Python 3.6 build only supported for MKL builds."
+      fi
       if sed -i -e 's/python-dev/python-dev python3-dev/g' "${DOCKERFILE}" && \
          sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
          sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md
new file mode 100644
index 0000000000..c484c162cb
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/README.md
@@ -0,0 +1,67 @@
+# TensorFlow Dockerfiles
+
+This directory houses TensorFlow's Dockerfiles. **DO NOT EDIT THE DOCKERFILES
+MANUALLY!** They are maintained by `assembler.py`, which builds Dockerfiles from
+the files in `partials/` and the rules in `spec.yml`. See [the Maintaining
+section](#maintaining) for more information.
+
+## Building
+
+The Dockerfiles in the `dockerfiles` directory must have their build context set
+to **the directory with this README.md** to copy in helper files. For example:
+
+```bash
+$ docker build -f ./dockerfiles/cpu.Dockerfile -t tf .
+```
+
+Each Dockerfile has its own set of available `--build-arg`s which are documented
+in the Dockerfile itself.
+
+## Running
+
+After building the image with the tag `tf` (for example), use `docker run` to
+run the images. Examples are below.
+
+Note for new Docker users: the `-v` and `-u` flags share directories between
+the Docker container and your machine, and very important. Without
+`-v`, your work will be wiped once the container quits, and without `-u`, files
+created by the container will have the wrong file permissions on your host
+machine. If you are confused, check out the [Docker run
+documentation](https://docs.docker.com/engine/reference/run/).
+
+```bash
+# Volume mount (-v) is optional but highly recommended, especially for Jupyter.
+# User permissions (-u) are required if you use (-v).
+
+# CPU-based images
+$ docker run -u $(id -u):$(id -g) -v $(PWD):/my-devel -it tf
+
+# GPU-based images (set up nvidia-docker2 first)
+$ docker run --runtime=nvidia -u $(id -u):$(id -g) -v $(PWD):/my-devel -it tf
+
+# Images with Jupyter run on port 8888, and needs a volume for notebooks
+$ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(PWD):/notebooks -it tf
+```
+
+These images do not come with the TensorFlow source code -- but the development
+images have git included, so you can `git clone` it yourself.
+
+## Contributing
+
+To make changes to TensorFlow's Dockerfiles, you'll update `spec.yml` and the
+`*.partial.Dockerfile` files in the `partials` directory, then run
+`assembler.py` to re-generate the full Dockerfiles before creating a pull
+request.
+
+You can use the `Dockerfile` in this directory to build an editing environment
+that has all of the Python dependencies you'll need:
+
+```bash
+$ docker build -t tf-assembler -f assembler.Dockerfile .
+
+# Set --user to set correct permissions on generated files
+$ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-assembler bash 
+
+# In the container...
+/tf $ python3 ./assembler.py -o dockerfiles -s spec.yml
+```
diff --git a/tensorflow/contrib/kfac/python/ops/op_queue_lib.py b/tensorflow/tools/dockerfiles/assembler.Dockerfile
index 09c9a4ab33..7a8e07fced 100644
--- a/tensorflow/contrib/kfac/python/ops/op_queue_lib.py
+++ b/tensorflow/tools/dockerfiles/assembler.Dockerfile
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Helper for choosing which op to run next in a distributed setting."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+#
+# TensorFlow Dockerfile Development Container
+#
+# You can use this image to quickly develop changes to the Dockerfile assembler
+# or set of TF Docker partials. See README.md for usage instructions.
+FROM debian:stretch
+LABEL maintainer="Austin Anderson <angerson@google.com>"
 
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.kfac.python.ops.op_queue import *
-from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
+RUN apt-get update && apt-get install -y python3 python3-pip bash
+RUN pip3 install --upgrade pip setuptools pyyaml absl-py cerberus
 
-_allowed_symbols = [
-    'OpQueue',
-]
+WORKDIR /tf
+VOLUME ["/tf"]
 
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
new file mode 100644
index 0000000000..9cdd9bb0cb
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -0,0 +1,554 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Assemble common TF Dockerfiles from many parts.
+
+This script constructs TF's Dockerfiles by aggregating partial
+Dockerfiles. See README.md for usage examples.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import errno
+import os
+import os.path
+import re
+import shutil
+import textwrap
+
+from absl import app
+from absl import flags
+import cerberus
+import yaml
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_boolean(
+    'dry_run', False, 'Do not actually generate Dockerfiles', short_name='n')
+
+flags.DEFINE_string(
+    'spec_file',
+    './spec.yml',
+    'Path to a YAML specification file',
+    short_name='s')
+
+flags.DEFINE_string(
+    'output_dir',
+    './dockerfiles', ('Path to an output directory for Dockerfiles. '
+                      'Will be created if it doesn\'t exist.'),
+    short_name='o')
+
+flags.DEFINE_string(
+    'partial_dir',
+    './partials',
+    'Path to a directory containing foo.partial.Dockerfile partial files.',
+    short_name='p')
+
+flags.DEFINE_boolean(
+    'quiet_dry_run',
+    True,
+    'Do not print contents of dry run Dockerfiles.',
+    short_name='q')
+
+flags.DEFINE_boolean(
+    'validate', True, 'Validate generated Dockerfiles', short_name='c')
+
+# Schema to verify the contents of spec.yml with Cerberus.
+# Must be converted to a dict from yaml to work.
+# Note: can add python references with e.g.
+# !!python/name:builtins.str
+# !!python/name:__main__.funcname
+SCHEMA_TEXT = """
+header:
+  type: string
+
+partials:
+  type: dict
+  keyschema:
+    type: string
+  valueschema:
+    type: dict
+    schema:
+      desc:
+        type: string
+      args:
+        type: dict
+        keyschema:
+          type: string
+        valueschema:
+          anyof:
+            - type: [ boolean, number, string ]
+            - type: dict
+              schema:
+                 default:
+                    type: [ boolean, number, string ]
+                 desc:
+                    type: string
+                 options:
+                    type: list
+                    schema:
+                       type: string
+
+images:
+  keyschema:
+    type: string
+  valueschema:
+    type: dict
+    schema:
+      desc:
+        type: string
+      arg-defaults:
+        type: list
+        schema:
+          anyof:
+            - type: dict
+              keyschema:
+                type: string
+                arg_in_use: true
+              valueschema:
+                type: string
+            - type: string
+              isimage: true
+      create-dockerfile:
+        type: boolean
+      partials:
+        type: list
+        schema:
+          anyof:
+            - type: dict
+              keyschema:
+                type: string
+                regex: image
+              valueschema:
+                type: string
+                isimage: true
+            - type: string
+              ispartial: true
+"""
+
+
+class TfDockerValidator(cerberus.Validator):
+  """Custom Cerberus validator for TF dockerfile spec.
+
+  Note: Each _validate_foo function's docstring must end with a segment
+  describing its own validation schema, e.g. "The rule's arguments are...". If
+  you add a new validator, you can copy/paste that section.
+  """
+
+  def _validate_ispartial(self, ispartial, field, value):
+    """Validate that a partial references an existing partial spec.
+
+    Args:
+      ispartial: Value of the rule, a bool
+      field: The field being validated
+      value: The field's value
+
+    The rule's arguments are validated against this schema:
+    {'type': 'boolean'}
+    """
+    if ispartial and value not in self.root_document.get('partials', dict()):
+      self._error(field, '{} is not an existing partial.'.format(value))
+
+  def _validate_isimage(self, isimage, field, value):
+    """Validate that an image references an existing partial spec.
+
+    Args:
+      isimage: Value of the rule, a bool
+      field: The field being validated
+      value: The field's value
+
+    The rule's arguments are validated against this schema:
+    {'type': 'boolean'}
+    """
+    if isimage and value not in self.root_document.get('images', dict()):
+      self._error(field, '{} is not an existing image.'.format(value))
+
+  def _validate_arg_in_use(self, arg_in_use, field, value):
+    """Validate that an arg references an existing partial spec's args.
+
+    Args:
+      arg_in_use: Value of the rule, a bool
+      field: The field being validated
+      value: The field's value
+
+    The rule's arguments are validated against this schema:
+    {'type': 'boolean'}
+    """
+    if arg_in_use:
+      for partial in self.root_document.get('partials', dict()).values():
+        if value in partial.get('args', tuple()):
+          return
+
+      self._error(field, '{} is not an arg used in any partial.'.format(value))
+
+
+def build_partial_description(partial_spec):
+  """Create the documentation lines for a specific partial.
+
+  Generates something like this:
+
+    # This is the partial's description, from spec.yml.
+    # --build-arg ARG_NAME=argdefault
+    #    this is one of the args.
+    # --build-arg ANOTHER_ARG=(some|choices)
+    #    another arg.
+
+  Args:
+    partial_spec: A dict representing one of the partials from spec.yml. Doesn't
+      include the name of the partial; is a dict like { desc: ..., args: ... }.
+
+  Returns:
+    A commented string describing this partial.
+  """
+
+  # Start from linewrapped desc field
+  lines = []
+  wrapper = textwrap.TextWrapper(
+      initial_indent='# ', subsequent_indent='# ', width=80)
+  description = wrapper.fill(partial_spec.get('desc', '( no comments )'))
+  lines.extend(['#', description])
+
+  # Document each arg
+  for arg, arg_data in partial_spec.get('args', dict()).items():
+    # Wrap arg description with comment lines
+    desc = arg_data.get('desc', '( no description )')
+    desc = textwrap.fill(
+        desc,
+        initial_indent='#    ',
+        subsequent_indent='#    ',
+        width=80,
+        drop_whitespace=False)
+
+    # Document (each|option|like|this)
+    if 'options' in arg_data:
+      arg_options = ' ({})'.format('|'.join(arg_data['options']))
+    else:
+      arg_options = ''
+
+    # Add usage sample
+    arg_use = '# --build-arg {}={}{}'.format(arg,
+                                             arg_data.get('default', '(unset)'),
+                                             arg_options)
+    lines.extend([arg_use, desc])
+
+  return '\n'.join(lines)
+
+
+def construct_contents(partial_specs, image_spec):
+  """Assemble the dockerfile contents for an image spec.
+
+  It assembles a concrete list of partial references into a single, large
+  string.
+  Also expands argument defaults, so that the resulting Dockerfile doesn't have
+  to be configured with --build-arg=... every time. That is, any ARG directive
+  will be updated with a new default value.
+
+  Args:
+    partial_specs: The dict from spec.yml["partials"].
+    image_spec: One of the dict values from spec.yml["images"].
+
+  Returns:
+    A string containing a valid Dockerfile based on the partials listed in
+    image_spec.
+  """
+  processed_partial_strings = []
+  for partial_name in image_spec['partials']:
+    # Apply image arg-defaults to existing arg defaults
+    partial_spec = copy.deepcopy(partial_specs[partial_name])
+    args = partial_spec.get('args', dict())
+    for k_v in image_spec.get('arg-defaults', []):
+      arg, value = list(k_v.items())[0]
+      if arg in args:
+        args[arg]['default'] = value
+
+    # Read partial file contents
+    filename = partial_spec.get('file', partial_name)
+    partial_path = os.path.join(FLAGS.partial_dir,
+                                '{}.partial.Dockerfile'.format(filename))
+    with open(partial_path, 'r') as f_partial:
+      partial_contents = f_partial.read()
+
+    # Replace ARG FOO=BAR with ARG FOO=[new-default]
+    for arg, arg_data in args.items():
+      if 'default' in arg_data and arg_data['default']:
+        default = '={}'.format(arg_data['default'])
+      else:
+        default = ''
+      partial_contents = re.sub(r'ARG {}.*'.format(arg), 'ARG {}{}'.format(
+          arg, default), partial_contents)
+
+    # Store updated partial contents
+    processed_partial_strings.append(partial_contents)
+
+  # Join everything together
+  return '\n'.join(processed_partial_strings)
+
+
+def mkdir_p(path):
+  """Create a directory and its parents, even if it already exists."""
+  try:
+    os.makedirs(path)
+  except OSError as e:
+    if e.errno != errno.EEXIST:
+      raise
+
+
+def construct_documentation(header, partial_specs, image_spec):
+  """Assemble all of the documentation for a single dockerfile.
+
+  Builds explanations of included partials and available build args.
+
+  Args:
+    header: The string from spec.yml["header"]; will be commented and wrapped.
+    partial_specs: The dict from spec.yml["partials"].
+    image_spec: The spec for the dockerfile being built.
+
+  Returns:
+    A string containing a commented header that documents the contents of the
+    dockerfile.
+
+  """
+  # Comment and wrap header and image description
+  commented_header = '\n'.join(
+      [('# ' + l).rstrip() for l in header.splitlines()])
+  commented_desc = '\n'.join(
+      ['# ' + l for l in image_spec.get('desc', '').splitlines()])
+  partial_descriptions = []
+
+  # Build documentation for each partial in the image
+  for partial in image_spec['partials']:
+    # Copy partial data for default args unique to this image
+    partial_spec = copy.deepcopy(partial_specs[partial])
+    args = partial_spec.get('args', dict())
+
+    # Overwrite any existing arg defaults
+    for k_v in image_spec.get('arg-defaults', []):
+      arg, value = list(k_v.items())[0]
+      if arg in args:
+        args[arg]['default'] = value
+
+    # Build the description from new args
+    partial_description = build_partial_description(partial_spec)
+    partial_descriptions.append(partial_description)
+
+  contents = [commented_header, '#', commented_desc] + partial_descriptions
+  return '\n'.join(contents) + '\n'
+
+
+def normalize_partial_args(partial_specs):
+  """Normalize the shorthand form of a partial's args specification.
+
+  Turns this:
+
+    partial:
+      args:
+        SOME_ARG: arg_value
+
+  Into this:
+
+    partial:
+       args:
+         SOME_ARG:
+            default: arg_value
+
+  Args:
+    partial_specs: The dict from spec.yml["partials"]. This dict is modified in
+      place.
+
+  Returns:
+    The modified contents of partial_specs.
+
+  """
+  for _, partial in partial_specs.items():
+    args = partial.get('args', dict())
+    for arg, value in args.items():
+      if not isinstance(value, dict):
+        new_value = {'default': value}
+        args[arg] = new_value
+
+  return partial_specs
+
+
+def flatten_args_references(image_specs):
+  """Resolve all default-args in each image spec to a concrete dict.
+
+  Turns this:
+
+    example-image:
+      arg-defaults:
+        - MY_ARG: ARG_VALUE
+
+    another-example:
+      arg-defaults:
+        - ANOTHER_ARG: ANOTHER_VALUE
+        - example_image
+
+  Into this:
+
+    example-image:
+      arg-defaults:
+        - MY_ARG: ARG_VALUE
+
+    another-example:
+      arg-defaults:
+        - ANOTHER_ARG: ANOTHER_VALUE
+        - MY_ARG: ARG_VALUE
+
+  Args:
+    image_specs: A dict of image_spec dicts; should be the contents of the
+      "images" key in the global spec.yaml. This dict is modified in place and
+      then returned.
+
+  Returns:
+    The modified contents of image_specs.
+  """
+  for _, image_spec in image_specs.items():
+    too_deep = 0
+    while str in map(type, image_spec.get('arg-defaults', [])) and too_deep < 5:
+      new_args = []
+      for arg in image_spec['arg-defaults']:
+        if isinstance(arg, str):
+          new_args.extend(image_specs[arg]['arg-defaults'])
+        else:
+          new_args.append(arg)
+
+      image_spec['arg-defaults'] = new_args
+      too_deep += 1
+
+  return image_specs
+
+
+def flatten_partial_references(image_specs):
+  """Resolve all partial references in each image spec to a concrete list.
+
+  Turns this:
+
+    example-image:
+      partials:
+        - foo
+
+    another-example:
+      partials:
+        - bar
+        - image: example-image
+        - bat
+
+  Into this:
+
+    example-image:
+      partials:
+        - foo
+
+    another-example:
+      partials:
+        - bar
+        - foo
+        - bat
+  Args:
+    image_specs: A dict of image_spec dicts; should be the contents of the
+      "images" key in the global spec.yaml. This dict is modified in place and
+      then returned.
+
+  Returns:
+    The modified contents of image_specs.
+  """
+  for _, image_spec in image_specs.items():
+    too_deep = 0
+    while dict in map(type, image_spec['partials']) and too_deep < 5:
+      new_partials = []
+      for partial in image_spec['partials']:
+        if isinstance(partial, str):
+          new_partials.append(partial)
+        else:
+          new_partials.extend(image_specs[partial['image']]['partials'])
+
+      image_spec['partials'] = new_partials
+      too_deep += 1
+
+  return image_specs
+
+
+def construct_dockerfiles(tf_spec):
+  """Generate a mapping of {"cpu": <cpu dockerfile contents>, ...}.
+
+  Args:
+    tf_spec: The full spec.yml loaded as a python object.
+
+  Returns:
+    A string:string dict of short names ("cpu-devel") to Dockerfile contents.
+  """
+  names_to_contents = dict()
+  image_specs = tf_spec['images']
+  image_specs = flatten_partial_references(image_specs)
+  image_specs = flatten_args_references(image_specs)
+  partial_specs = tf_spec['partials']
+  partial_specs = normalize_partial_args(partial_specs)
+
+  for name, image_spec in image_specs.items():
+    if not image_spec.get('create-dockerfile', True):
+      continue
+    documentation = construct_documentation(tf_spec['header'], partial_specs,
+                                            image_spec)
+    contents = construct_contents(partial_specs, image_spec)
+    names_to_contents[name] = '\n'.join([documentation, contents])
+
+  return names_to_contents
+
+
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError('Unexpected command line args found: {}'.format(argv))
+
+  with open(FLAGS.spec_file, 'r') as spec_file:
+    tf_spec = yaml.load(spec_file)
+
+  # Abort if spec.yaml is invalid
+  if FLAGS.validate:
+    schema = yaml.load(SCHEMA_TEXT)
+    v = TfDockerValidator(schema)
+    if not v.validate(tf_spec):
+      print('>> ERROR: {} is an invalid spec! The errors are:'.format(
+          FLAGS.spec_file))
+      print(yaml.dump(v.errors, indent=2))
+      exit(1)
+  else:
+    print('>> WARNING: Not validating {}'.format(FLAGS.spec_file))
+
+  # Generate mapping of { "cpu-devel": "<cpu-devel dockerfile contents>", ... }
+  names_to_contents = construct_dockerfiles(tf_spec)
+
+  # Write each completed Dockerfile
+  if not FLAGS.dry_run:
+    print('>> Emptying destination dir "{}"'.format(FLAGS.output_dir))
+    shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
+    mkdir_p(FLAGS.output_dir)
+  else:
+    print('>> Skipping creation of {} (dry run)'.format(FLAGS.output_dir))
+  for name, contents in names_to_contents.items():
+    path = os.path.join(FLAGS.output_dir, name + '.Dockerfile')
+    if FLAGS.dry_run:
+      print('>> Skipping writing contents of {} (dry run)'.format(path))
+      print(contents)
+    else:
+      mkdir_p(FLAGS.output_dir)
+      print('>> Writing {}'.format(path))
+      with open(path, 'w') as f:
+        f.write(contents)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/tensorflow/tools/dockerfiles/bashrc b/tensorflow/tools/dockerfiles/bashrc
new file mode 100644
index 0000000000..48cacf20f6
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/bashrc
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > "
+export TERM=xterm-256color
+alias grep="grep --color=auto"
+alias ls="ls --color=auto"
+
+echo -e "\e[1;31m"
+cat<<TF
+________                               _______________                
+___  __/__________________________________  ____/__  /________      __
+__  /  _  _ \_  __ \_  ___/  __ \_  ___/_  /_   __  /_  __ \_ | /| / /
+_  /   /  __/  / / /(__  )/ /_/ /  /   _  __/   _  / / /_/ /_ |/ |/ / 
+/_/    \___//_/ /_//____/ \____//_/    /_/      /_/  \____/____/|__/
+
+TF
+echo -e "\e[0;33m"
+
+if [[ $EUID -eq 0 ]]; then
+  cat <<WARN
+WARNING: You are running this container as root, which can cause new files in
+mounted volumes to be created as the root user on your host machine.
+
+To avoid this, run the container by specifying your user's userid:
+
+$ docker run -u \$(id -u):\$(id -g) args...
+WARN
+else
+  cat <<EXPL
+You are running this container as user with ID $(id -u) and group $(id -g),
+which should map to the ID and group for your user on the Docker host. Great!
+EXPL
+fi
+
+# Turn off colors
+echo -e "\e[m"
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile
new file mode 100644
index 0000000000..dbbad7d03a
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile
@@ -0,0 +1,100 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# below. Please refer to the the TensorFlow dockerfiles documentation for
+# more information. Build args are documented as their default value.
+#
+# Ubuntu-based, CPU-only environment for developing changes for TensorFlow, with Jupyter included.
+#
+# Start from Ubuntu, with TF development packages (no GPU support)
+# --build-arg UBUNTU_VERSION=16.04
+#    ( no description )
+#
+# Python is required for TensorFlow and other libraries.
+# --build-arg USE_PYTHON_3_NOT_2=True
+#    Install python 3 over Python 2
+#
+# Install the latest version of Bazel and Python development tools.
+#
+# Configure TensorFlow's shell prompt and login tools.
+#
+# Launch Jupyter on execution instead of a bash prompt.
+
+ARG UBUNTU_VERSION=16.04
+FROM ubuntu:${UBUNTU_VERSION}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ARG USE_PYTHON_3_NOT_2=True
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter
+
+RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir /.local && chmod a+rwx /.local
+WORKDIR /notebooks
+EXPOSE 8888
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile
new file mode 100644
index 0000000000..160d7c02e2
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile
@@ -0,0 +1,89 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# below. Please refer to the the TensorFlow dockerfiles documentation for
+# more information. Build args are documented as their default value.
+#
+# Ubuntu-based, CPU-only environment for developing changes for TensorFlow.
+#
+# Start from Ubuntu, with TF development packages (no GPU support)
+# --build-arg UBUNTU_VERSION=16.04
+#    ( no description )
+#
+# Python is required for TensorFlow and other libraries.
+# --build-arg USE_PYTHON_3_NOT_2=True
+#    Install python 3 over Python 2
+#
+# Install the latest version of Bazel and Python development tools.
+#
+# Configure TensorFlow's shell prompt and login tools.
+
+ARG UBUNTU_VERSION=16.04
+FROM ubuntu:${UBUNTU_VERSION}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ARG USE_PYTHON_3_NOT_2=True
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
new file mode 100644
index 0000000000..8d5d653ab7
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# below. Please refer to the the TensorFlow dockerfiles documentation for
+# more information. Build args are documented as their default value.
+#
+# Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter included.
+#
+# Start from Ubuntu (no GPU support)
+# --build-arg UBUNTU_VERSION=16.04
+#    ( no description )
+#
+# Python is required for TensorFlow and other libraries.
+# --build-arg USE_PYTHON_3_NOT_2=True
+#    Install python 3 over Python 2
+#
+# Install the TensorFlow Python package.
+# --build-arg TF_PACKAGE=tensorflow (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
+#    The specific TensorFlow Python package to install
+#
+# Configure TensorFlow's shell prompt and login tools.
+#
+# Launch Jupyter on execution instead of a bash prompt.
+
+ARG UBUNTU_VERSION=16.04
+FROM ubuntu:${UBUNTU_VERSION}
+
+ARG USE_PYTHON_3_NOT_2=True
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
+
+ARG TF_PACKAGE=tensorflow
+RUN ${PIP} install ${TF_PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter
+
+RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir /.local && chmod a+rwx /.local
+WORKDIR /notebooks
+EXPOSE 8888
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
new file mode 100644
index 0000000000..35c41b49fd
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
@@ -0,0 +1,58 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# below. Please refer to the the TensorFlow dockerfiles documentation for
+# more information. Build args are documented as their default value.
+#
+# Ubuntu-based, CPU-only environment for using TensorFlow
+#
+# Start from Ubuntu (no GPU support)
+# --build-arg UBUNTU_VERSION=16.04
+#    ( no description )
+#
+# Python is required for TensorFlow and other libraries.
+# --build-arg USE_PYTHON_3_NOT_2=True
+#    Install python 3 over Python 2
+#
+# Install the TensorFlow Python package.
+# --build-arg TF_PACKAGE=tensorflow (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
+#    The specific TensorFlow Python package to install
+#
+# Configure TensorFlow's shell prompt and login tools.
+
+ARG UBUNTU_VERSION=16.04
+FROM ubuntu:${UBUNTU_VERSION}
+
+ARG USE_PYTHON_3_NOT_2=True
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
+
+ARG TF_PACKAGE=tensorflow
+RUN ${PIP} install ${TF_PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile
new file mode 100644
index 0000000000..0f5fedf2fe
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile
@@ -0,0 +1,120 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# below. Please refer to the the TensorFlow dockerfiles documentation for
+# more information. Build args are documented as their default value.
+#
+# Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for TensorFlow, with Jupyter included.
+#
+# Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF development
+# packages.
+# --build-arg UBUNTU_VERSION=16.04
+#    ( no description )
+#
+# Python is required for TensorFlow and other libraries.
+# --build-arg USE_PYTHON_3_NOT_2=True
+#    Install python 3 over Python 2
+#
+# Install the latest version of Bazel and Python development tools.
+#
+# Configure TensorFlow's shell prompt and login tools.
+#
+# Launch Jupyter on execution instead of a bash prompt.
+
+ARG UBUNTU_VERSION=16.04
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-dev-9-0 \
+        cuda-cudart-dev-9-0 \
+        cuda-cufft-dev-9-0 \
+        cuda-curand-dev-9-0 \
+        cuda-cusolver-dev-9-0 \
+        cuda-cusparse-dev-9-0 \
+        curl \
+        git \
+        libcudnn7=7.1.4.18-1+cuda9.0 \
+        libcudnn7-dev=7.1.4.18-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libnccl-dev=2.2.13-1+cuda9.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        && \
+    rm -rf /var/lib/apt/lists/* && \
+    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+# Link NCCL libray and header where the build script expects them.
+RUN mkdir /usr/local/cuda-9.0/lib &&  \
+    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
+    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+
+# TODO(tobyboyd): Remove after license is excluded from BUILD file.
+RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
+    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
+
+ARG USE_PYTHON_3_NOT_2=True
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter
+
+RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir /.local && chmod a+rwx /.local
+WORKDIR /notebooks
+EXPOSE 8888
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile
new file mode 100644
index 0000000000..a6e280082e
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile
@@ -0,0 +1,109 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# below. Please refer to the the TensorFlow dockerfiles documentation for
+# more information. Build args are documented as their default value.
+#
+# Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for TensorFlow.
+#
+# Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF development
+# packages.
+# --build-arg UBUNTU_VERSION=16.04
+#    ( no description )
+#
+# Python is required for TensorFlow and other libraries.
+# --build-arg USE_PYTHON_3_NOT_2=True
+#    Install python 3 over Python 2
+#
+# Install the latest version of Bazel and Python development tools.
+#
+# Configure TensorFlow's shell prompt and login tools.
+
+ARG UBUNTU_VERSION=16.04
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-dev-9-0 \
+        cuda-cudart-dev-9-0 \
+        cuda-cufft-dev-9-0 \
+        cuda-curand-dev-9-0 \
+        cuda-cusolver-dev-9-0 \
+        cuda-cusparse-dev-9-0 \
+        curl \
+        git \
+        libcudnn7=7.1.4.18-1+cuda9.0 \
+        libcudnn7-dev=7.1.4.18-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libnccl-dev=2.2.13-1+cuda9.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        && \
+    rm -rf /var/lib/apt/lists/* && \
+    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+# Link NCCL libray and header where the build script expects them.
+RUN mkdir /usr/local/cuda-9.0/lib &&  \
+    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
+    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+
+# TODO(tobyboyd): Remove after license is excluded from BUILD file.
+RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
+    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
+
+ARG USE_PYTHON_3_NOT_2=True
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile
new file mode 100644
index 0000000000..f1799113b1
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile
@@ -0,0 +1,90 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# below. Please refer to the the TensorFlow dockerfiles documentation for
+# more information. Build args are documented as their default value.
+#
+# Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with Jupyter included.
+#
+# NVIDIA with CUDA and CuDNN, no dev stuff
+# --build-arg UBUNTU_VERSION=16.04
+#    ( no description )
+#
+# Python is required for TensorFlow and other libraries.
+# --build-arg USE_PYTHON_3_NOT_2=True
+#    Install python 3 over Python 2
+#
+# Install the TensorFlow Python package.
+# --build-arg TF_PACKAGE=tensorflow-gpu (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
+#    The specific TensorFlow Python package to install
+#
+# Configure TensorFlow's shell prompt and login tools.
+#
+# Launch Jupyter on execution instead of a bash prompt.
+
+FROM nvidia/cuda:9.0-base-ubuntu16.04
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-9-0 \
+        cuda-cufft-9-0 \
+        cuda-curand-9-0 \
+        cuda-cusolver-9-0 \
+        cuda-cusparse-9-0 \
+        libcudnn7=7.1.4.18-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ARG USE_PYTHON_3_NOT_2=True
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
+
+ARG TF_PACKAGE=tensorflow-gpu
+RUN ${PIP} install ${TF_PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter
+
+RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir /.local && chmod a+rwx /.local
+WORKDIR /notebooks
+EXPOSE 8888
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile
new file mode 100644
index 0000000000..690eb68b22
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile
@@ -0,0 +1,79 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# below. Please refer to the the TensorFlow dockerfiles documentation for
+# more information. Build args are documented as their default value.
+#
+# Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow.
+#
+# NVIDIA with CUDA and CuDNN, no dev stuff
+# --build-arg UBUNTU_VERSION=16.04
+#    ( no description )
+#
+# Python is required for TensorFlow and other libraries.
+# --build-arg USE_PYTHON_3_NOT_2=True
+#    Install python 3 over Python 2
+#
+# Install the TensorFlow Python package.
+# --build-arg TF_PACKAGE=tensorflow-gpu (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
+#    The specific TensorFlow Python package to install
+#
+# Configure TensorFlow's shell prompt and login tools.
+
+FROM nvidia/cuda:9.0-base-ubuntu16.04
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-9-0 \
+        cuda-cufft-9-0 \
+        cuda-curand-9-0 \
+        cuda-cusolver-9-0 \
+        cuda-cusparse-9-0 \
+        libcudnn7=7.1.4.18-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ARG USE_PYTHON_3_NOT_2=True
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
+
+ARG TF_PACKAGE=tensorflow-gpu
+RUN ${PIP} install ${TF_PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile
new file mode 100644
index 0000000000..b08d8bdd14
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile
@@ -0,0 +1,13 @@
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
diff --git a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
new file mode 100644
index 0000000000..2c9b9f3f9a
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
@@ -0,0 +1,8 @@
+RUN ${PIP} install jupyter
+
+RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir /.local && chmod a+rwx /.local
+WORKDIR /notebooks
+EXPOSE 8888
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile
new file mode 100644
index 0000000000..f31b695e77
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile
@@ -0,0 +1,43 @@
+ARG UBUNTU_VERSION=16.04
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-dev-9-0 \
+        cuda-cudart-dev-9-0 \
+        cuda-cufft-dev-9-0 \
+        cuda-curand-dev-9-0 \
+        cuda-cusolver-dev-9-0 \
+        cuda-cusparse-dev-9-0 \
+        curl \
+        git \
+        libcudnn7=7.1.4.18-1+cuda9.0 \
+        libcudnn7-dev=7.1.4.18-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libnccl-dev=2.2.13-1+cuda9.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        && \
+    rm -rf /var/lib/apt/lists/* && \
+    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+# Link NCCL libray and header where the build script expects them.
+RUN mkdir /usr/local/cuda-9.0/lib &&  \
+    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
+    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+
+# TODO(tobyboyd): Remove after license is excluded from BUILD file.
+RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
+    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
diff --git a/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile
new file mode 100644
index 0000000000..13d865b9d4
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile
@@ -0,0 +1,23 @@
+FROM nvidia/cuda:9.0-base-ubuntu16.04
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-9-0 \
+        cuda-cufft-9-0 \
+        cuda-curand-9-0 \
+        cuda-cusolver-9-0 \
+        cuda-cusparse-9-0 \
+        libcudnn7=7.1.4.18-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
diff --git a/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile
new file mode 100644
index 0000000000..6f346236a5
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile
@@ -0,0 +1,12 @@
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} install --upgrade \
+    pip \
+    setuptools
diff --git a/tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile
new file mode 100644
index 0000000000..d641a11b06
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/shell.partial.Dockerfile
@@ -0,0 +1,2 @@
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
new file mode 100644
index 0000000000..96e79547f0
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
@@ -0,0 +1,2 @@
+ARG TF_PACKAGE
+RUN ${PIP} install ${TF_PACKAGE}
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile
new file mode 100644
index 0000000000..bc79272276
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile
@@ -0,0 +1,24 @@
+ARG UBUNTU_VERSION=16.04
+FROM ubuntu:${UBUNTU_VERSION}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile
new file mode 100644
index 0000000000..0a50735bf8
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile
@@ -0,0 +1,2 @@
+ARG UBUNTU_VERSION=16.04
+FROM ubuntu:${UBUNTU_VERSION}
diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml
new file mode 100644
index 0000000000..28bf9a55da
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/spec.yml
@@ -0,0 +1,195 @@
+# ======
+# HEADER
+# ======
+#
+# This is commented-out and prepended to each generated Dockerfile.
+header: |
+    Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+    ============================================================================
+
+    THIS IS A GENERATED DOCKERFILE.
+
+    This file was assembled from multiple pieces, whose use is documented
+    below. Please refer to the the TensorFlow dockerfiles documentation for
+    more information. Build args are documented as their default value.
+
+# ========
+# PARTIALS
+# ========
+#
+# Represent and document pieces of a Dockerfile. Spec:
+# 
+# name: the name of the partial, is referenced from the images section
+#   desc: A description, inserted later into the Dockerfile
+#   file: Alternative file prefix, e.g. file.partial.Dockerfile. The default is
+#         the name of the partial.
+#   args: A dict of ARGs in the Dockerfile; each entry has the format
+#      ARG_NAME: VALUE where VALUE is one of:
+#         - a dict:
+#             desc: Documentation for the arg
+#             default: Default value for the arg; is written to the Dockerfile
+#             options: List of strings, part of documentation
+#         - a concrete value: the same as a dictionary with default: [value].
+
+partials:
+    ubuntu:
+        desc: Start from Ubuntu (no GPU support)
+        args:
+            UBUNTU_VERSION: 16.04
+
+    ubuntu-devel:
+        desc: Start from Ubuntu, with TF development packages (no GPU support)
+        args:
+            UBUNTU_VERSION: 16.04
+
+    bazel:
+        desc: Install the latest version of Bazel and Python development tools.
+
+    nvidia:
+        desc: NVIDIA with CUDA and CuDNN, no dev stuff
+        args:
+            UBUNTU_VERSION: 16.04
+
+    nvidia-devel:
+        desc: >
+            Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF
+            development packages.
+        args:
+            UBUNTU_VERSION: 16.04
+
+    python:
+        desc: Python is required for TensorFlow and other libraries.
+        args:
+            USE_PYTHON_3_NOT_2:
+                default: true
+                desc: Install python 3 over Python 2
+                
+    tensorflow:
+        desc: Install the TensorFlow Python package.
+        args:
+            TF_PACKAGE:
+                default: tensorflow
+                options:
+                    - tensorflow
+                    - tensorflow-gpu
+                    - tf-nightly
+                    - tf-nightly-gpu
+                desc: The specific TensorFlow Python package to install
+    shell:
+        desc: Configure TensorFlow's shell prompt and login tools.
+    jupyter:
+        desc: Launch Jupyter on execution instead of a bash prompt.
+
+# ======
+# IMAGES
+# ======
+# 
+# Represent Dockerfiles. Spec:
+# 
+# name: the name of the image, possibly referenced by other images
+#   desc: A description, inserted later into the Dockerfile
+#   create-dockerfile: Create a dockerfile based on this. Useful for creating
+#      extensible base images that don't need a file. Default is true.
+#   partials: List of VALUEs, where a VALUE is either:
+#      - the name of a partial, which inserts that partial into this image
+#      - image: [name of another image], which inserts the partials from that
+#        image into this image
+#   arg-defaults: List of VALUEs, where a VALUE is either:
+#      - ARG_NAME: VALUE, which sets the ARG_NAME to VALUE wherever it appears
+#        in this image's partials
+#      - [name of another image], which loads the default args from that image
+images:
+
+    nodev:
+        create-dockerfile: false
+        partials:
+            - python
+            - tensorflow
+            - shell
+
+    dev:
+        create-dockerfile: false
+        partials:
+            - python
+            - bazel
+            - shell
+
+    cpu:
+      desc: Ubuntu-based, CPU-only environment for using TensorFlow
+      partials:
+        - ubuntu
+        - image: nodev
+
+    cpu-devel:
+      desc: >
+          Ubuntu-based, CPU-only environment for developing changes for
+          TensorFlow.
+      partials:
+        - ubuntu-devel
+        - image: dev
+
+    nvidia:
+      desc: Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow.
+      arg-defaults: 
+        - TF_PACKAGE: tensorflow-gpu
+      partials:
+        - nvidia
+        - image: nodev
+
+    nvidia-devel:
+      desc: >
+          Ubuntu-based, Nvidia-GPU-enabled environment for developing changes
+          for TensorFlow.
+      arg-defaults: 
+        - TF_PACKAGE: tensorflow-gpu
+      partials:
+        - nvidia-devel
+        - image: dev
+
+    cpu-jupyter:
+      desc: >
+          Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter
+          included.
+      partials:
+        - image: cpu
+        - jupyter
+
+    cpu-devel-jupyter:
+      desc: >
+         Ubuntu-based, CPU-only environment for developing changes for
+         TensorFlow, with Jupyter included.
+      partials:
+        - image: cpu-devel
+        - jupyter
+
+    nvidia-jupyter:
+      desc: >
+        Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with
+        Jupyter included.
+      arg-defaults: 
+        - nvidia
+      partials:
+        - image: nvidia
+        - jupyter
+
+    nvidia-devel-jupyter:
+      desc: >
+        Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for
+        TensorFlow, with Jupyter included.
+      arg-defaults: 
+        - nvidia-devel
+      partials:
+        - image: nvidia-devel
+        - jupyter
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index cc7885ab1b..4f7efe193f 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -34,11 +34,29 @@ py_test(
 )
 
 py_library(
+    name = "doc_controls",
+    srcs = ["doc_controls.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_test(
+    name = "doc_controls_test",
+    size = "small",
+    srcs = ["doc_controls_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":doc_controls",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_library(
     name = "parser",
     srcs = ["parser.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":doc_controls",
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
         "@astor_archive//:astor",
@@ -68,6 +86,7 @@ py_binary(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":doc_controls",
         ":doc_generator_visitor",
         ":parser",
         ":pretty_docs",
diff --git a/tensorflow/tools/docs/doc_controls.py b/tensorflow/tools/docs/doc_controls.py
new file mode 100644
index 0000000000..5e526443cc
--- /dev/null
+++ b/tensorflow/tools/docs/doc_controls.py
@@ -0,0 +1,319 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Documentation control decorators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+_DO_NOT_DOC = "_tf_docs_do_not_document"
+
+
+def do_not_generate_docs(obj):
+  """A decorator: Do not generate docs for this object.
+
+  For example the following classes:
+
+  ```
+  class Parent(object):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child(Parent):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+  ```
+
+  Produce the following api_docs:
+
+  ```
+  /Parent.md
+    # method1
+    # method2
+  /Child.md
+    # method1
+    # method2
+  ```
+
+  This decorator allows you to skip classes or methods:
+
+  ```
+  @do_not_generate_docs
+  class Parent(object):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child(Parent):
+    @do_not_generate_docs
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+  ```
+
+  This will only produce the following docs:
+
+  ```
+  /Child.md
+    # method2
+  ```
+
+  Note: This is implemented by adding a hidden attribute on the object, so it
+  cannot be used on objects which do not allow new attributes to be added. So
+  this decorator must go *below* `@property`, `@classmethod`,
+  or `@staticmethod`:
+
+  ```
+  class Example(object):
+    @property
+    @do_not_generate_docs
+    def x(self):
+      return self._x
+  ```
+
+  Args:
+    obj: The object to hide from the generated docs.
+
+  Returns:
+    obj
+  """
+  setattr(obj, _DO_NOT_DOC, None)
+  return obj
+
+
+_DO_NOT_DOC_INHERITABLE = "_tf_docs_do_not_doc_inheritable"
+
+
+def do_not_doc_inheritable(obj):
+  """A decorator: Do not generate docs for this method.
+
+  This version of the decorator is "inherited" by subclasses. No docs will be
+  generated for the decorated method in any subclass. Even if the sub-class
+  overrides the method.
+
+  For example, to ensure that `method1` is **never documented** use this
+  decorator on the base-class:
+
+  ```
+  class Parent(object):
+    @do_not_doc_inheritable
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child(Parent):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+  ```
+  This will produce the following docs:
+
+  ```
+  /Parent.md
+    # method2
+  /Child.md
+    # method2
+  ```
+
+  When generating docs for a class's arributes, the `__mro__` is searched and
+  the attribute will be skipped if this decorator is detected on the attribute
+  on any class in the `__mro__`.
+
+  Note: This is implemented by adding a hidden attribute on the object, so it
+  cannot be used on objects which do not allow new attributes to be added. So
+  this decorator must go *below* `@property`, `@classmethod`,
+  or `@staticmethod`:
+
+  ```
+  class Example(object):
+    @property
+    @do_not_doc_inheritable
+    def x(self):
+      return self._x
+  ```
+
+  Args:
+    obj: The class-attribute to hide from the generated docs.
+
+  Returns:
+    obj
+  """
+  setattr(obj, _DO_NOT_DOC_INHERITABLE, None)
+  return obj
+
+
+_FOR_SUBCLASS_IMPLEMENTERS = "_tf_docs_tools_for_subclass_implementers"
+
+
+def for_subclass_implementers(obj):
+  """A decorator: Only generate docs for this method in the defining class.
+
+  Also group this method's docs with and `@abstractmethod` in the class's docs.
+
+  No docs will generated for this class attribute in sub-classes.
+
+  The canonical use case for this is `tf.keras.layers.Layer.call`: It's a
+  public method, essential for anyone implementing a subclass, but it should
+  never be called directly.
+
+  Works on method, or other class-attributes.
+
+  When generating docs for a class's arributes, the `__mro__` is searched and
+  the attribute will be skipped if this decorator is detected on the attribute
+  on any **parent** class in the `__mro__`.
+
+  For example:
+
+  ```
+  class Parent(object):
+    @for_subclass_implementers
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child1(Parent):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+
+  class Child2(Parent):
+    def method1(self):
+      pass
+    def method2(self):
+      pass
+  ```
+
+  This will produce the following docs:
+
+  ```
+  /Parent.md
+    # method1
+    # method2
+  /Child1.md
+    # method2
+  /Child2.md
+    # method2
+  ```
+
+  Note: This is implemented by adding a hidden attribute on the object, so it
+  cannot be used on objects which do not allow new attributes to be added. So
+  this decorator must go *below* `@property`, `@classmethod`,
+  or `@staticmethod`:
+
+  ```
+  class Example(object):
+    @property
+    @for_subclass_implementers
+    def x(self):
+      return self._x
+  ```
+
+  Args:
+    obj: The class-attribute to hide from the generated docs.
+
+  Returns:
+    obj
+  """
+  setattr(obj, _FOR_SUBCLASS_IMPLEMENTERS, None)
+  return obj
+
+
+def should_skip(obj):
+  """Returns true if docs generation should be skipped for this object.
+
+  checks for the `do_not_generate_docs` or `do_not_doc_inheritable` decorators.
+
+  Args:
+    obj: The object to document, or skip.
+
+  Returns:
+    True if the object should be skipped
+  """
+  # Unwrap fget if the object is a property
+  if isinstance(obj, property):
+    obj = obj.fget
+
+  return hasattr(obj, _DO_NOT_DOC) or hasattr(obj, _DO_NOT_DOC_INHERITABLE)
+
+
+def should_skip_class_attr(cls, name):
+  """Returns true if docs should be skipped for this class attribute.
+
+  Args:
+    cls: The class the attribute belongs to.
+    name: The name of the attribute.
+
+  Returns:
+    True if the attribute should be skipped.
+  """
+  # Get the object with standard lookup, from the nearest
+  # defining parent.
+  try:
+    obj = getattr(cls, name)
+  except AttributeError:
+    # Avoid error caused by enum metaclasses in python3
+    if name in ("name", "value"):
+      return True
+    raise
+
+  # Unwrap fget if the object is a property
+  if isinstance(obj, property):
+    obj = obj.fget
+
+  # Skip if the object is decorated with `do_not_generate_docs` or
+  # `do_not_doc_inheritable`
+  if should_skip(obj):
+    return True
+
+  # Use __dict__ lookup to get the version defined in *this* class.
+  obj = cls.__dict__.get(name, None)
+  if isinstance(obj, property):
+    obj = obj.fget
+  if obj is not None:
+    # If not none, the object is defined in *this* class.
+    # Do not skip if decorated with `for_subclass_implementers`.
+    if hasattr(obj, _FOR_SUBCLASS_IMPLEMENTERS):
+      return False
+
+  # for each parent class
+  for parent in cls.__mro__[1:]:
+    obj = getattr(parent, name, None)
+
+    if obj is None:
+      continue
+
+    if isinstance(obj, property):
+      obj = obj.fget
+
+    # Skip if the parent's definition is decorated with `do_not_doc_inheritable`
+    # or `for_subclass_implementers`
+    if hasattr(obj, _DO_NOT_DOC_INHERITABLE):
+      return True
+
+    if hasattr(obj, _FOR_SUBCLASS_IMPLEMENTERS):
+      return True
+
+  # No blockng decorators --> don't skip
+  return False
diff --git a/tensorflow/tools/docs/doc_controls_test.py b/tensorflow/tools/docs/doc_controls_test.py
new file mode 100644
index 0000000000..d5eb4ffc00
--- /dev/null
+++ b/tensorflow/tools/docs/doc_controls_test.py
@@ -0,0 +1,220 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for documentation control decorators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import googletest
+from tensorflow.tools.docs import doc_controls
+
+
+class DocControlsTest(googletest.TestCase):
+
+  def test_do_not_generate_docs(self):
+
+    @doc_controls.do_not_generate_docs
+    def dummy_function():
+      pass
+
+    self.assertTrue(doc_controls.should_skip(dummy_function))
+
+  def test_do_not_doc_on_method(self):
+    """The simple decorator is not aware of inheritance."""
+
+    class Parent(object):
+
+      @doc_controls.do_not_generate_docs
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+
+      def my_method(self):
+        pass
+
+    class GrandChild(Child):
+      pass
+
+    self.assertTrue(doc_controls.should_skip(Parent.my_method))
+    self.assertFalse(doc_controls.should_skip(Child.my_method))
+    self.assertFalse(doc_controls.should_skip(GrandChild.my_method))
+
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertFalse(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertFalse(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+
+  def test_do_not_doc_inheritable(self):
+
+    class Parent(object):
+
+      @doc_controls.do_not_doc_inheritable
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+
+      def my_method(self):
+        pass
+
+    class GrandChild(Child):
+      pass
+
+    self.assertTrue(doc_controls.should_skip(Parent.my_method))
+    self.assertFalse(doc_controls.should_skip(Child.my_method))
+    self.assertFalse(doc_controls.should_skip(GrandChild.my_method))
+
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+
+  def test_do_not_doc_inheritable_property(self):
+
+    class Parent(object):
+
+      @property
+      @doc_controls.do_not_doc_inheritable
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+
+      @property
+      def my_method(self):
+        pass
+
+    class GrandChild(Child):
+      pass
+
+    self.assertTrue(doc_controls.should_skip(Parent.my_method))
+    self.assertFalse(doc_controls.should_skip(Child.my_method))
+    self.assertFalse(doc_controls.should_skip(GrandChild.my_method))
+
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+
+  def test_do_not_doc_inheritable_staticmethod(self):
+
+    class GrandParent(object):
+
+      def my_method(self):
+        pass
+
+    class Parent(GrandParent):
+
+      @staticmethod
+      @doc_controls.do_not_doc_inheritable
+      def my_method():
+        pass
+
+    class Child(Parent):
+
+      @staticmethod
+      def my_method():
+        pass
+
+    class GrandChild(Child):
+      pass
+
+    self.assertFalse(doc_controls.should_skip(GrandParent.my_method))
+    self.assertTrue(doc_controls.should_skip(Parent.my_method))
+    self.assertFalse(doc_controls.should_skip(Child.my_method))
+    self.assertFalse(doc_controls.should_skip(GrandChild.my_method))
+
+    self.assertFalse(
+        doc_controls.should_skip_class_attr(GrandParent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+
+  def test_for_subclass_implementers(self):
+
+    class GrandParent(object):
+
+      def my_method(self):
+        pass
+
+    class Parent(GrandParent):
+
+      @doc_controls.for_subclass_implementers
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+      pass
+
+    class GrandChild(Child):
+
+      def my_method(self):
+        pass
+
+    class Grand2Child(Child):
+      pass
+
+    self.assertFalse(
+        doc_controls.should_skip_class_attr(GrandParent, 'my_method'))
+    self.assertFalse(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(Grand2Child, 'my_method'))
+
+  def test_for_subclass_implementers_short_circuit(self):
+
+    class GrandParent(object):
+
+      @doc_controls.for_subclass_implementers
+      def my_method(self):
+        pass
+
+    class Parent(GrandParent):
+
+      def my_method(self):
+        pass
+
+    class Child(Parent):
+
+      @doc_controls.do_not_doc_inheritable
+      def my_method(self):
+        pass
+
+    class GrandChild(Child):
+
+      @doc_controls.for_subclass_implementers
+      def my_method(self):
+        pass
+
+    class Grand2Child(Child):
+      pass
+
+    self.assertFalse(
+        doc_controls.should_skip_class_attr(GrandParent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Parent, 'my_method'))
+    self.assertTrue(doc_controls.should_skip_class_attr(Child, 'my_method'))
+    self.assertFalse(
+        doc_controls.should_skip_class_attr(GrandChild, 'my_method'))
+    self.assertTrue(
+        doc_controls.should_skip_class_attr(Grand2Child, 'my_method'))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/tools/docs/generate.py b/tensorflow/tools/docs/generate.py
index f96887e4c7..fc93085e3e 100644
--- a/tensorflow/tools/docs/generate.py
+++ b/tensorflow/tools/docs/generate.py
@@ -31,11 +31,6 @@ if __name__ == '__main__':
   doc_generator = generate_lib.DocGenerator()
   doc_generator.add_output_dir_argument()
   doc_generator.add_src_dir_argument()
-  doc_generator.argument_parser.add_argument(
-      '--site_api_path',
-      type=str, default='api_docs/python',
-      help='The path from the site-root to api_docs'
-           'directory for this project')
 
   # This doc generator works on the TensorFlow codebase. Since this script lives
   # at tensorflow/tools/docs, and all code is defined somewhere inside
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 4bc8cbf4b4..090cf48a07 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -22,12 +22,14 @@ import argparse
 import fnmatch
 import os
 import shutil
+import tempfile
 
 import six
 
 from tensorflow.python.util import tf_inspect
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
+from tensorflow.tools.docs import doc_controls
 from tensorflow.tools.docs import doc_generator_visitor
 from tensorflow.tools.docs import parser
 from tensorflow.tools.docs import pretty_docs
@@ -56,7 +58,7 @@ def write_docs(output_dir,
                yaml_toc,
                root_title='TensorFlow',
                search_hints=True,
-               site_api_path=None):
+               site_api_path=''):
   """Write previously extracted docs to disk.
 
   Write a docs page for each symbol included in the indices of parser_config to
@@ -74,8 +76,8 @@ def write_docs(output_dir,
     root_title: The title name for the root level index.md.
     search_hints: (bool) include meta-data search hints at the top of each
       output file.
-    site_api_path: Used to write the api-duplicates _redirects.yaml file. if
-      None (the default) the file is not generated.
+    site_api_path: The output path relative to the site root. Used in the
+      `_toc.yaml` and `_redirects.yaml` files.
 
   Raises:
     ValueError: if `output_dir` is not an absolute path
@@ -96,7 +98,7 @@ def write_docs(output_dir,
   symbol_to_file = {}
 
   # Collect redirects for an api _redirects.yaml file.
-  redirects = ['redirects:\n']
+  redirects = []
 
   # Parse and write Markdown pages, resolving cross-links (@{symbol}).
   for full_name, py_object in six.iteritems(parser_config.index):
@@ -156,23 +158,27 @@ def write_docs(output_dir,
       raise OSError(
           'Cannot write documentation for %s to %s' % (full_name, directory))
 
-    if site_api_path:
-      duplicates = parser_config.duplicates.get(full_name, [])
-      if not duplicates:
-        continue
+    duplicates = parser_config.duplicates.get(full_name, [])
+    if not duplicates:
+      continue
+
+    duplicates = [item for item in duplicates if item != full_name]
 
-      duplicates = [item for item in duplicates if item != full_name]
-      template = ('- from: /{}\n'
-                  '  to: /{}\n')
-      for dup in duplicates:
-        from_path = os.path.join(site_api_path, dup.replace('.', '/'))
-        to_path = os.path.join(site_api_path, full_name.replace('.', '/'))
-        redirects.append(
-            template.format(from_path, to_path))
+    for dup in duplicates:
+      from_path = os.path.join(site_api_path, dup.replace('.', '/'))
+      to_path = os.path.join(site_api_path, full_name.replace('.', '/'))
+      redirects.append((
+          os.path.join('/', from_path),
+          os.path.join('/', to_path)))
 
-  if site_api_path:
+  if redirects:
+    redirects = sorted(redirects)
+    template = ('- from: {}\n'
+                '  to: {}\n')
+    redirects = [template.format(f, t) for f, t in redirects]
     api_redirects_path = os.path.join(output_dir, '_redirects.yaml')
     with open(api_redirects_path, 'w') as redirect_file:
+      redirect_file.write('redirects:\n')
       redirect_file.write(''.join(redirects))
 
   if yaml_toc:
@@ -203,7 +209,8 @@ def write_docs(output_dir,
             '- title: ' + title,
             '  section:',
             '  - title: Overview',
-            '    path: /TARGET_DOC_ROOT/VERSION/' + symbol_to_file[module]]
+            '    path: ' + os.path.join('/', site_api_path,
+                                        symbol_to_file[module])]
         header = ''.join([indent+line+'\n' for line in header])
         f.write(header)
 
@@ -214,7 +221,8 @@ def write_docs(output_dir,
         for full_name in symbols_in_module:
           item = [
               '  - title: ' + full_name[len(module) + 1:],
-              '    path: /TARGET_DOC_ROOT/VERSION/' + symbol_to_file[full_name]]
+              '    path: ' + os.path.join('/', site_api_path,
+                                          symbol_to_file[full_name])]
           item = ''.join([indent+line+'\n' for line in item])
           f.write(item)
 
@@ -288,6 +296,15 @@ def _get_default_do_not_descend_map():
   }
 
 
+class DocControlsAwareCrawler(public_api.PublicAPIVisitor):
+  """A `docs_controls` aware API-crawler."""
+
+  def _is_private(self, path, name, obj):
+    if doc_controls.should_skip(obj):
+      return True
+    return super(DocControlsAwareCrawler, self)._is_private(path, name, obj)
+
+
 def extract(py_modules,
             private_map,
             do_not_descend_map,
@@ -295,7 +312,7 @@ def extract(py_modules,
   """Extract docs from tf namespace and write them to disk."""
   # Traverse the first module.
   visitor = visitor_cls(py_modules[0][0])
-  api_visitor = public_api.PublicAPIVisitor(visitor)
+  api_visitor = DocControlsAwareCrawler(visitor)
   api_visitor.set_root_name(py_modules[0][0])
   add_dict_to_dict(private_map, api_visitor.private_map)
   add_dict_to_dict(do_not_descend_map, api_visitor.do_not_descend_map)
@@ -525,6 +542,12 @@ class DocGenerator(object):
         action='store_false',
         default=True)
 
+    self.argument_parser.add_argument(
+        '--site_api_path',
+        type=str, default='',
+        help='The path from the site-root to api_docs'
+             'directory for this project')
+
   def add_output_dir_argument(self):
     self.argument_parser.add_argument(
         '--output_dir',
@@ -537,9 +560,9 @@ class DocGenerator(object):
     self.argument_parser.add_argument(
         '--src_dir',
         type=str,
-        default=None,
-        required=True,
-        help='Directory with the source docs.')
+        default=tempfile.mkdtemp(),
+        required=False,
+        help='Optional directory of source docs to add api_docs links to')
 
   def add_base_dir_argument(self, default_base_dir):
     self.argument_parser.add_argument(
@@ -641,7 +664,7 @@ class DocGenerator(object):
         yaml_toc=self.yaml_toc,
         root_title=root_title,
         search_hints=getattr(flags, 'search_hints', True),
-        site_api_path=getattr(flags, 'site_api_path', None))
+        site_api_path=getattr(flags, 'site_api_path', ''))
 
     # Replace all the @{} references in files under `FLAGS.src_dir`
     replace_refs(flags.src_dir, flags.output_dir, reference_resolver, '*.md')
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index ffb93027ed..8e444a15cf 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -32,6 +32,7 @@ import six
 from google.protobuf.message import Message as ProtoMessage
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_inspect
+from tensorflow.tools.docs import doc_controls
 
 
 # A regular expression capturing a python identifier.
@@ -1175,15 +1176,18 @@ class _ClassPageInfo(object):
 
       # Don't document anything that is defined in object or by protobuf.
       defining_class = _get_defining_class(py_class, short_name)
-      if (defining_class is object or
-          defining_class is type or defining_class is tuple or
-          defining_class is BaseException or defining_class is Exception or
-          # The following condition excludes most protobuf-defined symbols.
-          defining_class and defining_class.__name__ in ['CMessage', 'Message',
-                                                         'MessageMeta']):
+      if defining_class in [object, type, tuple, BaseException, Exception]:
+        continue
+
+      # The following condition excludes most protobuf-defined symbols.
+      if (defining_class and
+          defining_class.__name__ in ['CMessage', 'Message', 'MessageMeta']):
         continue
       # TODO(markdaoust): Add a note in child docs showing the defining class.
 
+      if doc_controls.should_skip_class_attr(py_class, short_name):
+        continue
+
       child_doc = _parse_md_docstring(child, relative_path,
                                       parser_config.reference_resolver)
 
@@ -1691,15 +1695,18 @@ class _Metadata(object):
 
   Attributes:
     name: The name of the page being described by the Metadata block.
+    version: The source version.
   """
 
-  def __init__(self, name):
+  def __init__(self, name, version='stable'):
     """Creates a Metadata builder.
 
     Args:
       name: The name of the page being described by the Metadata block.
+      version: The source version.
     """
     self.name = name
+    self.version = version
     self._content = []
 
   def append(self, item):
@@ -1716,6 +1723,7 @@ class _Metadata(object):
     parts = ['<div itemscope itemtype="%s">' % schema]
 
     parts.append('<meta itemprop="name" content="%s" />' % self.name)
+    parts.append('<meta itemprop="path" content="%s" />' % self.version)
     for item in self._content:
       parts.append('<meta itemprop="property" content="%s"/>' % item)
 
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index 274d48ef66..9f6b185e81 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -24,6 +24,7 @@ import sys
 
 from tensorflow.python.platform import googletest
 from tensorflow.python.util import tf_inspect
+from tensorflow.tools.docs import doc_controls
 from tensorflow.tools.docs import parser
 
 
@@ -37,13 +38,27 @@ def test_function_with_args_kwargs(unused_arg, *unused_args, **unused_kwargs):
   pass
 
 
-class TestClass(object):
+class ParentClass(object):
+
+  @doc_controls.do_not_doc_inheritable
+  def hidden_method(self):
+    pass
+
+
+class TestClass(ParentClass):
   """Docstring for TestClass itself."""
 
   def a_method(self, arg='default'):
     """Docstring for a method."""
     pass
 
+  def hidden_method(self):
+    pass
+
+  @doc_controls.do_not_generate_docs
+  def hidden_method2(self):
+    pass
+
   class ChildClass(object):
     """Docstring for a child class."""
     pass
@@ -175,6 +190,104 @@ class ParserTest(googletest.TestCase):
     # Make sure this file is contained as the definition location.
     self.assertEqual(os.path.relpath(__file__, '/'), page_info.defined_in.path)
 
+  def test_docs_for_class_should_skip(self):
+
+    class Parent(object):
+
+      @doc_controls.do_not_doc_inheritable
+      def a_method(self, arg='default'):
+        pass
+
+    class Child(Parent):
+
+      def a_method(self, arg='default'):
+        pass
+
+    index = {
+        'Child': Child,
+        'Child.a_method': Child.a_method,
+    }
+
+    visitor = DummyVisitor(index=index, duplicate_of={})
+
+    reference_resolver = parser.ReferenceResolver.from_visitor(
+        visitor=visitor, doc_index={}, py_module_names=['tf'])
+
+    tree = {
+        'Child': ['a_method'],
+    }
+
+    parser_config = parser.ParserConfig(
+        reference_resolver=reference_resolver,
+        duplicates={},
+        duplicate_of={},
+        tree=tree,
+        index=index,
+        reverse_index={},
+        guide_index={},
+        base_dir='/')
+
+    page_info = parser.docs_for_object(
+        full_name='Child', py_object=Child, parser_config=parser_config)
+
+    # Make sure the `a_method` is not present
+    self.assertEqual(0, len(page_info.methods))
+
+  def test_docs_for_message_class(self):
+
+    class CMessage(object):
+
+      def hidden(self):
+        pass
+
+    class Message(object):
+
+      def hidden2(self):
+        pass
+
+    class MessageMeta(object):
+
+      def hidden3(self):
+        pass
+
+    class ChildMessage(CMessage, Message, MessageMeta):
+
+      def my_method(self):
+        pass
+
+    index = {
+        'ChildMessage': ChildMessage,
+        'ChildMessage.hidden': ChildMessage.hidden,
+        'ChildMessage.hidden2': ChildMessage.hidden2,
+        'ChildMessage.hidden3': ChildMessage.hidden3,
+        'ChildMessage.my_method': ChildMessage.my_method,
+    }
+
+    visitor = DummyVisitor(index=index, duplicate_of={})
+
+    reference_resolver = parser.ReferenceResolver.from_visitor(
+        visitor=visitor, doc_index={}, py_module_names=['tf'])
+
+    tree = {'ChildMessage': ['hidden', 'hidden2', 'hidden3', 'my_method']}
+
+    parser_config = parser.ParserConfig(
+        reference_resolver=reference_resolver,
+        duplicates={},
+        duplicate_of={},
+        tree=tree,
+        index=index,
+        reverse_index={},
+        guide_index={},
+        base_dir='/')
+
+    page_info = parser.docs_for_object(
+        full_name='ChildMessage',
+        py_object=ChildMessage,
+        parser_config=parser_config)
+
+    self.assertEqual(1, len(page_info.methods))
+    self.assertEqual('my_method', page_info.methods[0].short_name)
+
   def test_docs_for_module(self):
     # Get the current module.
     module = sys.modules[__name__]
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.h b/tensorflow/tools/graph_transforms/fold_constants_lib.h
index 8aefa6ae0f..0802ebb815 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.h
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FOLD_CONSTANTS_H_
-#define TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FOLD_CONSTANTS_H_
+#ifndef TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FOLD_CONSTANTS_LIB_H_
+#define TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FOLD_CONSTANTS_LIB_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -40,4 +40,4 @@ Status RemoveUnusedNodes(const GraphDef& input_graph_def,
 }  // namespace graph_transforms
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FOLD_CONSTANTS_H_
+#endif  // TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FOLD_CONSTANTS_LIB_H_
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index a8c7afc040..7645612cf1 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -9,10 +9,14 @@ load(
     "if_windows",
     "transitive_hdrs",
 )
-load("//third_party/mkl:build_defs.bzl", "if_mkl")
+load("//third_party/mkl:build_defs.bzl", "if_mkl", "if_mkl_ml")
 load("//tensorflow:tensorflow.bzl", "if_cuda")
 load("@local_config_syslibs//:build_defs.bzl", "if_not_system_lib")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_license_deps")
+load(
+    "//third_party/ngraph:build_defs.bzl",
+    "if_ngraph",
+)
 
 # This returns a list of headers of all public header libraries (e.g.,
 # framework, lib), and all of the transitive dependencies of those
@@ -71,6 +75,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
     "//tensorflow/contrib/data/python/kernel_tests/serialization:dataset_serialization_test_base",
     "//tensorflow/contrib/data/python/kernel_tests:stats_dataset_test_base",
+    "//tensorflow/contrib/data/python/kernel_tests:test_utils",
     "//tensorflow/contrib/data/python/ops:contrib_op_loader",
     "//tensorflow/contrib/eager/python/examples:examples_pip",
     "//tensorflow/contrib/eager/python:evaluator",
@@ -82,6 +87,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/predictor:predictor_pip",
     "//tensorflow/contrib/proto:proto",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/rate:rate",
     "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
@@ -200,21 +206,30 @@ filegroup(
             "@grpc//third_party/nanopb:LICENSE.txt",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
-    ) + tf_additional_license_deps(),
+    ) + if_ngraph([
+        "@ngraph//:LICENSE",
+        "@ngraph_tf//:LICENSE",
+        "@nlohmann_json_lib//:LICENSE",
+    ]) + tf_additional_license_deps(),
 )
 
 sh_binary(
     name = "build_pip_package",
     srcs = ["build_pip_package.sh"],
     data = select({
-        "//tensorflow:windows": [":simple_console_for_windows"],
+        "//tensorflow:windows": [
+            ":simple_console_for_windows",
+            "//tensorflow/contrib/lite/python:interpreter_test_data",
+            "//tensorflow/contrib/lite/python:tflite_convert",
+            "//tensorflow/contrib/lite/toco/python:toco_from_protos",
+        ],
         "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
             "//tensorflow/contrib/lite/python:interpreter_test_data",
             "//tensorflow/contrib/lite/python:tflite_convert",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
         ],
-    }) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
+    }) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]),
 )
 
 # A genrule for generating a marker file for the pip package on Windows
diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in
index 86c5e4776d..c4b4af93b8 100644
--- a/tensorflow/tools/pip_package/MANIFEST.in
+++ b/tensorflow/tools/pip_package/MANIFEST.in
@@ -1,5 +1,6 @@
 include README
 recursive-include * *.py
+recursive-include * *.pd
 recursive-include * *.so
 recursive-include * *.dll
 recursive-include * *.lib
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 5e179079c5..8cefbef82d 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -51,8 +51,8 @@ REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'keras_applications == 1.0.4',
-    'keras_preprocessing == 1.0.2',
+    'keras_applications >= 1.0.5',
+    'keras_preprocessing >= 1.0.3',
     'numpy >= 1.13.3, <= 1.14.5',
     'six >= 1.10.0',
     'protobuf >= 3.6.0',
diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD
index fc2c041b6c..b4b70e0a78 100644
--- a/tensorflow/tools/proto_text/BUILD
+++ b/tensorflow/tools/proto_text/BUILD
@@ -39,6 +39,7 @@ cc_binary(
         ":gen_proto_text_functions_lib",
         "@protobuf_archive//:protobuf",
         "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:lib_proto_compiler",
     ] + if_ios(["//tensorflow/core/platform/default/build_config:logging"]),
 )
 
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index 234afe879b..159976f1b0 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/protobuf_compiler.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/tools/proto_text/gen_proto_text_functions_lib.h"
 
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
index e18d749cff..20aa605480 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
-#define TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
+#ifndef TENSORFLOW_TOOLS_PROTO_TEXT_GEN_PROTO_TEXT_FUNCTIONS_LIB_H_
+#define TENSORFLOW_TOOLS_PROTO_TEXT_GEN_PROTO_TEXT_FUNCTIONS_LIB_H_
 
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -50,4 +50,4 @@ ProtoTextFunctionCode GetProtoTextFunctionCode(
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
+#endif  // TENSORFLOW_TOOLS_PROTO_TEXT_GEN_PROTO_TEXT_FUNCTIONS_LIB_H_
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2cf1c86395..7cd9246b78 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -19,6 +19,10 @@ load(
     "//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl",
     "def_file_filter_configure",
 )
+load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
+
+def initialize_third_party():
+    flatbuffers()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
@@ -40,6 +44,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     syslibs_configure(name = "local_config_syslibs")
     python_configure(name = "local_config_python")
 
+    initialize_third_party()
+
     # For windows bazel build
     # TODO: Remove def file filter when TensorFlow can export symbols properly on Windows.
     def_file_filter_configure(name = "local_config_def_file_filter")
@@ -100,11 +106,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "com_google_absl",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/9613678332c976568272c8f4a78631a29159271d.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/9613678332c976568272c8f4a78631a29159271d.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/fefc83638fb69395d259ed245699310610429064.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/fefc83638fb69395d259ed245699310610429064.tar.gz",
         ],
-        sha256 = "1273a1434ced93bc3e703a48c5dced058c95e995c8c009e9bdcb24a69e2180e9",
-        strip_prefix = "abseil-cpp-9613678332c976568272c8f4a78631a29159271d",
+        sha256 = "e5f94a6fcc42cb3f312987a1f8c1a62a915bab4df993cf6cde95f64f2d264259",
+        strip_prefix = "abseil-cpp-fefc83638fb69395d259ed245699310610429064",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
     )
 
@@ -403,21 +409,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "nsync",
         urls = [
-            "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.0.tar.gz",
-            "https://github.com/google/nsync/archive/1.20.0.tar.gz",
+            "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.1.tar.gz",
+            "https://github.com/google/nsync/archive/1.20.1.tar.gz",
         ],
-        sha256 = "0c1b03962b2f8450f21e74a5a46116bf2d6009a807c57eb4207e974a8c4bb7dd",
-        strip_prefix = "nsync-1.20.0",
+        sha256 = "692f9b30e219f71a6371b98edd39cef3cbda35ac3abc4cd99ce19db430a5591a",
+        strip_prefix = "nsync-1.20.1",
+        system_build_file = clean_dep("//third_party/systemlibs:nsync.BUILD"),
     )
 
     tf_http_archive(
         name = "com_google_googletest",
         urls = [
-            "https://mirror.bazel.build/github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
-            "https://github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
+            "https://mirror.bazel.build/github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
+            "https://github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
         ],
-        sha256 = "9cbca84c4256bed17df2c8f4d00c912c19d247c11c9ba6647cd6dd5b5c996b8d",
-        strip_prefix = "googletest-9816b96a6ddc0430671693df90192bbee57108b6",
+        sha256 = "353ab86e35cea1cd386115279cf4b16695bbf21b897bfbf2721cf4cb5f64ade8",
+        strip_prefix = "googletest-997d343dd680e541ef96ce71ee54a91daf2577a0",
     )
 
     tf_http_archive(
@@ -494,11 +501,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/36f54002c931a026f490f9fb074c11d91e3487a2.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/36f54002c931a026f490f9fb074c11d91e3487a2.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d.tar.gz",
         ],
-        sha256 = "e360a9e9b0d4f1adedcdb89fc1efc171f68e250c115ddfaeb82d71edef7a10c8",
-        strip_prefix = "llvm-36f54002c931a026f490f9fb074c11d91e3487a2",
+        sha256 = "2889b79ab979e676e344974cfeefbaf2c21c7c69a015bd584e8ae67b87b136bc",
+        strip_prefix = "llvm-97d7bcd5c024ee6aec4eecbc723bb6d4f4c3dc3d",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
     )
 
@@ -529,11 +536,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "boringssl",
         urls = [
-            "https://mirror.bazel.build/github.com/google/boringssl/archive/45c4a87ae97eb95a8fc2906c035d6a8d0e02e1b8.tar.gz",
-            "https://github.com/google/boringssl/archive/45c4a87ae97eb95a8fc2906c035d6a8d0e02e1b8.tar.gz",
+            "https://mirror.bazel.build/github.com/google/boringssl/archive/7f634429a04abc48e2eb041c81c5235816c96514.tar.gz",
+            "https://github.com/google/boringssl/archive/7f634429a04abc48e2eb041c81c5235816c96514.tar.gz",
         ],
-        sha256 = "972e8d8a9d1daf9892fff7155312b1af46b4754446575a7b285e62f917424c78",
-        strip_prefix = "boringssl-45c4a87ae97eb95a8fc2906c035d6a8d0e02e1b8",
+        sha256 = "1188e29000013ed6517168600fc35a010d58c5d321846d6a6dfee74e4c788b45",
+        strip_prefix = "boringssl-7f634429a04abc48e2eb041c81c5235816c96514",
     )
 
     tf_http_archive(
@@ -584,11 +591,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "kafka",
         urls = [
-            "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.4.tar.gz",
-            "https://github.com/edenhill/librdkafka/archive/v0.11.4.tar.gz",
+            "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.5.tar.gz",
+            "https://github.com/edenhill/librdkafka/archive/v0.11.5.tar.gz",
         ],
-        sha256 = "9d8f1eb7b0e29e9ab1168347c939cb7ae5dff00a39cef99e7ef033fd8f92737c",
-        strip_prefix = "librdkafka-0.11.4",
+        sha256 = "cc6ebbcd0a826eec1b8ce1f625ffe71b53ef3290f8192b6cae38412a958f4fd3",
+        strip_prefix = "librdkafka-0.11.5",
         build_file = clean_dep("//third_party:kafka/BUILD"),
         patch_file = clean_dep("//third_party/kafka:config.patch"),
     )
@@ -741,18 +748,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
     )
 
-    tf_http_archive(
-        name = "flatbuffers",
-        strip_prefix = "flatbuffers-1.9.0",
-        sha256 = "5ca5491e4260cacae30f1a5786d109230db3f3a6e5a0eb45d0d0608293d247e3",
-        urls = [
-            "https://mirror.bazel.build/github.com/google/flatbuffers/archive/v1.9.0.tar.gz",
-            "https://github.com/google/flatbuffers/archive/v1.9.0.tar.gz",
-        ],
-        build_file = clean_dep("//third_party/flatbuffers:flatbuffers.BUILD"),
-        system_build_file = clean_dep("//third_party/systemlibs:flatbuffers.BUILD"),
-    )
-
     native.new_http_archive(
         name = "double_conversion",
         urls = [
@@ -833,6 +828,39 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         strip_prefix = "rules_android-0.1.1",
     )
 
+    tf_http_archive(
+        name = "ngraph",
+        urls = [
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.5.0.tar.gz",
+            "https://github.com/NervanaSystems/ngraph/archive/v0.5.0.tar.gz",
+        ],
+        sha256 = "cb35d3d98836f615408afd18371fb13e3400711247e0d822ba7f306c45e9bb2c",
+        strip_prefix = "ngraph-0.5.0",
+        build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
+    )
+
+    tf_http_archive(
+        name = "nlohmann_json_lib",
+        urls = [
+            "https://mirror.bazel.build/github.com/nlohmann/json/archive/v3.1.1.tar.gz",
+            "https://github.com/nlohmann/json/archive/v3.1.1.tar.gz",
+        ],
+        sha256 = "9f3549824af3ca7e9707a2503959886362801fb4926b869789d6929098a79e47",
+        strip_prefix = "json-3.1.1",
+        build_file = clean_dep("//third_party/ngraph:nlohmann_json.BUILD"),
+    )
+
+    tf_http_archive(
+        name = "ngraph_tf",
+        urls = [
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.3.0-rc1.tar.gz",
+            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.3.0-rc1.tar.gz",
+        ],
+        sha256 = "7919332cb15120101c3e05c1b969a5e029a6411581312583c8f80b6aaaa83072",
+        strip_prefix = "ngraph-tf-0.3.0-rc1",
+        build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
+    )
+
     ##############################################################################
     # BIND DEFINITIONS
     #
author	Ben <bstriner@gmail.com>	2018-08-26 15:44:13 -0400
committer	Ben <bstriner@gmail.com>	2018-08-26 15:44:13 -0400
commit	88ec342544096d895908dac6b0bf6b44dadaaca1 (patch)
tree	cd570c40e6a40e37f14747d6fd387596ff324d01 /tensorflow
parent	32d4ffeb95a344fde6a1b956a4a8d6792432bf15 (diff)
parent	09792df012c22622324f085f46edde33006c7355 (diff)